In [1]:
import pandas as pd
import numpy as np
import re

### Create doc to vector

In [2]:
columns_movies = ["movie_id", "title", "imdb_idx",
                  "movie_kind", "release", "imdb_id", "phonetic", "episode_id",
                  "season", "episode", "series_years", "md5"]

df_IMDb_movies = pd.read_csv('data/title.csv', delimiter=',', names=columns_movies, encoding = "ISO-8859-1", low_memory=False) 
df_IMDb_movies = df_IMDb_movies.drop(columns=["imdb_idx", "imdb_id", "phonetic", "md5", "episode_id", "episode", "movie_kind", "season", "series_years"])
df_IMDb_movies = df_IMDb_movies.dropna(subset=['release'])
df_IMDb_movies["release"] = df_IMDb_movies["release"].apply(lambda x : str(int(x)).split("-")[-1])
# we lowered in MovieLens as well
df_IMDb_movies = df_IMDb_movies.dropna(subset=["title"])
df_IMDb_movies["title"] = df_IMDb_movies["title"].apply(lambda x: x.lower())

# drop rows where movie starts with brackets, those are some strange names...
df_IMDb_movies = df_IMDb_movies[~df_IMDb_movies.title.str.startswith("(")]

# handle seven (se7en) movies, creating new rows containing the content of brackets
_df = df_IMDb_movies[df_IMDb_movies.title.str.contains("(", regex=False)]
_df.title = _df.title.apply(lambda x: re.search(r'\((.*?)\)', x).group(1).strip() if re.search(r'\((.*?)\)', x) else x.strip())
df_IMDb_movies = df_IMDb_movies.append(_df)

print(df_IMDb_movies.dtypes)
print(df_IMDb_movies.shape)
df_IMDb_movies.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


movie_id     int64
title       object
release     object
dtype: object
(1776458, 3)


Unnamed: 0,movie_id,title,release
1,5156,josie duggar's 1st shoes,2010
5,40704,anniversary,1971
7,149337,mellody hobson,2005
8,32020,kiss me kate,2011
10,112085,the best of olivia,2008


In [3]:
def clean_movie_title(movie_title):
    if movie_title.split(" ")[-1].startswith("("):
        # remove year from the title, e.g. Toy Story (1995) --> Toy Story
        movie_title = (" ".join(movie_title.split(" ")[:-1])).strip()

    if movie_title.title().split(',')[-1].strip() in ['The', 'A']:
        # article + movie title, e.g. Saint, The --> The Saint
        movie_title = (movie_title.title().split(',')[-1].strip() + " " + " ".join(movie_title.title().split(',')[:-1])).strip()

    # otherwise, it was converting The Devil's Advocate to The Devil'S Advocate
    movie_title = movie_title.lower()
    return movie_title

In [4]:
column_item = ["movie_id_ml", "title", "release", "vrelease", "url", "unknown", 
                    "action", "adventure", "animation", "childrens", "comedy",
                   "crime", "documentary", "drama", "fantasy", "noir", "horror",
                   "musical", "mystery", "romance", "scifi", "thriller",
                   "war", "western"]

df_ML_movies = pd.read_csv('data/u.item', delimiter='|', names=column_item, encoding = "ISO-8859-1") 
df_ML_movies = df_ML_movies.drop(columns=["vrelease"])
df_ML_movies["title"] = df_ML_movies["title"].apply(lambda row : clean_movie_title(row))   
df_ML_movies["release"] = df_ML_movies["release"].apply(lambda x : str(x).split("-")[-1])

# drop rows where movie starts with brackets, those are some strange names...
df_ML_movies = df_ML_movies[~df_ML_movies.title.str.startswith("(")]

# handle seven (se7en) movies, creating new rows containing the content of brackets
_df = df_ML_movies[df_ML_movies.title.str.contains("(", regex=False)]
_df.title = _df.title.apply(lambda x: re.search(r'\((.*?)\)', x).group(1).strip() if re.search(r'\((.*?)\)', x) else x.strip())
df_ML_movies = df_ML_movies.append(_df)

print(df_ML_movies.shape)
print(df_ML_movies.dtypes)
df_ML_movies.head()

(1767, 23)
movie_id_ml     int64
title          object
release        object
url            object
unknown         int64
action          int64
adventure       int64
animation       int64
childrens       int64
comedy          int64
crime           int64
documentary     int64
drama           int64
fantasy         int64
noir            int64
horror          int64
musical         int64
mystery         int64
romance         int64
scifi           int64
thriller        int64
war             int64
western         int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,movie_id_ml,title,release,url,unknown,action,adventure,animation,childrens,comedy,...,fantasy,noir,horror,musical,mystery,romance,scifi,thriller,war,western
0,1,toy story,1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,goldeneye,1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,four rooms,1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,get shorty,1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,copycat,1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
df = pd.merge(df_ML_movies, df_IMDb_movies, on=["title", "release"])
movie_ids = list(df.movie_id_ml.unique())

### Create keyword to vector

In [6]:
column_movie_keyword = ["mkid", "movie_id", "keyword_id"]

df_movie_keyword = pd.read_csv('data/movie_keyword.csv', delimiter=',', names=column_movie_keyword, encoding = "ISO-8859-1") 
print(df_movie_keyword.dtypes)
print(df_movie_keyword.shape)
df_movie_keyword = df_movie_keyword.drop(columns=["mkid"])
df_movie_keyword.head()

mkid          int64
movie_id      int64
keyword_id    int64
dtype: object
(4523930, 3)


Unnamed: 0,movie_id,keyword_id
0,2,1
1,11,2
2,22,2
3,44,3
4,24,2


In [7]:
df_movies_keywords = pd.merge(df, df_movie_keyword, on="movie_id")
df_movies_keywords = df_movies_keywords[["movie_id_ml","keyword_id"]]
df_movies_keywords.head()

Unnamed: 0,movie_id_ml,keyword_id
0,1,834
1,1,2956
2,1,66752
3,1,93318
4,1,73783


In [10]:
column_keyword = ["keyword_id", "keyword", "phonetic"]

df_keyword = pd.read_csv('data/keyword.csv', delimiter=',', names=column_keyword, encoding = "ISO-8859-1") 
print(df_keyword.dtypes)
print(df_keyword.shape)
df_keyword = df_keyword.drop(columns=["phonetic"])
df_keyword.head()

keyword_id     int64
keyword       object
phonetic      object
dtype: object
(134170, 3)


Unnamed: 0,keyword_id,keyword
0,2068,handcuffed-to-a-bed
1,157,jane-austen
2,8309,narcotic
3,1059,woods
4,3991,hanging


In [8]:
np_movies_keywords = df_movies_keywords.to_numpy()

In [11]:
keyword_vector = np.zeros((df_ML_movies.shape[0], df_keyword.shape[0]))

for i in range(np_movies_keywords.shape[0]) :
    row = np_movies_keywords[i,0]-1
    column = np_movies_keywords[i,1]-1
    keyword_vector[row, column] = 1
        

In [12]:
useful_keywords = []

for i in range(keyword_vector.shape[1]) :
    cnt = keyword_vector[:,i].sum()
    if cnt > 1 :
        useful_keywords += [i]
        
print(len(useful_keywords))
keyword_vector = keyword_vector[:,useful_keywords]
print(keyword_vector.shape)
print(keyword_vector.sum())

10243
(1767, 10243)
89924.0


### Create cast vector

In [13]:
columns_roles = ["role_id", "cast_role"]

df_roles = pd.read_csv('data/role_type.csv', delimiter=',', names=columns_roles, encoding = "ISO-8859-1") 
print(df_roles.dtypes)
print(df_roles.shape)
df_roles.head()

role_id       int64
cast_role    object
dtype: object
(12, 2)


Unnamed: 0,role_id,cast_role
0,1,actor
1,2,actress
2,3,producer
3,4,writer
4,5,cinematographer


In [14]:
column_cast = ["cast_id", "person_id", "movie_id", "person_role_id", "note", "nr_order", "role_id"]

df_cast = pd.read_csv('data/cast_info.csv', delimiter=',', names=column_cast, encoding = "ISO-8859-1", low_memory=False) 
df_cast['role_id'] = pd.to_numeric(df_cast['role_id'], errors='coerce')
df_cast = df_cast.drop(columns=["note", "nr_order", "person_role_id"])

print(df_cast.dtypes)
print(df_cast.shape)
df_cast.head()

cast_id        int64
person_id      int64
movie_id       int64
role_id      float64
dtype: object
(36243322, 4)


Unnamed: 0,cast_id,person_id,movie_id,role_id
0,1,1,968504,1.0
1,2,2,2163857,1.0
2,3,2,2324528,1.0
3,4,3,1851347,1.0
4,5,4,1681365,1.0


In [18]:
column_people = ["person_id", "cast_name", "imdb_idx", "imdb_id", "cast_gender", "name_cf", "name_nf", "surname", "md5"]

df_people = pd.read_csv('data/name.csv', delimiter=',', names=column_people, encoding = "ISO-8859-1", low_memory=False) 

print(df_people.dtypes)
print(df_people.shape)
df_people = df_people.drop(columns=["imdb_idx", "imdb_id", "md5", "name_cf", "name_nf", "surname"])
df_people.head()

person_id        int64
cast_name       object
imdb_idx        object
imdb_id        float64
cast_gender     object
name_cf         object
name_nf         object
surname         object
md5             object
dtype: object
(4167491, 9)


Unnamed: 0,person_id,cast_name,cast_gender
0,3343,"Abela, Mike",m
1,446,"A., David",m
2,126,"-Alverio, Esteban Rodriguez",m
3,1678,"Abbas, Athar",m
4,3610,"Aberer, Leo",m


In [15]:
df_cast_ml = pd.merge(df, df_cast, on="movie_id")
df_cast_ml = pd.merge(df_cast_ml, df_roles, on="role_id")
df_cast_ml = df_cast_ml[["movie_id_ml","person_id","cast_role"]]
df_cast_ml.head()

Unnamed: 0,movie_id_ml,person_id,cast_role
0,1,30260,actor
1,1,46332,actor
2,1,46332,actor
3,1,66226,actor
4,1,128339,actor


In [16]:
np_cast = df_cast_ml.to_numpy()

In [19]:
cast_vector = np.zeros((df_ML_movies.shape[0], df_people.shape[0]))

for i in range(np_cast.shape[0]) :
    row = np_cast[i,0]-1
    column = np_cast[i,1]-1
    cast_vector[row, column] = 1
        

In [20]:
useful_cast = []

for i in range(cast_vector.shape[1]) :
    cnt = cast_vector[:,i].sum()
    if cnt > 1 :
        useful_cast += [i]
        
print(len(useful_cast))
cast_vector = cast_vector[:,useful_cast]
print(cast_vector.shape)
print(cast_vector.sum())

19932
(1767, 19932)
64946.0


In [21]:
### Creating the total vector

In [23]:
movie_vec = np.concatenate((keyword_vector, cast_vector), axis=1)
movie_vec.shape

(1767, 30175)

In [24]:
genres = ["movie_id_ml", "unknown", "action", "adventure", "animation", "childrens", "comedy",
                   "crime", "documentary", "drama", "fantasy", "noir", "horror",
                   "musical", "mystery", "romance", "scifi", "thriller",
                   "war", "western"]

df_genres = df_ML_movies[genres]

In [33]:
np_genres = df_genres.to_numpy()

In [34]:
np_genres.shape

(1767, 20)

In [37]:
genre_vector = np.zeros((df_ML_movies.shape[0], 19))

for i in range(np_genres.shape[0]) :
    row = np_genres[i,0]-1
    genre_vector[row,:] += np_genres[i,1:]
    
genre_vector = (genre_vector > 0).astype(int)

In [38]:
genre_vector.shape

(1767, 19)

In [39]:
movie_vec = np.concatenate((movie_vec, genre_vector), axis=1)
movie_vec.shape

(1767, 30194)