In [2]:
import os
import sys
module_path = os.path.abspath(os.path.join('../scripts'))
if module_path not in sys.path:
    sys.path.append(module_path)
import query_database as function
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder

In [3]:
ratings = pd.read_csv('data/ml-latest/ratings.csv')
movies = pd.read_csv('data/ml-latest/movies.csv')
links = pd.read_csv('data/ml-latest/links.csv')
tags = pd.read_csv('data/ml-latest/tags.csv')

In [66]:
movies[movies['title'].str.contains('Inception')]

Unnamed: 0,movieId,title,genres
15607,79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX


In [5]:
links['imdbId'] = 'tt' + links['imdbId'].astype(str)
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,tt114709,862.0
1,2,tt113497,8844.0
2,3,tt113228,15602.0
3,4,tt114885,31357.0
4,5,tt113041,11862.0


In [4]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,14,110,epic,1443148538
1,14,110,Medieval,1443148532
2,14,260,sci-fi,1442169410
3,14,260,space action,1442169421
4,14,318,imdb top 250,1442615195


In [48]:
drop_cols = ['tagline', 'imdb_avgRating', 'imdb_numVotes', 'poster_path', 'num_of_cast', 'top_10_cast_popularity_mean'
             'homepage', 'budget', 'revenue', 'imdb_numVotes', 'top_10_cast_popularity', 'top_10_crew_popularity_mean'
             'top_10_crew_popularity', 'originalTitle', 'belongs_to_collection', 'originalTitle', 'release_date']
combined_db = function.get_combined(release_year_cutoff=2005, drop_lst=drop_cols)

In [49]:
combined_db = combined_db.drop(columns = ['homepage', 'top_10_cast_popularity_mean', 
                                          'num_of_crew', 'top_10_crew_popularity', 
                                          'status', 'top_10_crew_popularity_mean'])

In [50]:
combined_db.shape

(96261, 19)

In [51]:
# preprocessing
combined = combined_db.copy()
combined['runtimeMinutes'] = combined['runtimeMinutes'].fillna(combined["runtimeMinutes"].median())
combined=combined.replace({'runtimeMinutes': {0: combined["runtimeMinutes"].median()}}) 
combined['genres'] = combined['genres'].fillna('None')
combined['Writers'] = combined['Writers'].fillna('None')
combined['Directors'] = combined['Directors'].fillna('None')
combined['casts'] = combined['casts'].fillna('None')
combined['keywords_name'] = combined['keywords_name'].fillna('None')
combined['company_name'] = combined['company_name'].fillna('None')
combined['overview'] = combined['overview'].fillna('')


# drop rows with the same title
combined = combined.drop_duplicates(subset='primaryTitle', keep="first")
combined = combined.reset_index()

In [52]:
def make_genresList(x):
    gen = []
    st = " "
    if not isinstance(x, list):
        if x == 'None':
            return ''
        else:
            gen.append(x)
            return (st.join(gen))
    for i in x:
        if i == 'Science Fiction':
            scifi = 'Sci-Fi'
            gen.append(scifi)
        else:
            gen.append(i)
    if gen == []:
        return ''
    else:
        return (st.join(gen))

In [53]:
def process_list(x):
    gen = []
    st = " "
    if not isinstance(x, list):
        if x == 'None':
            return ''
        else:
            gen.append(x)
            return (st.join(gen))
    for i in x:
        gen.append(i)
    if gen == [] or gen == '[]':
        return ''
    else:
        return (st.join(gen))

In [54]:
combined['genres'] = combined['genres'].map(lambda x: make_genresList(x))
combined['Directors'] = combined['Directors'].map(lambda x: process_list(x))
combined['casts'] = combined['casts'].map(lambda x: process_list(x))
combined['Writers'] = combined['Writers'].map(lambda x: process_list(x))
combined['company_name'] = combined['company_name'].map(lambda x: process_list(x))
combined['keywords_name'] = combined['keywords_name'].map(lambda x: process_list(x))
# don't need crews


In [55]:
combined['document'] = combined[['genres', 'Directors', 'casts', 'Writers',
                                 'company_name', 'keywords_name', 'overview']].apply(lambda x: ' '.join(x), axis=1)
combined = combined.astype({"isAdult": int})

In [67]:
combined.head()

Unnamed: 0,index,_id,primaryTitle,isAdult,release_year,runtimeMinutes,genres,Directors,Writers,casts,...,original_language,popularity,overview,company_name,keywords_name,crews,tmdb_avgRating,tmdb_numVotes,release_year_int,document
0,0,tt0011216,Spanish Fiesta,0,2019.0,67.0,Drama,Germaine Dulac,Louis Delluc,Ève Francis Gabriel Gabrio Gaston Modot Jean T...,...,fr,0.6,A Spanish festival reveals the emotional dista...,Les Films Louis Nalpas,woman director partially lost film,"[Germaine Dulac, Serge Sandberg, Paul Parguel,...",6.0,1,2019,Drama Germaine Dulac Ève Francis Gabriel Gabri...
1,1,tt0011801,Tötet nicht mehr,0,2019.0,127.0,Action Crime Drama,,Gerhard Lamprecht,Emilie Kurz Rudolf Klein-Rhoden Paul Rehkopf L...,...,de,0.6,The director and co-writer Lupu Pick plays mus...,Rex-Film GmbH,,"[Gerhard Lamprecht, Willi Herrmann, Lupu Pick,...",0.0,0,2019,Action Crime Drama Emilie Kurz Rudolf Klein-R...
2,2,tt0016906,Frivolinas,0,2014.0,80.0,Comedy Musical,Arturo Carballo,,María Caballé José López Alonso Ramón Álvarez ...,...,es,0.6,Don Casto is a widower who spends most of his ...,Ediciones Seleccionadas Arturo Carballo,,"[Arturo Carballo, Ramón de Baños]",0.0,0,2014,Comedy Musical Arturo Carballo María Caballé J...
3,3,tt0019996,Hongxia,0,2011.0,94.0,Action,Wen Yi-Min,,Xuepeng Fan Chu Shao-Chuen Wen Yi-Min Zhao Tai...,...,zh,0.6,Red Heroine was a smash hit on release and a p...,Youlian Film Company,,"[Shiquan Yao, Wen Yi-Min, Shang Guan-Wu]",6.3,3,2011,Action Wen Yi-Min Xuepeng Fan Chu Shao-Chuen W...
4,4,tt0036177,Muhomatsu no issho,0,2008.0,100.0,Action Adventure,Hiroshi Inagaki,Mansaku Itami Shunsaku Iwashita,Ryūnosuke Tsukigata Kyoji Sugi Yasushi Nagata ...,...,ja,0.997,Matsugoro is a poor rickshaw driver whose anim...,,,"[Kazuo Miyagawa, Hiroshi Inagaki, Mansaku Itam...",6.3,4,2008,Action Adventure Hiroshi Inagaki Ryūnosuke Tsu...


In [56]:
new_df = combined.copy()
new_df = new_df.drop(columns = ['release_year', 'Writers', 'casts',
                                'overview', 'company_name', 'runtimeMinutes',
                                'keywords_name', 'crews', 'index', 'isAdult'])
new_df = new_df.reset_index()
new_df.shape

(90323, 12)

In [57]:
new_df.head()

Unnamed: 0,index,_id,primaryTitle,genres,Directors,tmdb_id,original_language,popularity,tmdb_avgRating,tmdb_numVotes,release_year_int,document
0,0,tt0011216,Spanish Fiesta,Drama,Germaine Dulac,364671,fr,0.6,6.0,1,2019,Drama Germaine Dulac Ève Francis Gabriel Gabri...
1,1,tt0011801,Tötet nicht mehr,Action Crime Drama,,611205,de,0.6,0.0,0,2019,Action Crime Drama Emilie Kurz Rudolf Klein-R...
2,2,tt0016906,Frivolinas,Comedy Musical,Arturo Carballo,400531,es,0.6,0.0,0,2014,Comedy Musical Arturo Carballo María Caballé J...
3,3,tt0019996,Hongxia,Action,Wen Yi-Min,267384,zh,0.6,6.3,3,2011,Action Wen Yi-Min Xuepeng Fan Chu Shao-Chuen W...
4,4,tt0036177,Muhomatsu no issho,Action Adventure,Hiroshi Inagaki,125261,ja,0.997,6.3,4,2008,Action Adventure Hiroshi Inagaki Ryūnosuke Tsu...


In [16]:
imdb_ids = new_df._id.unique()
links_new = links[links['imdbId'].isin(imdb_ids)]
links_new.imdbId.nunique()

14917

In [17]:
links_new.head()

Unnamed: 0,movieId,imdbId,tmdbId
11944,53519,tt1028528,1991.0
12116,54995,tt1077258,1992.0
12126,55063,tt1093842,13241.0
12128,55069,tt1032846,2009.0
12290,56167,tt1024943,8079.0


In [18]:
movieIds = links_new.movieId.unique()
ratings_new = ratings[ratings['movieId'].isin(movieIds)]
ratings_new.movieId.nunique()

14418

In [None]:
ratings_new.head()

In [19]:
movieIds = ratings_new.movieId.unique()
movies_new = movies[movies['movieId'].isin(movieIds)]
movies_new.movieId.nunique()

14418

In [68]:
movies_new.head()

Unnamed: 0,movieId,title,genres
11944,53519,Death Proof (2007),Action|Adventure|Crime|Horror|Thriller
12116,54995,Planet Terror (2007),Action|Horror|Sci-Fi
12126,55063,My Winnipeg (2007),Documentary|Fantasy
12128,55069,"4 Months, 3 Weeks and 2 Days (4 luni, 3 saptam...",Drama
12290,56167,Om Shanti Om (2007),Action|Drama|Musical|Romance


In [20]:
movieIds = ratings_new.movieId.unique()
tags_new = tags[tags['movieId'].isin(movieIds)]
tags_new.movieId.nunique()

10521

In [21]:
tags_new.head()

Unnamed: 0,userId,movieId,tag,timestamp
46,56,80917,love story,1354483209
47,56,80917,social commentary,1354483137
48,56,84880,British accents,1324483431
49,56,84880,Jews,1324483406
50,56,84880,Religion,1324483414


In [47]:
tags_new.to_csv('data/tags_new.csv', index=False)

In [58]:
new_df = new_df.merge(links_new, left_on='_id', right_on='imdbId')

In [59]:
new_df.drop(['imdbId', 'tmdbId'], axis=1, inplace=True)

In [60]:
movieIds = ratings_new.movieId.unique()
new_df = new_df[new_df['movieId'].isin(movieIds)]
new_df._id.nunique()

14418

In [29]:
all_movies_imdbId = new_df._id.unique()
with open('all_movies_imdbId.pkl', 'wb') as fh:
    pickle.dump(all_movies_imdbId, fh)

In [61]:
new_df.head()

Unnamed: 0,index,_id,primaryTitle,genres,Directors,tmdb_id,original_language,popularity,tmdb_avgRating,tmdb_numVotes,release_year_int,document,movieId
0,10513,tt1000095,Pokémon Ranger and the Temple of the Sea,Action Adventure Animation Family Fantasy Sci-Fi,Kunihiko Yuyama Armen Mazlumian,16808,ja,11.775,6.5,126,2006,Action Adventure Animation Family Fantasy Sci-...,139584
1,10539,tt1000769,Prince of Broadway,Drama Comedy,Sean Baker,87081,en,5.875,6.6,7,2008,Drama Comedy Sean Baker Adesuwa Addy Iyare Cin...,161098
2,10540,tt1000774,Sex and the City,Comedy Drama Romance,Michael Patrick King None,4564,en,34.529,6.6,1562,2008,Comedy Drama Romance Michael Patrick King None...,59725
4,10590,tt1001508,He's Just Not That Into You,Comedy Drama Romance,Ken Kwapis,10184,en,15.839,6.5,2782,2009,Comedy Drama Romance Ken Kwapis Scarlett Johan...,66203
5,10594,tt1001526,Megamind,Action Animation Comedy Family Sci-Fi,Tom McGrath None,38055,en,49.965,6.9,4953,2010,Action Animation Comedy Family Sci-Fi Tom McGr...,81564


In [62]:
new_df.to_csv('data/combined_new.csv', index=False)

In [63]:
print("{} unique movies in dataset".format(len(new_df._id.unique())))

14418 unique movies in dataset


In [None]:
ratings_new.to_csv('data/ratings_new.csv', index=False)

In [None]:
movies_new.to_csv('data/movies_new.csv', index=False)

In [35]:
movie_unique_ids = ratings_new.movieId.unique()
movie_to_index = {o:i for i,o in enumerate(movie_unique_ids)}
ratings_new['movie'] = ratings_new['movieId'].apply(lambda x: movie_to_index[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [37]:
user_unique_ids = ratings_new.userId.unique()
user_to_index = {o:i for i,o in enumerate(user_unique_ids)}
ratings_new['user'] = ratings_new['userId'].apply(lambda x: user_to_index[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [39]:
with open('movie_to_index.pkl', 'wb') as fh:
    pickle.dump(movie_to_index, fh)

In [40]:
with open('user_to_index.pkl', 'wb') as fh:
    pickle.dump(user_to_index, fh)

In [41]:
ratings_new.head()

Unnamed: 0,userId,movieId,rating,timestamp,movie,user
1198,14,63082,4.5,1442169823,0,0
1200,14,64969,3.5,1522100366,1,0
1202,14,68954,4.0,1442169632,2,0
1203,14,71033,4.0,1442615079,3,0
1211,14,79091,4.5,1442171360,4,0


In [42]:
new_df.movieId = new_df.movieId.apply(lambda x: movie_to_index[x])

In [43]:
new_df.head()

Unnamed: 0,index,_id,primaryTitle,genres,Directors,tmdb_id,original_language,popularity,tmdb_avgRating,tmdb_numVotes,release_year_int,document,movieId
0,10513,tt1000095,Pokémon Ranger and the Temple of the Sea,Action Adventure Animation Family Fantasy Sci-Fi,Kunihiko Yuyama Armen Mazlumian,16808,ja,11.775,6.5,126,2006,Action Adventure Animation Family Fantasy Sci-...,5005
1,10539,tt1000769,Prince of Broadway,Drama Comedy,Sean Baker,87081,en,5.875,6.6,7,2008,Drama Comedy Sean Baker Adesuwa Addy Iyare Cin...,8101
2,10540,tt1000774,Sex and the City,Comedy Drama Romance,Michael Patrick King None,4564,en,34.529,6.6,1562,2008,Comedy Drama Romance Michael Patrick King None...,252
4,10590,tt1001508,He's Just Not That Into You,Comedy Drama Romance,Ken Kwapis,10184,en,15.839,6.5,2782,2009,Comedy Drama Romance Ken Kwapis Scarlett Johan...,1070
5,10594,tt1001526,Megamind,Action Animation Comedy Family Sci-Fi,Tom McGrath None,38055,en,49.965,6.9,4953,2010,Action Animation Comedy Family Sci-Fi Tom McGr...,335


In [44]:
new_df.to_csv('data/all_movies.csv', index=False)