### This notebook was developped in order to encode all the features of the movies. The data is then exported into a csv file and used in the 'features_distance' notebook.

In [1]:
import tensorflow as tf
import pandas as pd

In [2]:
movies = pd.read_csv('data/IMDb_data/IMDb movies.csv',
                   usecols=['imdb_title_id','genre','country','director','writer','actors'])
movies.head()

Unnamed: 0,imdb_title_id,genre,country,director,writer,actors
0,tt0000009,Romance,USA,Alexander Black,Alexander Black,"Blanche Bayliss, William Courtenay, Chauncey D..."
1,tt0000574,"Biography, Crime, Drama",Australia,Charles Tait,Charles Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be..."
2,tt0001892,Drama,"Germany, Denmark",Urban Gad,"Urban Gad, Gebhard Schätzler-Perasini","Asta Nielsen, Valdemar Psilander, Gunnar Helse..."
3,tt0002101,"Drama, History",USA,Charles L. Gaskill,Victorien Sardou,"Helen Gardner, Pearl Sindelar, Miss Fielding, ..."
4,tt0002130,"Adventure, Drama, Fantasy",Italy,"Francesco Bertolini, Adolfo Padovan",Dante Alighieri,"Salvatore Papa, Arturo Pirovano, Giuseppe de L..."


In [77]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   imdb_title_id  85855 non-null  object
 1   genre          85855 non-null  object
 2   country        85791 non-null  object
 3   director       85768 non-null  object
 4   writer         84283 non-null  object
 5   actors         85786 non-null  object
dtypes: object(6)
memory usage: 3.9+ MB


### For the tokenizer we need to prepare data according to this specific format : 'name1' 'name2' etc.

In [79]:
movies['genre'] = movies['genre'].apply(lambda x: x.replace(' ','').replace(',',' ').lower() if type(x) == str else '')
movies['writer'] = movies['writer'].apply(lambda x: x.replace(' ','').replace(',',' ').replace('.','').replace('-','') if type(x) == str else '')
movies['country'] = movies['country'].apply(lambda x: x.replace(' ','').replace(',',' ') if type(x) == str else '')
movies['director'] = movies['director'].apply(lambda x: x.replace(' ','').replace(',',' ').replace('.','').replace('-','') if type(x) == str else '')

### Regarding the actors, we focus only on the 10 first names for simplicity.

In [80]:
movies['actors_processed'] = movies['actors'].apply(lambda x: x.replace(' ','').replace('.','').replace('-','').split(',') if type(x) == str else '')
# we keep only the first 10 names :
movies['actors_processed'] = movies['actors_processed'].apply(lambda x: x[:10])
movies['actors_processed'] = movies['actors_processed'].apply(lambda x: ','.join(x).replace(',',' '))

In [81]:
movies = movies.drop('actors', axis=1)

In [82]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   imdb_title_id     85855 non-null  object
 1   genre             85855 non-null  object
 2   country           85855 non-null  object
 3   director          85855 non-null  object
 4   writer            85855 non-null  object
 5   actors_processed  85855 non-null  object
dtypes: object(6)
memory usage: 3.9+ MB


In [83]:
movies.head()

Unnamed: 0,imdb_title_id,genre,country,director,writer,actors_processed
0,tt0000009,romance,USA,AlexanderBlack,AlexanderBlack,BlancheBayliss WilliamCourtenay ChaunceyDepew
1,tt0000574,biography crime drama,Australia,CharlesTait,CharlesTait,ElizabethTait JohnTait NormanCampbell BellaCol...
2,tt0001892,drama,Germany Denmark,UrbanGad,UrbanGad GebhardSchätzlerPerasini,AstaNielsen ValdemarPsilander GunnarHelsengree...
3,tt0002101,drama history,USA,CharlesLGaskill,VictorienSardou,HelenGardner PearlSindelar MissFielding MissRo...
4,tt0002130,adventure drama fantasy,Italy,FrancescoBertolini AdolfoPadovan,DanteAlighieri,SalvatorePapa ArturoPirovano GiuseppedeLiguoro...


In [85]:
for col in ['genre','country', 'director','writer','actors_processed']:
    # by limiting the 'num_words' parameter, we keep only the 10,000 most frequent features.
    # It is especially useful for actors (~300k) and directors (~35k).
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000) # 
    tokenizer.fit_on_texts(movies[col])
    movies[col + "_encoded"] = tokenizer.texts_to_sequences(movies[col])

In [86]:
movies

Unnamed: 0,imdb_title_id,genre,country,director,writer,actors_processed,genre_encoded,country_encoded,director_encoded,writer_encoded,actors_processed_encoded
0,tt0000009,romance,USA,AlexanderBlack,AlexanderBlack,BlancheBayliss WilliamCourtenay ChaunceyDepew,[3],[1],[],[],[]
1,tt0000574,biography crime drama,Australia,CharlesTait,CharlesTait,ElizabethTait JohnTait NormanCampbell BellaCol...,"[14, 6, 1]",[15],[],[],[]
2,tt0001892,drama,Germany Denmark,UrbanGad,UrbanGad GebhardSchätzlerPerasini,AstaNielsen ValdemarPsilander GunnarHelsengree...,[1],"[6, 21]",[],[],[]
3,tt0002101,drama history,USA,CharlesLGaskill,VictorienSardou,HelenGardner PearlSindelar MissFielding MissRo...,"[1, 15]",[1],[],[],[]
4,tt0002130,adventure drama fantasy,Italy,FrancescoBertolini AdolfoPadovan,DanteAlighieri,SalvatorePapa ArturoPirovano GiuseppedeLiguoro...,"[8, 1, 11]",[5],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...
85850,tt9908390,comedy,France Belgium,LudovicColbeauJustin,AlexandreCoquelle MatthieuLeNaour,DanyBoon PhilippeKaterine AnneSerra SamuelJouy...,[2],"[2, 12]",[],[3350],"[4158, 5690]"
85851,tt9911196,comedy drama,Netherlands,JohanNijenhuis,RadekBajgar HermanFinkers,HermanFinkers JohannaterSteege LeonieterBraak ...,"[2, 1]",[20],[564],[],[4763]
85852,tt9911774,drama,India,VineeshAaradya,VineeshAaradya VineeshAaradya,AnoopChandran Indrans SonaNair SimonBrittoRodr...,[1],[4],[],[],[962]
85853,tt9914286,drama family,Turkey,AhmetFaikAkinci,AhmetFaikAkinci KasimUçkan,AhmetFaikAkinci BelmaMamati MetinKeçeci Burhan...,"[1, 10]",[11],[],[],[8926]


In [88]:
# Finally, we export the results in a csv file
movies.to_csv('all_features_encoded.csv',index=False)

In [None]:
#####

In [None]:
imdb = imdb[imdb['year'] != 'TV Movie 2019']

In [None]:
imdb['imdb_title_id'] = imdb['imdb_title_id'].apply(lambda x: int(x.replace('tt', '')))
imdb.rename(columns={'imdb_title_id':'imdbId', 'avg_vote':'imdb_score'}, inplace=True) # to match with the movielens file regarding ID

In [None]:
tf_idf = pd.read_csv(path + '/tf_idf.csv')

In [None]:
tf_idf.columns

Index(['000', '10', '12', '15', '17', '20', '30', 'abandon', 'abduct',
       'ability',
       ...
       'write', 'writer', 'wrong', 'wwii', 'year', 'york', 'young', 'youth',
       'zombie', 'imdbId'],
      dtype='object', length=1001)

In [None]:
credits_for_tokenizer = pd.read_csv(path + '/credits_processed_for_tokenizer.csv')

In [None]:
credits_for_tokenizer.columns

Index(['imdbId', 'title', 'genre', 'country', 'director', 'writer',
       'actors_processed'],
      dtype='object')

In [None]:
credits_for_tokenizer.shape

(84162, 7)

In [None]:
# attention pour le tokenizer, on ne devrait le fitter que sur le train (à faire plus tard, après la mise en oeuvre d'un premier passage)

In [None]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(credits_for_tokenizer.title)
credits_for_tokenizer["title_encoded"] = tokenizer.texts_to_sequences(credits_for_tokenizer.title)
credits_for_tokenizer["len_encoded_title"] = credits_for_tokenizer["title_encoded"].apply(lambda x: len(x))
credits_for_tokenizer = credits_for_tokenizer[credits_for_tokenizer["len_encoded_title"]!=0]

In [None]:
credits_for_tokenizer.shape

(68428, 9)

In [None]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(credits_for_tokenizer.genre)
credits_for_tokenizer["genre_encoded"] = tokenizer.texts_to_sequences(credits_for_tokenizer.genre)
credits_for_tokenizer["len_encoded_genre"] = credits_for_tokenizer["genre_encoded"].apply(lambda x: len(x))
credits_for_tokenizer = credits_for_tokenizer[credits_for_tokenizer["len_encoded_genre"]!=0]

In [None]:
credits_for_tokenizer.shape

(68428, 11)

In [None]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(credits_for_tokenizer.country)
credits_for_tokenizer["country_encoded"] = tokenizer.texts_to_sequences(credits_for_tokenizer.country)
credits_for_tokenizer["len_encoded_country"] = credits_for_tokenizer["country_encoded"].apply(lambda x: len(x))
credits_for_tokenizer = credits_for_tokenizer[credits_for_tokenizer["len_encoded_country"]!=0]

In [None]:
credits_for_tokenizer.shape

(68428, 13)

In [None]:
#problem avec les . et les - dansles noms, à revoir.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(credits_for_tokenizer.director)
credits_for_tokenizer["director_encoded"] = tokenizer.texts_to_sequences(credits_for_tokenizer.director)
credits_for_tokenizer["len_encoded_director"] = credits_for_tokenizer["director_encoded"].apply(lambda x: len(x))
credits_for_tokenizer = credits_for_tokenizer[credits_for_tokenizer["len_encoded_director"]!=0]

In [None]:
credits_for_tokenizer.shape

(50435, 15)

In [None]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(credits_for_tokenizer.writer)
credits_for_tokenizer["writer_encoded"] = tokenizer.texts_to_sequences(credits_for_tokenizer.writer)
credits_for_tokenizer["len_encoded_writer"] = credits_for_tokenizer["writer_encoded"].apply(lambda x: len(x))
credits_for_tokenizer = credits_for_tokenizer[credits_for_tokenizer["len_encoded_writer"]!=0]

In [None]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(credits_for_tokenizer.actors_processed)
credits_for_tokenizer["actors_processed_encoded"] = tokenizer.texts_to_sequences(credits_for_tokenizer.actors_processed)
credits_for_tokenizer["len_encoded_actors_processed"] = credits_for_tokenizer["actors_processed_encoded"].apply(lambda x: len(x))
credits_for_tokenizer = credits_for_tokenizer[credits_for_tokenizer["len_encoded_actors_processed"]!=0]

In [None]:
credits_for_tokenizer.shape

(36007, 19)

In [None]:
credits_for_tokenizer

Unnamed: 0,imdbId,title,genre,country,director,writer,actors_processed,title_encoded,len_encoded_title,genre_encoded,len_encoded_genre,country_encoded,len_encoded_country,director_encoded,len_encoded_director,writer_encoded,len_encoded_writer,actors_processed_encoded,len_encoded_actors_processed
6,2423,madame dubarry,biography drama romance,Germany,ErnstLubitsch,NorbertFalk HannsKräly,PolaNegri EmilJannings HarryLiedtke EduardvonW...,[998],1,"[14, 1, 3]",3,[8],1,[106],1,"[4621, 321]",2,"[6999, 4460, 9097, 7951]",4
11,2844,fantômas - à l'ombre de la guillotine,crime drama,France,LouisFeuillade,MarcelAllain LouisFeuillade,RenéNavarre EdmundBreon GeorgesMelchior RenéeC...,"[5699, 152, 12, 2]",4,"[5, 1]",2,[2],1,[1125],1,"[1842, 1078]",2,"[3077, 4956]",2
13,3037,juve contre fantômas,crime drama,France,LouisFeuillade,MarcelAllain LouisFeuillade,RenéNavarre EdmundBreon GeorgesMelchior RenéeC...,"[3056, 5699]",2,"[5, 1]",2,[2],1,[1125],1,"[1842, 1078]",2,[3077],1
16,3165,le mort qui tue,crime drama mystery,France,LouisFeuillade,MarcelAllain LouisFeuillade,RenéNavarre EdmundBreon GeorgesMelchior AndréL...,"[13, 1740, 407, 4183]",4,"[5, 1, 9]",3,[2],1,[1125],1,"[1842, 1078]",2,"[3077, 4956]",2
17,3167,amore di madre,drama,USA,D.W.Griffith,D.W.Griffith H.E.Aitken,HenryB.Walthall JosephineCrowell LillianGish D...,"[46, 4, 565]",3,[1],1,[1],1,"[6, 5, 67]",3,"[10, 7, 77, 9, 12]",5,"[1285, 1869, 9098, 2036, 9099, 4957, 9100, 9101]",8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84112,9844358,satsujinki o kau onna,horror,Japan,HideoNakata,KaoriYoshida KeiÔishi,RinAsuka Shin'yaHamada AiriMatsuyama KenjiMizu...,"[23, 6599, 630]",3,[7],1,[6],1,[544],1,[9149],1,[5987],1
84130,9872556,momenti di trascurabile felicità,comedy,Italy,DanieleLuchetti,FrancescoPiccolo FrancescoPiccolo,Pif Thony RenatoCarpentieri FranzCantalupo Vin...,"[4, 809]",2,[2],1,[4],1,[946],1,"[2199, 2199]",2,[5360],1
84136,9880982,dulce familia,comedy,Mexico Chile,NicolásLópez,GuillermoAmoedo CocaGómez,PazBascuñán ReginaBlandón FernandaCastillo Luc...,"[7891, 1695]",2,[2],1,"[15, 47]",2,[1117],1,[1296],1,[4935],1
84140,9887580,bulletproof 2,action comedy,USA,DonMichaelPaul,DonMichaelPaul RichWilkes,FaizonLove KirkFox TonyTodd CassieClare JadeHu...,[15],1,"[4, 2]",2,[1],1,[2668],1,"[4101, 2134]",2,"[7678, 3030]",2


In [None]:
imdb_for_ML = pd.merge(imdb, tf_idf, how='left', on=['imdbId'])

In [None]:
imdb_for_ML

Unnamed: 0,imdbId,year_x,duration,imdb_score,000,10,12,15,17,20,30,abandon,abduct,ability,able,abuse,accept,accident,accidentally,accuse,act,action,actor,actress,actually,addict,adopt,adult,adventure,affair,africa,african,age,aged,agency,agent,ago,agree,aid,air,...,violent,visit,wait,wake,walk,want,war,warrior,watch,water,way,wealthy,weapon,wedding,week,weekend,west,western,white,widow,wife,wild,win,wish,witch,witness,woman,wood,work,worker,world,write,writer,wrong,wwii,year_y,york,young,youth,zombie
0,9,1894,45,5.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.553697,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,574,1906,70,6.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1892,1911,53,5.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.443246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2101,1912,100,5.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.521787,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2130,1911,68,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85849,9908390,2020,95,5.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
85850,9911196,2020,103,7.7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
85851,9911774,2019,130,7.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
85852,9914286,2019,98,6.4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
credits_for_tokenizer.columns

Index(['imdbId', 'title', 'genre', 'country', 'director', 'writer',
       'actors_processed', 'title_encoded', 'len_encoded_title',
       'genre_encoded', 'len_encoded_genre', 'country_encoded',
       'len_encoded_country', 'director_encoded', 'len_encoded_director',
       'writer_encoded', 'len_encoded_writer', 'actors_processed_encoded',
       'len_encoded_actors_processed'],
      dtype='object')

In [None]:
imdb_tokens = credits_for_tokenizer[['imdbId','title_encoded','genre_encoded','country_encoded','director_encoded','writer_encoded','actors_processed_encoded']]

In [None]:
imdb_for_ML = pd.merge(imdb_for_ML, imdb_tokens, how='left', on=['imdbId'])

In [None]:
imdb_for_ML

Unnamed: 0,imdbId,year_x,duration,imdb_score,000,10,12,15,17,20,30,abandon,abduct,ability,able,abuse,accept,accident,accidentally,accuse,act,action,actor,actress,actually,addict,adopt,adult,adventure,affair,africa,african,age,aged,agency,agent,ago,agree,aid,air,...,war,warrior,watch,water,way,wealthy,weapon,wedding,week,weekend,west,western,white,widow,wife,wild,win,wish,witch,witness,woman,wood,work,worker,world,write,writer,wrong,wwii,year_y,york,young,youth,zombie,title_encoded,genre_encoded,country_encoded,director_encoded,writer_encoded,actors_processed_encoded
0,9,1894,45,5.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.553697,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,
1,574,1906,70,6.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,
2,1892,1911,53,5.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.443246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,
3,2101,1912,100,5.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.521787,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,
4,2130,1911,68,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85849,9908390,2020,95,5.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
85850,9911196,2020,103,7.7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
85851,9911774,2019,130,7.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
85852,9914286,2019,98,6.4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
#data = pd.merge(movielens, imdb_for_ML, how='left', on=['imdbId'])

In [None]:
#imdb_for_ML.to_csv('imdb_for_ML',index=False)

In [None]:
imdb_for_ML_light = imdb_for_ML.dropna()

In [None]:
imdb_for_ML_light.shape

(35107, 1010)

In [None]:
imdb_for_ML_light.to_csv(path+'/imdb_for_ML_light.csv',index=False)

In [None]:
movies_to_keep = list(imdb_for_ML_light['imdbId'])

In [None]:
movies_to_keep[:10]

[2423, 2844, 3037, 3165, 3167, 3419, 3643, 3657, 3883, 3930]

In [None]:
movielens.columns

Index(['userId', 'movieId', 'rating', 'imdbId'], dtype='object')

In [None]:
idx_to_keep = []
for i in range(movielens.shape[0]):
  imdbId = movielens.iloc[i,3]
  if imdbId in movies_to_keep:
    idx_to_keep.append(i)

KeyboardInterrupt: ignored

In [None]:
idx_to_keep[:20]

[0, 1, 2, 3, 4, 7, 8, 9]

In [None]:
movies_light = movielens.iloc[idx_to_keep,:]

In [None]:
movies_light

Unnamed: 0,userId,movieId,rating,imdbId
0,1,296,5.0,110912
1,1,306,3.5,111495
2,1,307,5.0,108394
3,1,665,5.0,114787
4,1,899,3.5,45152
7,1,1217,3.5,89881
8,1,1237,5.0,50976
9,1,1250,4.0,50212


In [None]:
data = pd.merge(movielens_light, imdb_for_ML_light, how='left', on=['imdbId'])