In [23]:
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances, pairwise_distances_chunked
from scipy.sparse import vstack
from nltk.corpus.reader.wordnet import NOUN
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [24]:
netflix = pd.read_csv("data/netflix_titles.csv")

In [25]:
netflix_recommend = netflix[["title",  "release_year","listed_in","director", "cast", "description"]]

In [26]:
imdb_movies = pd.read_csv("data/IMDb movies.csv", low_memory=False)

In [113]:
imdb_movies = imdb_movies[(imdb_movies.country=="USA")|(imdb_movies.language.str.contains("English"))]

In [27]:
imdb_recomend = imdb_movies[["original_title", "year", "genre","director", "actors", "description"]]

In [28]:
def remove_spaces(lst):
    spaces_removed = []
    for name in lst:
        spaces_removed.append(name.replace(" ", ""))
    return spaces_removed

In [29]:
def list_for_remove(string):
    string=str(string)
    return " ".join(remove_spaces(string.split(", ")))

In [30]:
def clean_up(imdb_recomend,netflix_recommend):
    #fill any NAN values
    imdb_recomend,netflix_recommend =imdb_recomend.copy(),netflix_recommend.copy()
    imdb_recomend.director.fillna("Unlisted",inplace=True)
    imdb_recomend.actors.fillna("Unavailable",inplace=True)
    imdb_recomend.genre.fillna("Unknown",inplace=True)
    netflix_recommend.listed_in.fillna("Unknown",inplace=True)
    netflix_recommend.director.fillna("Unlisted",inplace=True)
    netflix_recommend.cast.fillna("Unavailable",inplace=True)    
    
    #remove spaces from actors and directors names
    imdb_recomend["actors"] = imdb_recomend["actors"].apply(lambda x: list_for_remove(x))
    imdb_recomend["director"] = imdb_recomend["director"].apply(lambda x: list_for_remove(x))
    imdb_recomend["genre"] = imdb_recomend["genre"].apply(lambda x: list_for_remove(x))
#     imdb_recomend["original_title"] = imdb_recomend["original_title"].apply(lambda x: list_for_remove(x))
    netflix_recommend["listed_in"] = netflix_recommend["listed_in"].apply(lambda x: list_for_remove(x))
    netflix_recommend["cast"] = netflix_recommend["cast"].apply(lambda x: list_for_remove(x))
    netflix_recommend["director"] = netflix_recommend["director"].apply(lambda x: list_for_remove(x))
#     netflix_recommend["title"] = netflix_recommend["title"].apply(lambda x: list_for_remove(x))
    return imdb_recomend, netflix_recommend

In [31]:
imdb_recomend, netflix_recommend = clean_up(imdb_recomend, netflix_recommend)

In [32]:
def get_keywords(imdb_recomend, netflix_recommend):
    imdb_recomend,netflix_recommend =imdb_recomend.copy(),netflix_recommend.copy()
    imdb_recomend.description.fillna("Unknown",inplace=True) 
    netflix_recommend.description.fillna("Unknown",inplace=True) 
    imdb_recomend.description = imdb_recomend.description.apply(lambda x: make_keywords(x))
    netflix_recommend.description = netflix_recommend.description.apply(lambda x: make_keywords(x))
    imdb_recomend.original_title = imdb_recomend.original_title.apply(lambda x: make_keywords(x))
    netflix_recommend.title = netflix_recommend.title.apply(lambda x: make_keywords(x))
    return imdb_recomend, netflix_recommend

In [33]:
def make_keywords(string):
    tokens = word_tokenize(string)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words(['arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek','hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']))
    words = [w for w in words if not w in stop_words]
    lem = WordNetLemmatizer()
    words = [lem.lemmatize(w) for w in words]
    return " ".join(words)

In [34]:
imdb_recomend, netflix_recommend = get_keywords(imdb_recomend, netflix_recommend)

In [35]:
imdb_recomend

Unnamed: 0,original_title,year,genre,director,actors,description
0,miss jerry,1894,Romance,AlexanderBlack,BlancheBayliss WilliamCourtenay ChaunceyDepew,adventure female reporter
1,story kelly gang,1906,Biography Crime Drama,CharlesTait,ElizabethTait JohnTait NormanCampbell BellaCol...,true story notorious australian outlaw kelly
2,sorte drøm,1911,Drama,UrbanGad,AstaNielsen ValdemarPsilander GunnarHelsengree...,two high rank wooing beautiful famous equestri...
3,cleopatra,1912,Drama History,CharlesL.Gaskill,HelenGardner PearlSindelar MissFielding MissRo...,fabled queen egypt affair roman general marc a...
4,linferno,1911,Adventure Drama Fantasy,FrancescoBertolini AdolfoPadovan,SalvatorePapa ArturoPirovano GiuseppedeLiguoro...,loosely adapted dante divine comedy inspired i...
...,...,...,...,...,...,...
85850,lion,2020,Comedy,LudovicColbeau-Justin,DanyBoon PhilippeKaterine AnneSerra SamuelJouy...,psychiatric hospital patient pretend crazy cha...
85851,beentjes sinthildegard,2020,Comedy Drama,JohanNijenhuis,HermanFinkers JohannaterSteege LeonieterBraak ...,middleaged veterinary surgeon belief wife pamp...
85852,padmavyuhathile abhimanyu,2019,Drama,VineeshAaradya,AnoopChandran Indrans SonaNair SimonBrittoRodr...,unknown
85853,sokagin çocuklari,2019,Drama Family,AhmetFaikAkinci,AhmetFaikAkinci BelmaMamati MetinKeçeci Burhan...,unknown


In [36]:
def mashup(imdb):
    str_list = []
    for i in range(imdb.shape[0]):
        if i not in imdb.index:
            continue
        key = imdb.description[i]+" "+imdb.actors[i]+" "+imdb.director[i]+" "+imdb.genre[i]+" "+imdb.year[i]+" "+imdb.original_title[i]
        str_list.append(key)
        print(key)
    return str_list

In [37]:
def net_mashup(netflix):
    str_list = []
    for i in range(netflix.shape[0]):
        key = netflix.description[i]+" "+(netflix.cast[i])+" "+(netflix.director[i])+" "+(netflix.listed_in[i])+" "+str(netflix.release_year[i])+" "+str(netflix.title[i])
        str_list.append(key)
    return str_list

In [38]:
net_list = net_mashup(netflix_recommend)

In [131]:
net_list

['future elite inhabit island paradise far crowded slum get one chance join saved squalor JoãoMiguel BiancaComparato MichelGomes RodolfoValente VanezaOliveira RafaelLozano VivianePorto MelFronckowiak SergioMamberti ZezéMotta CelsoFrateschi Unlisted InternationalTVShows TVDramas TVSci-Fi&Fantasy 2020 ',
 'devastating earthquake hit mexico city trapped survivor walk life wait rescued trying desperately stay alive DemiánBichir HéctorBonilla OscarSerrano AzaliaOrtiz OctavioMichel CarmenBeato JorgeMichelGrau Dramas InternationalMovies 2016 ',
 'army recruit found dead fellow soldier forced confront terrifying secret haunting jungle island training camp TeddChan StellaChung HenleyHii LawrenceKoh TommyKuan JoshLai MarkLee SusanLeong BenjaminLim GilbertChan HorrorMovies InternationalMovies 2011 ',
 'postapocalyptic world ragdoll robot hide fear dangerous machine exterminate brave newcomer join group ElijahWood JohnC.Reilly JenniferConnelly ChristopherPlummer CrispinGlover MartinLandau FredTata

In [71]:
vectorizor = CountVectorizer()

In [74]:
imdb_recomend.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47836 entries, 0 to 85846
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   original_title  47836 non-null  object
 1   year            47836 non-null  object
 2   genre           47836 non-null  object
 3   director        47791 non-null  object
 4   actors          47798 non-null  object
 5   description     47836 non-null  object
dtypes: object(6)
memory usage: 3.6+ MB


In [39]:
imdb_lst = mashup(imdb_recomend)

adventure female reporter BlancheBayliss WilliamCourtenay ChaunceyDepew AlexanderBlack Romance 1894 miss jerry
true story notorious australian outlaw kelly ElizabethTait JohnTait NormanCampbell BellaCola WillCoyne SamCrewes JackEnnis JohnForde VeraLinden Mr.Marshall Mr.McKenzie FrankMills OllieWilson CharlesTait Biography Crime Drama 1906 story kelly gang
two high rank wooing beautiful famous equestrian acrobat stella stella ignores jeweler hirsch accepts count waldberg offer follow home AstaNielsen ValdemarPsilander GunnarHelsengreen EmilAlbes HugoFlink MaryHagen UrbanGad Drama 1911 sorte drøm
fabled queen egypt affair roman general marc antony ultimately disastrous HelenGardner PearlSindelar MissFielding MissRobson HeleneCostello CharlesSindelar Mr.Howard JamesR.Waite Mr.Osborne HarryKnowles Mr.Paul Mr.Brady Mr.Corker CharlesL.Gaskill Drama History 1912 cleopatra
loosely adapted dante divine comedy inspired illustration gustav doré original silent film restored new score tangerine dr

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



unknown TürkanSoray TarikAkan SüleymanTuran MetinSerezli NubarTerziyan AhmetDanyalTopatan MuammerGözalan HasanCeylan AliSeyhan AhmetKostarika MuzafferTema YalçinGülhan MuratDüzer SedefInci ErdoganSeren MehmetDinler Romance 1971 asrin kadini melek seytan
four time personal problem reunite curling team compete bonspiel restore honour JamesB.Douglas MollyParker PaulGross BarbaraGordon MichelleNolden ConnorPrice StanColes JamesAllodi DarrylCasselman Mike'Nug'Nahrgang JedRees JaneSpidell PollyShannon PeterOuterbridge KariMatchett PaulGross Comedy Drama Romance 2002 broom
train station official win lottery hide villager truth come lost mind find emptiness SenerSen MünirÖzkul UgurYücel TulugÇizgen TayfunCoragan GüzinÇoragan AdileNasit KemalInci MügeAkyamaç NecatiBilgiç TuncayAkça ErgunKöknar AytaçÖztuna TomrisOguzalp BilgeZobu KartalTibet Comedy Drama 1986 milyarder
psychic investigation disappearance senator daughter lead dangerous cult truth mysterious past AntonioSabatoJr. EmmanuelleVaugie

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



set shop another diner new client familiar one make deal choose complete task report progress exchange desire fulfilled MichelleAllsopp XanderBerkeley KeeganBoos Unlisted Drama Mystery 2014 booth
rural ireland anglo irish twin rachel edward share strange existence crumbling family estate night property becomes domain sinister presence CharlotteVega BillMilner EugeneSimon DavidBradley DeirdreO'Kane MoeDunford RoisinMurphy BrendanO'Rourke EmmetKelly AnthonyMurphy ElijahEgan MatthewSludds RonanByrne JackO'Malley RachelBennett BrianO'Malley Drama Horror Mystery 2017 lodger
divorced loving father year old girl meet beautiful woman past happens like child DiegoPeretti MaribelVerdú GuadalupeManent MartínPiroyansky HoracioFontova MarinaBellati GuillermoArengo PabloRago JorgelinaAruzzi ErikadeSautuRiestra LucíaMaciel EstebanMenis IairSaid ArielPérezdeMaría ChangHungCheng ArielWinograd Comedy Romance 2015 hijos
rise fall famous clown chocolat first black circus performer revolutionised stagnant 

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [40]:
ivectorizor = CountVectorizer()
ikeys = ivectorizor.fit_transform(imdb_lst,y=imdb_recomend.original_title)

In [18]:
    dbfile = open('mashup.pkl', 'ab')
    # source, destination
    pickle.dump(imdb_lst, dbfile)                     
    dbfile.close()

In [26]:
keys = vectorizor.fit_transform(net_list,y=netflix_recommend.title)

In [None]:
distances = pairwise_distances(ikeys,metric='cosine')

In [17]:
keys

<85855x506938 sparse matrix of type '<class 'numpy.int64'>'
	with 2820909 stored elements in Compressed Sparse Row format>

In [34]:
imdb_recomend.title.to_csv("small_title.csv")

In [18]:
def cosine_similarity_n_space(m1, m2, batch_size=100):
    assert m1.shape[1] == m2.shape[1]
    ret = np.ndarray((m1.shape[0], m2.shape[0]))
    for row_i in range(0, int(m1.shape[0] / batch_size) + 1):
        start = row_i * batch_size
        end = min([(row_i + 1) * batch_size, m1.shape[0]])
        if end <= start:
            break # cause I'm too lazy to elegantly handle edge cases
        rows = m1[start: end]
        sim = cosine_similarity(rows, m2) # rows is O(1) size
        ret[start: end] = sim
    return ret

In [46]:
stuff = np.argsort(distances[7198])[0:11]

In [47]:
netflix_recommend.title[stuff]

7198                       Trash Truck
242            A Trash Truck Christmas
4113                         Mini Wolf
3183                        JingleKids
5044                      Qurious Como
4167                            Molang
1043                             Booba
4788                    Pat a Pat Como
3819    Luna Petunia: Return to Amazia
3258                       Justin Time
4083               Mighty Little Bheem
Name: title, dtype: object

In [48]:
netflix_recommend[netflix_recommend.title.str.contains("Trash")]

Unnamed: 0,title,release_year,listed_in,director,cast,description
242,A Trash Truck Christmas,2020,Children&FamilyMovies,EddieRosas,HenryKeane GlenKeane LucasNeff BrianBaumgartne...,santa crashlands junkyard christmas eve hank t...
7196,Trash,2014,Dramas IndependentMovies Thrillers,StephenDaldry,WagnerMoura MartinSheen RooneyMara SeltonMello...,three poor brazilian teen find something suspi...
7197,Trash Fire,2016,Comedies Dramas IndependentMovies,RichardBatesJr.,AdrianGrenier AngelaTrimbur AnnaLynneMcCord Fi...,surprise news girlfriend pregnant sends loutis...
7198,Trash Truck,2020,Kids'TV,,HenryKeane GlenKeane LucasNeff BrianBaumgartne...,sixyear old hank best pal giant trash truck ex...


In [41]:
idist = pairwise_distances_chunked(ikeys, metric='cosine')

In [42]:
y=0
while (y < imdb_recomend.shape[0]):
    pull = next(idist)
    y += pull.shape[0]
    dbfile = open(f'chunks/imdb_test{y}.pkl', 'ab')
    # source, destination
    pickle.dump(pull, dbfile)                     
    dbfile.close()
        

In [99]:
print(pull.shape[0])
studs = np.argsort(pull[1452])
imdb_recomend.title[studs]

1453


85854                 La vida sense la Sara Amat
52867                             Adutha Chodyam
58619                                 Cletaraxia
85852                  Padmavyuhathile Abhimanyu
50561                               Rowdy Mogudu
                          ...                   
36276                            Where's George?
80560    Howard Lovecraft & the Undersea Kingdom
83058                    The Steam Engines of Oz
44616                        Lovesick: Sick Love
49875                            Bled Number One
Name: title, Length: 85855, dtype: object

In [86]:
po = next(idist)

In [87]:
idist.gi_yieldfrom()

TypeError: 'NoneType' object is not callable

In [88]:
po.shape

(1563, 85855)

In [89]:
stud = np.argsort(po[0])

In [90]:
imdb_recomend.title[stud]

3126                       La rivincita di Tarzan
68186    Kureyon Shinchan: Hendarando no Daiboken
76954                                     Krepost
64531        Crayon Shin-chan: Unkokusai no Yabou
55969                                Donga Police
                           ...                   
80560     Howard Lovecraft & the Undersea Kingdom
83058                     The Steam Engines of Oz
36276                             Where's George?
44616                         Lovesick: Sick Love
49875                             Bled Number One
Name: title, Length: 85855, dtype: object

In [169]:
imdb_recomend.iloc[imdb_recomend.index==38640]

Unnamed: 0,original_title,year,genre,director,actors,description
38640,sweet home alabama,2002,Comedy Romance,AndyTennant,ReeseWitherspoon JoshLucas PatrickDempsey Cand...,young woman reinvented new york city socialite...


In [156]:
star_trek = imdb_movies[imdb_movies.imdb_title_id=='tt0110912'].index[0]

In [187]:
def get_pickle(imdbid):
    ind = imdb_movies[imdb_movies.imdb_title_id==imdbid].index[0]
#     files = ((np.round(((ind/1563)))+1)*1563)
    lst = []
    for i in range(1,56):
        lst.append((i*1563)-1)
    for i in range(len(lst)):
        if ind < lst[i]:
            files = (lst[i])+1
            break
    dbfile = open(f'chunks/imdb_test{int(files)}.pkl', 'rb')
    # source, destination
    pull = pickle.load(dbfile)                     
    dbfile.close()
    return pull, ind

In [119]:
((np.round(((49912/1563))))*1563)//1563
49912%1563

1459

In [104]:
((np.round(((49912/1563)))+1)*1563)

51579.0

In [120]:
85855/1563

54.929622520793345

In [162]:
star_trek

28381

In [158]:
dbfile = open(f'chunks/imdb_test29697.pkl', 'rb')
# source, destination
pull = pickle.load(dbfile)                     
dbfile.close()

In [131]:
lst = []
for i in range(1,56):
    lst.append((i*1563)-1)
for i in range(len(lst)):
    if star_trek < lst[i]:
        files = (lst[i])+1
        break

50015


In [188]:
pull,ind = get_pickle('tt0100802')

In [189]:
ind

25645

In [None]:
for i in range(1,56):
    lst.append((i*1563)-1)
for i in range(len(lst)):
    if ind < lst[i]:
        files = (lst[i])+1
        break

In [184]:
duration = 500

In [185]:
looking_for = 10

In [190]:
np.argsort(pull[0])[0]

25008

In [193]:
pull,ind = get_pickle('tt0100802')
dex = (ind-np.argsort(pull[0])[0])
print(imdb_movies.iloc[np.argsort(pull[dex])[0]].original_title)
recommends = np.argsort(pull[dex])[1:]
imdb_movies.iloc[recommends][imdb_movies.iloc[recommends].duration<duration][0:looking_for]

Total Recall


Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
77752,tt5464234,Kill Switch - La guerra dei mondi,Kill Switch,2017,2017-06-01,"Action, Sci-Fi, Thriller",91,"Netherlands, Germany, USA",English,Tim Smit,...,"Dan Stevens, Bérénice Marlohe, Mike Reus, Bas ...",A pilot battles to save his family and the pla...,4.8,6783,,,,31.0,93.0,41.0
35929,tt0199753,Pianeta rosso,Red Planet,2000,2001-01-12,"Action, Sci-Fi, Thriller",106,"USA, Australia",English,Antony Hoffman,...,"Val Kilmer, Carrie-Anne Moss, Tom Sizemore, Be...","Astronauts, and their robotic dog AMEE (Autono...",5.7,54732,$ 80000000,$ 17480890,$ 33463969,34.0,356.0,150.0
8703,tt0049223,Il pianeta proibito,Forbidden Planet,1956,1956-12-21,"Action, Adventure, Sci-Fi",98,"USA, Japan",English,Fred M. Wilcox,...,"Walter Pidgeon, Anne Francis, Leslie Nielsen, ...",A starship crew goes to investigate the silenc...,7.6,44412,$ 1900000,,,,360.0,101.0
30053,tt0117330,Petticoat Planet,Petticoat Planet,1996,1996-11-26,"Comedy, Romance, Sci-Fi",78,"USA, Romania",English,David DeCoteau,...,"Elizabeth Kaitan, Troy Vincent, Lesli Kay, Bet...",A man crash lands on a Western themed planet i...,2.6,395,,,,,5.0,1.0
32242,tt0133152,Planet of the Apes - Il pianeta delle scimmie,Planet of the Apes,2001,2001-09-14,"Action, Adventure, Sci-Fi",119,USA,English,Tim Burton,...,"Mark Wahlberg, Tim Roth, Helena Bonham Carter,...","In 2029, an Air Force astronaut crash-lands on...",5.7,207951,$ 100000000,$ 180011740,$ 362211740,50.0,1377.0,224.0
57595,tt1388371,Ha-Trempist,Ha-Trempist,1972,1972,"Action, Comedy, Sci-Fi",95,Israel,"English, Hebrew",Amos Sefer,...,"Asher Tzarfati, Shmuel Wolf, Lily Avidan, Tzil...",Incited by a disillusioned young man who has d...,5.0,553,$ 60000,,,,12.0,34.0
18613,tt0078089,Il pianeta dei dinosauri,Planet of Dinosaurs,1977,1979-08-27,"Sci-Fi, Drama",84,USA,English,James K. Shea,...,"Mary Appleseth, Harvey Shain, Derna Wylde, Max...",A space-ship gets lost and is forced to make a...,4.0,1398,,,,,52.0,42.0
73676,tt4083740,Robot World,Robot World,2015,2015-12-04,"Sci-Fi, Thriller",82,UK,English,Neil Rowe,...,"Ian Rowe, Tamsyn Pickford, Neil Rowe, Jacob Pe...",A pilot is marooned on an alien planet and soo...,3.7,363,,,,,28.0,8.0
84323,tt8484586,Neevevaro,Neevevaro,2018,2018-08-24,"Action, Romance, Thriller",130,India,Telugu,Hari Nath,...,"Taapsee Pannu, Ritika Singh, Aadhi, Vennela Ki...","As Kalyan, a blind chef and Vennela, who comes...",6.9,589,,,,,9.0,1.0
69848,tt3105350,Titanium,Vychislitel,2014,2014-12-18,"Action, Sci-Fi, Thriller",82,Russia,Russian,Dmitriy Grachev,...,"Evgeniy Mironov, Anna Chipovskaya, Vinnie Jone...",Ten prisoners condemned to exile on a hostile ...,4.4,954,,,$ 844037,,4.0,21.0


In [217]:
np.argsort(pull[0])

array([37512, 62192, 21894, ..., 32249, 32266, 42927])

In [194]:
imdb_movies[imdb_movies.imdb_title_id == 'tt0082971']

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
20114,tt0082971,I predatori dell'arca perduta,Raiders of the Lost Ark,1981,1981-06-12,"Action, Adventure",115,USA,"English, German, Hebrew, Spanish, Arabic, Nepali",Steven Spielberg,...,"Harrison Ford, Karen Allen, Paul Freeman, Rona...","In 1936, archaeologist and adventurer Indiana ...",8.4,865510,$ 18000000,$ 248159971,$ 390133212,85.0,948.0,258.0


In [161]:
gen = pairwise_distances_chunked(ikeys, metric='cosine', working_memory=0)

In [162]:
pop = next(gen)

In [167]:
sums = np.argsort(pop[0])[0:11]

In [168]:
imdb_movies.iloc[sums]

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
57166,tt1340775,Forecast,Forecast,2008,2009-04-17,"Adventure, Drama, Romance",97,Bulgaria,"English, Bulgarian, Serbo-Croatian, Macedonian...",Zornitsa Sophia,...,"Assen Blatechki, Teodora Duhovnikova, Kresimir...",Romance and political drama tight in an advent...,6.6,308,EUR 1000000,,$ 61331,,2.0,
1583,tt0024151,Pescicani - Contrabbando Giallo,I Cover the Waterfront,1933,1933-05-19,"Drama, Romance",80,USA,English,James Cruze,...,"Ben Lyon, Claudette Colbert, Ernest Torrence, ...",An investigative reporter romances a suspected...,6.3,470,,,,,20.0,3.0
4562,tt0036208,Non c'è tempo per l'amore,No Time for Love,1943,1943-01-01,"Comedy, Romance",83,USA,English,Mitchell Leisen,...,"Claudette Colbert, Fred MacMurray, Ilka Chase,...",An upper-class female reporter is (despite her...,6.9,735,,,,,20.0,10.0
61265,tt1721028,Med cezir manzaralari,Med cezir manzaralari,1989,1989,"Adventure, Drama, Romance",78,Turkey,Turkish,Mahinur Ergun,...,"Kadir Inanir, Zuhal Olcay, Yilmaz Zafer, Bülen...",,6.1,123,,,,,,
77172,tt5240372,I Married an Anti-Fan,I Married an Anti-Fan,2016,2016-06-30,"Comedy, Romance",120,China,"Chinese, Mandarin, Korean",Jae-Young Kim,...,"Chan-Yeol Park, Shanshan Yuan, Seohyun, Chao J...",The story is about a female reporter who marri...,5.9,653,,$ 89408,$ 12194083,,5.0,2.0
752,tt0020018,Notte di tradimento,In Old Arizona,1928,1929-01-20,"Action, Adventure, Romance",95,USA,"English, Spanish, Italian",Irving Cummings,...,"Edmund Lowe, Warner Baxter, Dorothy Burgess","A charming, happy-go-lucky bandit in old Arizo...",5.5,898,,,,,29.0,28.0
99,tt0008309,A Modern Musketeer,A Modern Musketeer,1917,1917-12-30,"Adventure, Comedy, Western",68,USA,English,Allan Dwan,...,"Douglas Fairbanks, Marjorie Daw, Kathleen Kirk...","A restless young man travels west, encounterin...",6.7,257,,,,,11.0,7.0
84674,tt8751976,Dev,Dev,2019,2019-02-14,"Action, Adventure, Romance",157,India,"Tamil, Telugu",Rajath Ravishankar,...,"Karthi, Rakul Preet Singh, Karthik, Prakash Ra...","Dev, a youngster from a well-to-do family, is ...",4.8,757,INR 550000000,,$ 72048,,24.0,6.0
2389,tt0027657,Il giardino di Allah,The Garden of Allah,1936,1936-11-19,"Adventure, Drama, Romance",79,USA,English,Richard Boleslawski,...,"Marlene Dietrich, Charles Boyer, Tilly Losch, ...",The star-crossed desert romance of a cloistere...,5.9,1320,$ 2200000,,,,45.0,22.0


In [213]:
imdb_movies[(imdb_movies.country=="USA")|(imdb_movies.language.str.contains("English"))]

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,,,25.0,3.0
5,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,...,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,484,,,,,13.0,5.0
9,tt0002461,Richard III,Richard III,1912,1912-10-15,Drama,55,"France, USA",English,"André Calmettes, James Keane",...,"Robert Gemp, Frederick Warde, Albert Gardner, ...",Richard of Gloucester uses manipulation and mu...,5.5,225,$ 30000,,,,8.0,1.0
17,tt0003167,Amore di madre,"Home, Sweet Home",1914,1914-05-17,Drama,55,USA,English,D.W. Griffith,...,"Henry B. Walthall, Josephine Crowell, Lillian ...",John Howard Payne at his most miserable point ...,5.8,187,,,,,6.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85837,tt9894470,VFW,VFW,2019,2020-02-14,"Action, Crime, Horror",92,USA,English,Joe Begos,...,"Stephen Lang, William Sadler, Fred Williamson,...",A group of old war veterans put their lives on...,6.1,4178,,,$ 23101,72.0,83.0,94.0
85838,tt9896916,The Pilgrim's Progress,The Pilgrim's Progress,2019,2019-04-18,"Animation, Adventure, Family",108,USA,English,Robert Fernandez,...,"David Thorpe, John Rhys-Davies, Kristyn Getty,...","An epic journey, faithfully adapted to modern-...",5.7,442,,$ 1294596,$ 3173282,,28.0,3.0
85839,tt9898858,Coffee & Kareem,Coffee & Kareem,2020,2020-04-03,"Action, Comedy",88,USA,English,Michael Dowse,...,"Ed Helms, Taraji P. Henson, Terrence Little Ga...",Twelve-year-old Kareem Manning hires a crimina...,5.1,10627,,,,35.0,388.0,64.0
85841,tt9899880,Columbus,Columbus,2018,2018-12-05,"Comedy, Drama",82,Iran,"Persian, English",Hatef Alimardani,...,"Farhad Aslani, Majid Salehi, Saeed Poursamimi,...",A rich family are deciding to immigrate to the...,4.0,209,,,,,,13.0
