In [509]:
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances, pairwise_distances_chunked
from scipy.sparse import vstack
from nltk.corpus.reader.wordnet import NOUN
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [510]:
netflix = pd.read_csv("data/netflix_titles.csv")

In [511]:
netflix_recommend = netflix[["title",  "release_year","listed_in","director", "cast", "description"]]

In [512]:
imdb_movies = pd.read_csv("data/IMDb movies.csv", low_memory=False)

In [513]:
imdb_movies = imdb_movies[(imdb_movies.country=="USA")|(imdb_movies.language.str.contains("English"))]

In [514]:
imdb_recomend = imdb_movies[["original_title", "year", "genre","director", "actors", "description"]]

In [515]:
def remove_spaces(lst):
    spaces_removed = []
    for name in lst:
        spaces_removed.append(name.replace(" ", ""))
    return spaces_removed

In [516]:
def list_for_remove(string):
    string=str(string)
    return " ".join(remove_spaces(string.split(", ")))

In [517]:
def clean_up(imdb_recomend,netflix_recommend):
    #fill any NAN values
    imdb_recomend,netflix_recommend =imdb_recomend.copy(),netflix_recommend.copy()
    imdb_recomend.director.fillna("Unlisted",inplace=True)
    imdb_recomend.actors.fillna("Unavailable",inplace=True)
    imdb_recomend.genre.fillna("Unknown",inplace=True)
    netflix_recommend.listed_in.fillna("Unknown",inplace=True)
    netflix_recommend.director.fillna("Unlisted",inplace=True)
    netflix_recommend.cast.fillna("Unavailable",inplace=True)    
    
    #remove spaces from actors and directors names
    imdb_recomend["actors"] = imdb_recomend["actors"].apply(lambda x: list_for_remove(x))
    imdb_recomend["director"] = imdb_recomend["director"].apply(lambda x: list_for_remove(x))
    imdb_recomend["genre"] = imdb_recomend["genre"].apply(lambda x: list_for_remove(x))
#     imdb_recomend["original_title"] = imdb_recomend["original_title"].apply(lambda x: list_for_remove(x))
    netflix_recommend["listed_in"] = netflix_recommend["listed_in"].apply(lambda x: list_for_remove(x))
    netflix_recommend["cast"] = netflix_recommend["cast"].apply(lambda x: list_for_remove(x))
    netflix_recommend["director"] = netflix_recommend["director"].apply(lambda x: list_for_remove(x))
#     netflix_recommend["title"] = netflix_recommend["title"].apply(lambda x: list_for_remove(x))
    return imdb_recomend, netflix_recommend

In [518]:
imdb_recomend, netflix_recommend = clean_up(imdb_recomend, netflix_recommend)

In [520]:
def get_keywords(imdb_recomend, netflix_recommend):
    imdb_recomend,netflix_recommend =imdb_recomend.copy(),netflix_recommend.copy()
    imdb_recomend.description.fillna("Unknown",inplace=True) 
    netflix_recommend.description.fillna("Unknown",inplace=True) 
    imdb_recomend.description = imdb_recomend.description.apply(lambda x: make_keywords(x,dis=True))
    netflix_recommend.description = netflix_recommend.description.apply(lambda x: make_keywords(x,dis=True))
    imdb_recomend.original_title = imdb_recomend.original_title.apply(lambda x: make_keywords(x))
    netflix_recommend.title = netflix_recommend.title.apply(lambda x: make_keywords(x))
    return imdb_recomend, netflix_recommend

In [521]:
def make_keywords(string,dis=False):
    if dis:
        string=translated.translate(string)
    tokens = word_tokenize(string)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    if not dis:
        stop_words = set(stopwords.words(['arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek','hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']))
    words = [w for w in words if not w in stop_words]
    lem = WordNetLemmatizer()
    words = [lem.lemmatize(w) for w in words]
    return " ".join(words)

In [522]:
imdb_recomend, netflix_recommend = get_keywords(imdb_recomend, netflix_recommend)

In [523]:
imdb_recomend

Unnamed: 0,original_title,year,genre,director,actors,description
0,miss jerry,1894,Romance,AlexanderBlack,BlancheBayliss WilliamCourtenay ChaunceyDepew,adventure female reporter
3,cleopatra,1912,Drama History,CharlesL.Gaskill,HelenGardner PearlSindelar MissFielding MissRo...,fabled queen egypt affair roman general marc a...
5,manger cross jesus nazareth,1912,Biography Drama,SidneyOlcott,R.HendersonBland PercyDyer GeneGauntier AliceH...,account life jesus christ based book new testa...
9,richard iii,1912,Drama,AndréCalmettes JamesKeane,RobertGemp FrederickWarde AlbertGardner JamesK...,richard gloucester us manipulation murder gain...
17,home sweet home,1914,Drama,D.W.Griffith,HenryB.Walthall JosephineCrowell LillianGish D...,john howard payne miserable point life writes ...
...,...,...,...,...,...,...
85837,vfw,2019,Action Crime Horror,JoeBegos,StephenLang WilliamSadler FredWilliamson Marti...,group old veteran put life line defend young w...
85838,pilgrim progress,2019,Animation Adventure Family,RobertFernandez,DavidThorpe JohnRhys-Davies KristynGetty Trist...,epic journey faithfully adapted modernday chri...
85839,coffee kareem,2020,Action Comedy,MichaelDowse,EdHelms TarajiP.Henson TerrenceLittleGardenhig...,twelveyearold kareem manning hire criminal sca...
85841,columbus,2018,Comedy Drama,HatefAlimardani,FarhadAslani MajidSalehi SaeedPoursamimi Shabn...,rich family deciding immigrate usa family manu...


In [524]:
def mashup(imdb):
    str_list = []
    for i in range(imdb.shape[0]):
        if i not in imdb.index:
            continue
        key = imdb.description[i]+" "+imdb.actors[i]+" "+imdb.actors[i]+" "+imdb.actors[i]+" "+imdb.actors[i]+" "+imdb.actors[i]+" "+imdb.director[i]+" "+imdb.director[i]+" "+imdb.director[i]+" "+imdb.genre[i]+" "+imdb.year[i]+" "+imdb.original_title[i]+" "+imdb.original_title[i]+" "+imdb.original_title[i]+" "+imdb.original_title[i]
        str_list.append(key)
        print(key)
    return str_list

In [37]:
def net_mashup(netflix):
    str_list = []
    for i in range(netflix.shape[0]):
        key = netflix.description[i]+" "+(netflix.cast[i])+" "+(netflix.director[i])+" "+(netflix.listed_in[i])+" "+str(netflix.release_year[i])+" "+str(netflix.title[i])
        str_list.append(key)
    return str_list

In [None]:
net_list = net_mashup(netflix_recommend)

In [None]:
net_list

In [71]:
vectorizor = CountVectorizer()

In [74]:
imdb_recomend.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47836 entries, 0 to 85846
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   original_title  47836 non-null  object
 1   year            47836 non-null  object
 2   genre           47836 non-null  object
 3   director        47791 non-null  object
 4   actors          47798 non-null  object
 5   description     47836 non-null  object
dtypes: object(6)
memory usage: 3.6+ MB


In [506]:
imdb_lst = mashup(imdb_recomend)

The adventures of a female reporter in the 1890s. BlancheBayliss WilliamCourtenay ChaunceyDepew BlancheBayliss WilliamCourtenay ChaunceyDepew BlancheBayliss WilliamCourtenay ChaunceyDepew BlancheBayliss WilliamCourtenay ChaunceyDepew BlancheBayliss WilliamCourtenay ChaunceyDepew AlexanderBlack AlexanderBlack AlexanderBlack Romance 1894 Miss Jerry Miss Jerry Miss Jerry Miss Jerry
The fabled queen of Egypt's affair with Roman general Marc Antony is ultimately disastrous for both of them. HelenGardner PearlSindelar MissFielding MissRobson HeleneCostello CharlesSindelar Mr.Howard JamesR.Waite Mr.Osborne HarryKnowles Mr.Paul Mr.Brady Mr.Corker HelenGardner PearlSindelar MissFielding MissRobson HeleneCostello CharlesSindelar Mr.Howard JamesR.Waite Mr.Osborne HarryKnowles Mr.Paul Mr.Brady Mr.Corker HelenGardner PearlSindelar MissFielding MissRobson HeleneCostello CharlesSindelar Mr.Howard JamesR.Waite Mr.Osborne HarryKnowles Mr.Paul Mr.Brady Mr.Corker HelenGardner PearlSindelar MissFielding M

TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [525]:
ivectorizor = CountVectorizer()
ikeys = ivectorizor.fit_transform(imdb_lst,y=imdb_recomend.original_title)

In [18]:
    dbfile = open('mashup.pkl', 'ab')
    # source, destination
    pickle.dump(imdb_lst, dbfile)                     
    dbfile.close()

In [26]:
keys = vectorizor.fit_transform(net_list,y=netflix_recommend.title)

In [None]:
distances = pairwise_distances(ikeys,metric='cosine')

In [17]:
keys

<85855x506938 sparse matrix of type '<class 'numpy.int64'>'
	with 2820909 stored elements in Compressed Sparse Row format>

In [34]:
imdb_recomend.title.to_csv("small_title.csv")

In [18]:
def cosine_similarity_n_space(m1, m2, batch_size=100):
    assert m1.shape[1] == m2.shape[1]
    ret = np.ndarray((m1.shape[0], m2.shape[0]))
    for row_i in range(0, int(m1.shape[0] / batch_size) + 1):
        start = row_i * batch_size
        end = min([(row_i + 1) * batch_size, m1.shape[0]])
        if end <= start:
            break # cause I'm too lazy to elegantly handle edge cases
        rows = m1[start: end]
        sim = cosine_similarity(rows, m2) # rows is O(1) size
        ret[start: end] = sim
    return ret

In [46]:
stuff = np.argsort(distances[7198])[0:11]

In [47]:
netflix_recommend.title[stuff]

7198                       Trash Truck
242            A Trash Truck Christmas
4113                         Mini Wolf
3183                        JingleKids
5044                      Qurious Como
4167                            Molang
1043                             Booba
4788                    Pat a Pat Como
3819    Luna Petunia: Return to Amazia
3258                       Justin Time
4083               Mighty Little Bheem
Name: title, dtype: object

In [48]:
netflix_recommend[netflix_recommend.title.str.contains("Trash")]

Unnamed: 0,title,release_year,listed_in,director,cast,description
242,A Trash Truck Christmas,2020,Children&FamilyMovies,EddieRosas,HenryKeane GlenKeane LucasNeff BrianBaumgartne...,santa crashlands junkyard christmas eve hank t...
7196,Trash,2014,Dramas IndependentMovies Thrillers,StephenDaldry,WagnerMoura MartinSheen RooneyMara SeltonMello...,three poor brazilian teen find something suspi...
7197,Trash Fire,2016,Comedies Dramas IndependentMovies,RichardBatesJr.,AdrianGrenier AngelaTrimbur AnnaLynneMcCord Fi...,surprise news girlfriend pregnant sends loutis...
7198,Trash Truck,2020,Kids'TV,,HenryKeane GlenKeane LucasNeff BrianBaumgartne...,sixyear old hank best pal giant trash truck ex...


In [528]:
idist = pairwise_distances_chunked(ikeys, metric='cosine')

In [529]:
y=0
while (y < imdb_recomend.shape[0]):
    pull = next(idist)
    y += pull.shape[0]
    dbfile = open(f'chunks/weighted_imdb_test{y}.pkl', 'ab')
    # source, destination
    pickle.dump(pull, dbfile)                     
    dbfile.close()
        

In [99]:
print(pull.shape[0])
studs = np.argsort(pull[1452])
imdb_recomend.title[studs]

1453


85854                 La vida sense la Sara Amat
52867                             Adutha Chodyam
58619                                 Cletaraxia
85852                  Padmavyuhathile Abhimanyu
50561                               Rowdy Mogudu
                          ...                   
36276                            Where's George?
80560    Howard Lovecraft & the Undersea Kingdom
83058                    The Steam Engines of Oz
44616                        Lovesick: Sick Love
49875                            Bled Number One
Name: title, Length: 85855, dtype: object

In [86]:
po = next(idist)

In [87]:
idist.gi_yieldfrom()

TypeError: 'NoneType' object is not callable

In [88]:
po.shape

(1563, 85855)

In [89]:
stud = np.argsort(po[0])

In [90]:
imdb_recomend.title[stud]

3126                       La rivincita di Tarzan
68186    Kureyon Shinchan: Hendarando no Daiboken
76954                                     Krepost
64531        Crayon Shin-chan: Unkokusai no Yabou
55969                                Donga Police
                           ...                   
80560     Howard Lovecraft & the Undersea Kingdom
83058                     The Steam Engines of Oz
36276                             Where's George?
44616                         Lovesick: Sick Love
49875                             Bled Number One
Name: title, Length: 85855, dtype: object

In [169]:
imdb_recomend.iloc[imdb_recomend.index==38640]

Unnamed: 0,original_title,year,genre,director,actors,description
38640,sweet home alabama,2002,Comedy Romance,AndyTennant,ReeseWitherspoon JoshLucas PatrickDempsey Cand...,young woman reinvented new york city socialite...


In [156]:
star_trek = imdb_movies[imdb_movies.imdb_title_id=='tt0110912'].index[0]

In [187]:
def get_pickle(imdbid):
    ind = imdb_movies[imdb_movies.imdb_title_id==imdbid].index[0]
#     files = ((np.round(((ind/1563)))+1)*1563)
    lst = []
    for i in range(1,56):
        lst.append((i*1563)-1)
    for i in range(len(lst)):
        if ind < lst[i]:
            files = (lst[i])+1
            break
    dbfile = open(f'chunks/imdb_test{int(files)}.pkl', 'rb')
    # source, destination
    pull = pickle.load(dbfile)                     
    dbfile.close()
    return pull, ind

In [119]:
((np.round(((49912/1563))))*1563)//1563
49912%1563

1459

In [104]:
((np.round(((49912/1563)))+1)*1563)

51579.0

In [120]:
85855/1563

54.929622520793345

In [162]:
star_trek

28381

In [158]:
dbfile = open(f'chunks/imdb_test29697.pkl', 'rb')
# source, destination
pull = pickle.load(dbfile)                     
dbfile.close()

In [131]:
lst = []
for i in range(1,56):
    lst.append((i*1563)-1)
for i in range(len(lst)):
    if star_trek < lst[i]:
        files = (lst[i])+1
        break

50015


In [188]:
pull,ind = get_pickle('tt0100802')

In [189]:
ind

25645

In [None]:
for i in range(1,56):
    lst.append((i*1563)-1)
for i in range(len(lst)):
    if ind < lst[i]:
        files = (lst[i])+1
        break

In [184]:
duration = 500

In [185]:
looking_for = 10

In [190]:
np.argsort(pull[0])[0]

25008

In [193]:
pull,ind = get_pickle('tt0100802')
dex = (ind-np.argsort(pull[0])[0])
print(imdb_movies.iloc[np.argsort(pull[dex])[0]].original_title)
recommends = np.argsort(pull[dex])[1:]
imdb_movies.iloc[recommends][imdb_movies.iloc[recommends].duration<duration][0:looking_for]

Total Recall


Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
77752,tt5464234,Kill Switch - La guerra dei mondi,Kill Switch,2017,2017-06-01,"Action, Sci-Fi, Thriller",91,"Netherlands, Germany, USA",English,Tim Smit,...,"Dan Stevens, Bérénice Marlohe, Mike Reus, Bas ...",A pilot battles to save his family and the pla...,4.8,6783,,,,31.0,93.0,41.0
35929,tt0199753,Pianeta rosso,Red Planet,2000,2001-01-12,"Action, Sci-Fi, Thriller",106,"USA, Australia",English,Antony Hoffman,...,"Val Kilmer, Carrie-Anne Moss, Tom Sizemore, Be...","Astronauts, and their robotic dog AMEE (Autono...",5.7,54732,$ 80000000,$ 17480890,$ 33463969,34.0,356.0,150.0
8703,tt0049223,Il pianeta proibito,Forbidden Planet,1956,1956-12-21,"Action, Adventure, Sci-Fi",98,"USA, Japan",English,Fred M. Wilcox,...,"Walter Pidgeon, Anne Francis, Leslie Nielsen, ...",A starship crew goes to investigate the silenc...,7.6,44412,$ 1900000,,,,360.0,101.0
30053,tt0117330,Petticoat Planet,Petticoat Planet,1996,1996-11-26,"Comedy, Romance, Sci-Fi",78,"USA, Romania",English,David DeCoteau,...,"Elizabeth Kaitan, Troy Vincent, Lesli Kay, Bet...",A man crash lands on a Western themed planet i...,2.6,395,,,,,5.0,1.0
32242,tt0133152,Planet of the Apes - Il pianeta delle scimmie,Planet of the Apes,2001,2001-09-14,"Action, Adventure, Sci-Fi",119,USA,English,Tim Burton,...,"Mark Wahlberg, Tim Roth, Helena Bonham Carter,...","In 2029, an Air Force astronaut crash-lands on...",5.7,207951,$ 100000000,$ 180011740,$ 362211740,50.0,1377.0,224.0
57595,tt1388371,Ha-Trempist,Ha-Trempist,1972,1972,"Action, Comedy, Sci-Fi",95,Israel,"English, Hebrew",Amos Sefer,...,"Asher Tzarfati, Shmuel Wolf, Lily Avidan, Tzil...",Incited by a disillusioned young man who has d...,5.0,553,$ 60000,,,,12.0,34.0
18613,tt0078089,Il pianeta dei dinosauri,Planet of Dinosaurs,1977,1979-08-27,"Sci-Fi, Drama",84,USA,English,James K. Shea,...,"Mary Appleseth, Harvey Shain, Derna Wylde, Max...",A space-ship gets lost and is forced to make a...,4.0,1398,,,,,52.0,42.0
73676,tt4083740,Robot World,Robot World,2015,2015-12-04,"Sci-Fi, Thriller",82,UK,English,Neil Rowe,...,"Ian Rowe, Tamsyn Pickford, Neil Rowe, Jacob Pe...",A pilot is marooned on an alien planet and soo...,3.7,363,,,,,28.0,8.0
84323,tt8484586,Neevevaro,Neevevaro,2018,2018-08-24,"Action, Romance, Thriller",130,India,Telugu,Hari Nath,...,"Taapsee Pannu, Ritika Singh, Aadhi, Vennela Ki...","As Kalyan, a blind chef and Vennela, who comes...",6.9,589,,,,,9.0,1.0
69848,tt3105350,Titanium,Vychislitel,2014,2014-12-18,"Action, Sci-Fi, Thriller",82,Russia,Russian,Dmitriy Grachev,...,"Evgeniy Mironov, Anna Chipovskaya, Vinnie Jone...",Ten prisoners condemned to exile on a hostile ...,4.4,954,,,$ 844037,,4.0,21.0


In [217]:
np.argsort(pull[0])

array([37512, 62192, 21894, ..., 32249, 32266, 42927])

In [194]:
imdb_movies[imdb_movies.imdb_title_id == 'tt0082971']

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
20114,tt0082971,I predatori dell'arca perduta,Raiders of the Lost Ark,1981,1981-06-12,"Action, Adventure",115,USA,"English, German, Hebrew, Spanish, Arabic, Nepali",Steven Spielberg,...,"Harrison Ford, Karen Allen, Paul Freeman, Rona...","In 1936, archaeologist and adventurer Indiana ...",8.4,865510,$ 18000000,$ 248159971,$ 390133212,85.0,948.0,258.0


In [161]:
gen = pairwise_distances_chunked(ikeys, metric='cosine', working_memory=0)

In [162]:
pop = next(gen)

In [167]:
sums = np.argsort(pop[0])[0:11]

In [168]:
imdb_movies.iloc[sums]

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
57166,tt1340775,Forecast,Forecast,2008,2009-04-17,"Adventure, Drama, Romance",97,Bulgaria,"English, Bulgarian, Serbo-Croatian, Macedonian...",Zornitsa Sophia,...,"Assen Blatechki, Teodora Duhovnikova, Kresimir...",Romance and political drama tight in an advent...,6.6,308,EUR 1000000,,$ 61331,,2.0,
1583,tt0024151,Pescicani - Contrabbando Giallo,I Cover the Waterfront,1933,1933-05-19,"Drama, Romance",80,USA,English,James Cruze,...,"Ben Lyon, Claudette Colbert, Ernest Torrence, ...",An investigative reporter romances a suspected...,6.3,470,,,,,20.0,3.0
4562,tt0036208,Non c'è tempo per l'amore,No Time for Love,1943,1943-01-01,"Comedy, Romance",83,USA,English,Mitchell Leisen,...,"Claudette Colbert, Fred MacMurray, Ilka Chase,...",An upper-class female reporter is (despite her...,6.9,735,,,,,20.0,10.0
61265,tt1721028,Med cezir manzaralari,Med cezir manzaralari,1989,1989,"Adventure, Drama, Romance",78,Turkey,Turkish,Mahinur Ergun,...,"Kadir Inanir, Zuhal Olcay, Yilmaz Zafer, Bülen...",,6.1,123,,,,,,
77172,tt5240372,I Married an Anti-Fan,I Married an Anti-Fan,2016,2016-06-30,"Comedy, Romance",120,China,"Chinese, Mandarin, Korean",Jae-Young Kim,...,"Chan-Yeol Park, Shanshan Yuan, Seohyun, Chao J...",The story is about a female reporter who marri...,5.9,653,,$ 89408,$ 12194083,,5.0,2.0
752,tt0020018,Notte di tradimento,In Old Arizona,1928,1929-01-20,"Action, Adventure, Romance",95,USA,"English, Spanish, Italian",Irving Cummings,...,"Edmund Lowe, Warner Baxter, Dorothy Burgess","A charming, happy-go-lucky bandit in old Arizo...",5.5,898,,,,,29.0,28.0
99,tt0008309,A Modern Musketeer,A Modern Musketeer,1917,1917-12-30,"Adventure, Comedy, Western",68,USA,English,Allan Dwan,...,"Douglas Fairbanks, Marjorie Daw, Kathleen Kirk...","A restless young man travels west, encounterin...",6.7,257,,,,,11.0,7.0
84674,tt8751976,Dev,Dev,2019,2019-02-14,"Action, Adventure, Romance",157,India,"Tamil, Telugu",Rajath Ravishankar,...,"Karthi, Rakul Preet Singh, Karthik, Prakash Ra...","Dev, a youngster from a well-to-do family, is ...",4.8,757,INR 550000000,,$ 72048,,24.0,6.0
2389,tt0027657,Il giardino di Allah,The Garden of Allah,1936,1936-11-19,"Adventure, Drama, Romance",79,USA,English,Richard Boleslawski,...,"Marlene Dietrich, Charles Boyer, Tilly Losch, ...",The star-crossed desert romance of a cloistere...,5.9,1320,$ 2200000,,,,45.0,22.0


In [213]:
imdb_movies[(imdb_movies.country=="USA")|(imdb_movies.language.str.contains("English"))]

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,,,25.0,3.0
5,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,...,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,484,,,,,13.0,5.0
9,tt0002461,Richard III,Richard III,1912,1912-10-15,Drama,55,"France, USA",English,"André Calmettes, James Keane",...,"Robert Gemp, Frederick Warde, Albert Gardner, ...",Richard of Gloucester uses manipulation and mu...,5.5,225,$ 30000,,,,8.0,1.0
17,tt0003167,Amore di madre,"Home, Sweet Home",1914,1914-05-17,Drama,55,USA,English,D.W. Griffith,...,"Henry B. Walthall, Josephine Crowell, Lillian ...",John Howard Payne at his most miserable point ...,5.8,187,,,,,6.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85837,tt9894470,VFW,VFW,2019,2020-02-14,"Action, Crime, Horror",92,USA,English,Joe Begos,...,"Stephen Lang, William Sadler, Fred Williamson,...",A group of old war veterans put their lives on...,6.1,4178,,,$ 23101,72.0,83.0,94.0
85838,tt9896916,The Pilgrim's Progress,The Pilgrim's Progress,2019,2019-04-18,"Animation, Adventure, Family",108,USA,English,Robert Fernandez,...,"David Thorpe, John Rhys-Davies, Kristyn Getty,...","An epic journey, faithfully adapted to modern-...",5.7,442,,$ 1294596,$ 3173282,,28.0,3.0
85839,tt9898858,Coffee & Kareem,Coffee & Kareem,2020,2020-04-03,"Action, Comedy",88,USA,English,Michael Dowse,...,"Ed Helms, Taraji P. Henson, Terrence Little Ga...",Twelve-year-old Kareem Manning hires a crimina...,5.1,10627,,,,35.0,388.0,64.0
85841,tt9899880,Columbus,Columbus,2018,2018-12-05,"Comedy, Drama",82,Iran,"Persian, English",Hatef Alimardani,...,"Farhad Aslani, Majid Salehi, Saeed Poursamimi,...",A rich family are deciding to immigrate to the...,4.0,209,,,,,,13.0


In [269]:
for k, v in enumerate(title_lst):
    title_lst[k] = v + " ("+imdb_movies.year.iloc[k]+")"

In [289]:
title_lst

['Miss Jerry (1894)',
 'The Story of the Kelly Gang (1906)',
 'Den sorte drøm (1911)',
 'Cleopatra (1912)',
 "L'Inferno (1911)",
 'From the Manger to the Cross; or, Jesus of Nazareth (1912)',
 'Madame DuBarry (1919)',
 'Quo Vadis? (1913)',
 'Independenta Romaniei (1912)',
 'Richard III (1912)',
 'Atlantis (1913)',
 "Fantômas - À l'ombre de la guillotine (1913)",
 'Ingeborg Holm (1913)',
 'Juve contre Fantômas (1913)',
 "Ma l'amor mio non muore... (1914)",
 'Maudite soit la guerre (1914)',
 'Le mort qui tue (1913)',
 'Home, Sweet Home (1914)',
 'Der Student von Prag (1913)',
 'Traffic in Souls (1913)',
 'Gli ultimi giorni di Pompei (1913)',
 'Assunta Spina (1915)',
 "The Avenging Conscience: or 'Thou Shalt Not Kill' (1914)",
 'The Bargain (1914)',
 'Cabiria (1914)',
 'Cinderella (1914)',
 "L'enfant de Paris (1913)",
 'Fantômas contre Fantômas (1914)',
 'A Florida Enchantment (1914)',
 'Der Golem (1915)',
 'Det hemmelighedsfulde X (1914)',
 'His Majesty, the Scarecrow of Oz (1914)',
 'Hy

In [199]:
pd.read_csv("small_title.csv").title

0                            Miss Jerry
1           The Story of the Kelly Gang
2                        Den sorte drøm
3                             Cleopatra
4                             L'Inferno
                      ...              
85850                           Le lion
85851    De Beentjes van Sint-Hildegard
85852         Padmavyuhathile Abhimanyu
85853                 Sokagin Çocuklari
85854        La vida sense la Sara Amat
Name: title, Length: 85855, dtype: object

In [268]:
title_lst = imdb_movies.original_title.to_list()

In [266]:
len(title_lst)

85855

In [291]:
dbfile = open(f'title_list2.pkl', 'ab')
# source, destination
pickle.dump(title_lst, dbfile)                     
dbfile.close()

In [292]:
dbfile = open(f'title_list2.pkl', 'rb')
# source, destination
title = pickle.load(dbfile)                     
dbfile.close()

In [293]:
title

['Miss Jerry (1894)',
 'The Story of the Kelly Gang (1906)',
 'Den sorte drøm (1911)',
 'Cleopatra (1912)',
 "L'Inferno (1911)",
 'From the Manger to the Cross; or, Jesus of Nazareth (1912)',
 'Madame DuBarry (1919)',
 'Quo Vadis? (1913)',
 'Independenta Romaniei (1912)',
 'Richard III (1912)',
 'Atlantis (1913)',
 "Fantômas - À l'ombre de la guillotine (1913)",
 'Ingeborg Holm (1913)',
 'Juve contre Fantômas (1913)',
 "Ma l'amor mio non muore... (1914)",
 'Maudite soit la guerre (1914)',
 'Le mort qui tue (1913)',
 'Home, Sweet Home (1914)',
 'Der Student von Prag (1913)',
 'Traffic in Souls (1913)',
 'Gli ultimi giorni di Pompei (1913)',
 'Assunta Spina (1915)',
 "The Avenging Conscience: or 'Thou Shalt Not Kill' (1914)",
 'The Bargain (1914)',
 'Cabiria (1914)',
 'Cinderella (1914)',
 "L'enfant de Paris (1913)",
 'Fantômas contre Fantômas (1914)',
 'A Florida Enchantment (1914)',
 'Der Golem (1915)',
 'Det hemmelighedsfulde X (1914)',
 'His Majesty, the Scarecrow of Oz (1914)',
 'Hy

In [226]:
imdb_movies.original_title[imdb_movies.original_title.str.contains(texts)]

28381    Pulp Fiction
Name: original_title, dtype: object

In [230]:
less = imdb_movies[['imdb_title_id','original_title','year','genre','duration','country','language']].copy()

In [232]:
less.to_csv('movies_less_info.csv')

In [233]:
less.original_title

0                            Miss Jerry
1           The Story of the Kelly Gang
2                        Den sorte drøm
3                             Cleopatra
4                             L'Inferno
                      ...              
85850                           Le lion
85851    De Beentjes van Sint-Hildegard
85852         Padmavyuhathile Abhimanyu
85853                 Sokagin Çocuklari
85854        La vida sense la Sara Amat
Name: original_title, Length: 85855, dtype: object

In [256]:
test = 'Rocky'

In [248]:
if test not in less.original_title.values:
    print('Ouch')

In [249]:
test in less.original_title.values

True

In [255]:
less.original_title.values[0].contains('1')

AttributeError: 'str' object has no attribute 'contains'

In [258]:
less[less.original_title==test]

Unnamed: 0,imdb_title_id,original_title,year,genre,duration,country,language
17688,tt0075148,Rocky,1976,"Drama, Sport",120,USA,English
36757,tt0215132,Rocky,1981,"Action, Romance",151,India,Hindi
50228,tt0814295,Rocky,2006,"Action, Drama, Romance",125,India,"English, Hindi"


In [280]:
title_lst[0].replace(")","").split(" (")

['Miss Jerry', '1894']

In [294]:
posters = pd.read_csv('data/MovieGenre.csv')

In [298]:
post = posters[['imdbId','Poster']].copy()

In [300]:
post.dropna(inplace=True)

In [302]:
post.to_csv("poster_image.csv")

In [303]:
imdb_movies.iloc[recommends].imdb_title_id[0:10].values

array(['tt5464234', 'tt0199753', 'tt0049223', 'tt0117330', 'tt0133152',
       'tt1388371', 'tt0078089', 'tt4083740', 'tt8484586', 'tt3105350'],
      dtype=object)

In [321]:
for vals in imdb_movies.iloc[recommends].imdb_title_id[0:10].values:
    val = int(vals[2:])
    if val in post.imdbId.values:
        print(post[post.imdbId==val].Poster.values[0])

https://images-na.ssl-images-amazon.com/images/M/MV5BMTY2MzE0MjAwOF5BMl5BanBnXkFtZTYwNDM4Mzg2._V1_UX182_CR0,0,182,268_AL_.jpg
https://images-na.ssl-images-amazon.com/images/M/MV5BOTdmODZkNmQtYjU4Mi00MTcyLTg5YmEtNmVjMWU1M2Y5NzgyXkEyXkFqcGdeQXVyNDYyMDk5MTU@._V1_UX182_CR0,0,182,268_AL_.jpg
https://images-na.ssl-images-amazon.com/images/M/MV5BY2RlMDhlY2MtMjQ1Zi00NzI5LTgxOTgtZjliNWMzYTY3NWZkL2ltYWdlL2ltYWdlXkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_UX182_CR0,0,182,268_AL_.jpg
https://images-na.ssl-images-amazon.com/images/M/MV5BMTU0NDI2ODQxOF5BMl5BanBnXkFtZTgwNTI1NDAxMDE@._V1_UX182_CR0,0,182,268_AL_.jpg
https://images-na.ssl-images-amazon.com/images/M/MV5BMTY2Nzc2Njc4MF5BMl5BanBnXkFtZTgwMDAzMDU2MzE@._V1_UY268_CR20,0,182,268_AL_.jpg


In [306]:
for vals in imdb_movies.iloc[recommends].imdb_title_id[0:10].values:
    print(vals[2:])

5464234
0199753
0049223
0117330
0133152
1388371
0078089
4083740
8484586
3105350


In [309]:
if 113497 in post.imdbId.values:
    print("yeah!")

yeah!


In [311]:
type(post.imdbId.values[0])

numpy.int64

In [314]:
int(imdb_movies.iloc[recommends].imdb_title_id[0:10].values[0][2:])

5464234

In [None]:
<img src='https://images-na.ssl-images-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_UX182_CR0,0,182,268_AL_.jpg'>


In [323]:
imdb_movies.iloc[recommends][imdb_movies.iloc[recommends].duration<duration].original_title[0:looking_for].values

array(['Kill Switch', 'Red Planet', 'Forbidden Planet',
       'Petticoat Planet', 'Planet of the Apes', 'Ha-Trempist',
       'Planet of Dinosaurs', 'Robot World', 'Neevevaro', 'Vychislitel'],
      dtype=object)

In [324]:
imdb_movies.iloc[recommends].original_title[0:10].values

array(['Kill Switch', 'Red Planet', 'Forbidden Planet',
       'Petticoat Planet', 'Planet of the Apes', 'Ha-Trempist',
       'Planet of Dinosaurs', 'Robot World', 'Neevevaro', 'Vychislitel'],
      dtype=object)

In [326]:
type(imdb_movies.duration[0])

numpy.int64

In [330]:
lst = imdb_movies.genre.unique()

In [332]:
len(lst)

1257

In [333]:
gen = "Action"

In [334]:
imdb_movies.original_title[imdb_movies.genre.str.contains(gen)]

36                                  The Perils of Pauline
37                                          The Squaw Man
61                                           Les vampires
63                           20,000 Leagues Under the Sea
80                                        Reggie Mixes In
                               ...                       
85799    Code Geass: Lelouch of the Rebellion Episode III
85830                                       Bulletproof 2
85837                                                 VFW
85839                                     Coffee & Kareem
85843                                              Kaithi
Name: original_title, Length: 12948, dtype: object

In [336]:
type(imdb_movies.duration.min())

int

In [339]:
np.sort(imdb_movies.duration.unique())

array([ 41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
       106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
       119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
       132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
       145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
       158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170,
       171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183,
       184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196,
       197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
       210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 22

In [355]:
zip_test_titles = imdb_movies.original_title[:10].values

In [358]:
zip_test_urls = post.Poster[:10].values

In [359]:
for titles, urls in zip(zip_test_titles,zip_test_urls):
    print(f"{titles}: {urls}\n")

Miss Jerry: https://images-na.ssl-images-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_UX182_CR0,0,182,268_AL_.jpg

The Story of the Kelly Gang: https://images-na.ssl-images-amazon.com/images/M/MV5BZTk2ZmUwYmEtNTcwZS00YmMyLWFkYjMtNTRmZDA3YWExMjc2XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_UY268_CR10,0,182,268_AL_.jpg

Den sorte drøm: https://images-na.ssl-images-amazon.com/images/M/MV5BMjQxM2YyNjMtZjUxYy00OGYyLTg0MmQtNGE2YzNjYmUyZTY1XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_UX182_CR0,0,182,268_AL_.jpg

Cleopatra: https://images-na.ssl-images-amazon.com/images/M/MV5BMTczMTMyMTgyM15BMl5BanBnXkFtZTcwOTc4OTQyMQ@@._V1_UY268_CR4,0,182,268_AL_.jpg

L'Inferno: https://images-na.ssl-images-amazon.com/images/M/MV5BOTEyNzg5NjYtNDU4OS00MWYxLWJhMTItYWU4NTkyNDBmM2Y0XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_UX182_CR0,0,182,268_AL_.jpg

From the Manger to the Cross; or, Jesus of Nazareth: https://images-na.ssl-images-amazon.com/images/M/MV5BNGMwNzUwNjYtZWM5NS00YzMyLWI4NjA

In [365]:
genres_maybe = imdb_movies.genre.unique()

In [367]:
len(genres_maybe)

1257

In [370]:
collect = []
for genre in genres_maybe:
    collect.append(genre.split(", "))

In [373]:
set_test = set()
    

In [376]:
for val in collect:
    if isinstance(val,list):
        for tes in val:
            set_test.add(tes)
    else:
        set_test.add(val)

In [377]:
set_test

{'Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Thriller',
 'War',
 'Western'}

In [409]:
from translate import Translator


In [411]:
import translate

In [445]:
translate.providers

<module 'translate.providers' from '/home/stingl/anaconda3/lib/python3.8/site-packages/translate/providers/__init__.py'>

In [443]:
bob=translate.translate.Translator(from_lang="ja", to_lang='en')

In [444]:
bob.translate("Doragon bôru Z")

'Doragon bôru Z'

In [388]:
transla.translate("Nippon")

'Nippon'

In [454]:
from googletrans import Translator

In [455]:
googletrans.Translator()

<googletrans.client.Translator at 0x7fa36824b760>

In [456]:
steve = Translator()

In [457]:
print(googletrans.LANGUAGES)

{'af': 'afrikaans', 'sq': 'albanian', 'am': 'amharic', 'ar': 'arabic', 'hy': 'armenian', 'az': 'azerbaijani', 'eu': 'basque', 'be': 'belarusian', 'bn': 'bengali', 'bs': 'bosnian', 'bg': 'bulgarian', 'ca': 'catalan', 'ceb': 'cebuano', 'ny': 'chichewa', 'zh-cn': 'chinese (simplified)', 'zh-tw': 'chinese (traditional)', 'co': 'corsican', 'hr': 'croatian', 'cs': 'czech', 'da': 'danish', 'nl': 'dutch', 'en': 'english', 'eo': 'esperanto', 'et': 'estonian', 'tl': 'filipino', 'fi': 'finnish', 'fr': 'french', 'fy': 'frisian', 'gl': 'galician', 'ka': 'georgian', 'de': 'german', 'el': 'greek', 'gu': 'gujarati', 'ht': 'haitian creole', 'ha': 'hausa', 'haw': 'hawaiian', 'iw': 'hebrew', 'he': 'hebrew', 'hi': 'hindi', 'hmn': 'hmong', 'hu': 'hungarian', 'is': 'icelandic', 'ig': 'igbo', 'id': 'indonesian', 'ga': 'irish', 'it': 'italian', 'ja': 'japanese', 'jw': 'javanese', 'kn': 'kannada', 'kk': 'kazakh', 'km': 'khmer', 'ko': 'korean', 'ku': 'kurdish (kurmanji)', 'ky': 'kyrgyz', 'lo': 'lao', 'la': 'lat

In [458]:
steve

<googletrans.client.Translator at 0x7fa3682c90a0>

In [459]:
translated = steve.translate('Бороди́нское сраже́ние')

AttributeError: 'NoneType' object has no attribute 'group'

In [463]:
from googletrans import Translator

In [464]:
translator = Translator()

In [465]:
result = translator.translate('Mitä sinä teet')

AttributeError: 'NoneType' object has no attribute 'group'

In [466]:

from googletrans import Translator

detector = Translator()

dec_lan = detector.detect('이 문장은 한글로 쓰여졌습니다.')

print(dec_lan)



AttributeError: 'NoneType' object has no attribute 'group'

In [467]:
from deep_translator import GoogleTranslator

In [468]:
translated = GoogleTranslator(source='auto', target='en')

In [470]:
translated.translate("Hola, mi amour")

'Hello my love'

In [490]:
for val in imdb_movies.dropna().description.values:
    if val :
        print(translated.translate(val))

In a futuristic city sharply divided between the working class and the city planners, the son of the city's mastermind falls in love with a working class prophet who predicts the coming of a savior to mediate their differences.
With the aid of a wealthy erratic tippler, a dewy-eyed tramp who has fallen in love with a sightless flower girl accumulates money to be able to help her medically.
The Tramp struggles to live in modern industrial society with the help of a young homeless woman.
A wanted gangster is both king and prisoner of the Casbah. He is protected from arrest by his friends, but is torn by his desire for freedom outside. A visiting Parisian beauty may just tempt his fate.
Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household.
A manipulative woman and a roguish man conduct a turbulent romance during the American Civil War and Reconstruction periods.
A naive man is appointed to fill a vacanc

In [476]:
imdb_movies[imdb_movies.language.str.contains('German')]

ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [477]:
trans_test = imdb_movies.language.fillna("Not Provided")

In [479]:
trans_test

0                 None
1                 None
2         Not Provided
3              English
4              Italian
             ...      
85850           French
85851    German, Dutch
85852        Malayalam
85853          Turkish
85854          Catalan
Name: language, Length: 85855, dtype: object

In [486]:
imdb_movies.iloc[4].description

"Loosely adapted from Dante's Divine Comedy and inspired by the illustrations of Gustav Doré the original silent film has been restored and has a new score by Tangerine Dream."

In [488]:
translated.translate(imdb_movies.iloc[700].description)

'The business tycoon Nicolas Saccard is nearly ruined by his rival Gunderman, when he tries to raise capital for his company. To push up the price of his stock, Saccard plans a publicity ...'