In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt


In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies = movies.merge(credits, on ='title')

In [4]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [5]:
movies = movies[['genres','id','keywords','overview','popularity','tagline','title','vote_average','vote_count','cast','crew']]

In [6]:
movies.isnull().sum()

genres            0
id                0
keywords          0
overview          3
popularity        0
tagline         844
title             0
vote_average      0
vote_count        0
cast              0
crew              0
dtype: int64

In [7]:
movies['tagline'].fillna("", inplace = True)

In [8]:
movies.dropna(inplace = True)

In [9]:
movies.shape

(4806, 11)

In [10]:
movies.reset_index(drop=True, inplace = True)

In [11]:
vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
m = vote_counts.quantile(0.50)

def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [12]:
movies['vote_count'] = movies['vote_count'].astype('int')
movies['vote_average'] = movies['vote_average'].astype('int')
movies['wr'] = movies.apply(weighted_rating, axis=1)

In [13]:
movies.head(3)

Unnamed: 0,genres,id,keywords,overview,popularity,tagline,title,vote_average,vote_count,cast,crew,wr
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",150.437577,Enter the World of Pandora.,Avatar,7,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",6.973607
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...",139.082615,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",5.982757
2,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",A cryptic message from Bond’s past sends him o...,107.376788,A Plan No One Escapes,Spectre,6,4466,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",5.982632


In [14]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [15]:
import ast

def convert(obj):
    l=[]
    for i in ast.literal_eval(obj):
        l.append(i["name"])
    return l

In [16]:
movies['genres']=movies['genres'].apply(convert)

In [17]:
movies.iloc[0].genres

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [18]:
movies['keywords']=movies['keywords'].apply(convert)

In [19]:
import ast

def convert1(obj):
    l=[]
    counter =0
    for i in ast.literal_eval(obj):
        if counter < 3:
            l.append(i["name"])
            counter +=1
        else:
            break
    return l

In [20]:
movies['actor']=movies['cast'].apply(convert1)

In [21]:
import ast

def convert2(obj):
    l=[]

    for i in ast.literal_eval(obj):
        if i['job'] == "Director":
            l.append(i["name"])
            break
    return l

In [22]:
movies['crew']=movies['crew'].apply(convert2)

In [23]:
import ast

def convert1(obj):
    l=[]
    counter =0
    for i in ast.literal_eval(obj):
        if counter < 3:
            l.append(i["character"])
            counter +=1
        else:
            break
    return l

In [24]:
movies['character']=movies['cast'].apply(convert1)

In [25]:
movies['overview']=movies['overview'].apply(lambda x :x.split())

In [26]:
movies['tagline']=movies['tagline'].apply(lambda x :x.split())

In [27]:
movies['sub_title']=movies['title'].apply(lambda x :x.split())

In [28]:
movies.head(3)

Unnamed: 0,genres,id,keywords,overview,popularity,tagline,title,vote_average,vote_count,cast,crew,wr,actor,character,sub_title
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...",150.437577,"[Enter, the, World, of, Pandora.]",Avatar,7,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...",[James Cameron],6.973607,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[Jake Sully, Neytiri, Dr. Grace Augustine]",[Avatar]
1,"[Adventure, Fantasy, Action]",285,"[ocean, drug abuse, exotic island, east india ...","[Captain, Barbossa,, long, believed, to, be, d...",139.082615,"[At, the, end, of, the, world,, the, adventure...",Pirates of the Caribbean: At World's End,6,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...",[Gore Verbinski],5.982757,"[Johnny Depp, Orlando Bloom, Keira Knightley]","[Captain Jack Sparrow, Will Turner, Elizabeth ...","[Pirates, of, the, Caribbean:, At, World's, End]"
2,"[Action, Adventure, Crime]",206647,"[spy, based on novel, secret agent, sequel, mi...","[A, cryptic, message, from, Bond’s, past, send...",107.376788,"[A, Plan, No, One, Escapes]",Spectre,6,4466,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...",[Sam Mendes],5.982632,"[Daniel Craig, Christoph Waltz, Léa Seydoux]","[James Bond, Blofeld, Madeleine]",[Spectre]


In [29]:
movies.shape

(4806, 15)

In [30]:
movies['genres']=movies['genres'].apply(lambda x :[i.replace(" ","") for i in x])
movies['keywords_conv']=movies['keywords'].apply(lambda x :[i.replace(" ","") for i in x])
movies['crew']=movies['crew'].apply(lambda x :[i.replace(" ","") for i in x])
movies['actor']=movies['actor'].apply(lambda x :[i.replace(" ","") for i in x])
movies['character']=movies['character'].apply(lambda x :[i.replace(" ","") for i in x])



In [31]:
movies.head(2)

Unnamed: 0,genres,id,keywords,overview,popularity,tagline,title,vote_average,vote_count,cast,crew,wr,actor,character,sub_title,keywords_conv
0,"[Action, Adventure, Fantasy, ScienceFiction]",19995,"[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...",150.437577,"[Enter, the, World, of, Pandora.]",Avatar,7,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...",[JamesCameron],6.973607,"[SamWorthington, ZoeSaldana, SigourneyWeaver]","[JakeSully, Neytiri, Dr.GraceAugustine]",[Avatar],"[cultureclash, future, spacewar, spacecolony, ..."
1,"[Adventure, Fantasy, Action]",285,"[ocean, drug abuse, exotic island, east india ...","[Captain, Barbossa,, long, believed, to, be, d...",139.082615,"[At, the, end, of, the, world,, the, adventure...",Pirates of the Caribbean: At World's End,6,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...",[GoreVerbinski],5.982757,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]","[CaptainJackSparrow, WillTurner, ElizabethSwann]","[Pirates, of, the, Caribbean:, At, World's, End]","[ocean, drugabuse, exoticisland, eastindiatrad..."


In [32]:
movies['tags'] = movies['genres'] + movies['overview'] + movies['tagline'] + movies['keywords_conv']+movies['crew']+movies['actor']+ movies['character']

In [33]:
movies['description'] = movies['genres'] + movies['keywords'] + movies['overview'] + movies['tagline'] + movies['crew']+movies['actor']+movies['character']+movies['sub_title']

In [34]:
new_id = list(range(0,movies.shape[0]))
movies['temp_index']=new_id

In [35]:
new_df = movies[['temp_index','id','title','tags','vote_average','vote_count','popularity','wr','description']]

In [36]:
new_df.head(3)

Unnamed: 0,temp_index,id,title,tags,vote_average,vote_count,popularity,wr,description
0,0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction, I...",7,11800,150.437577,6.973607,"[Action, Adventure, Fantasy, ScienceFiction, c..."
1,1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action, Captain, Barbossa...",6,4500,139.082615,5.982757,"[Adventure, Fantasy, Action, ocean, drug abuse..."
2,2,206647,Spectre,"[Action, Adventure, Crime, A, cryptic, message...",6,4466,107.376788,5.982632,"[Action, Adventure, Crime, spy, based on novel..."


In [37]:
new_df['tags']=new_df['tags'].apply(lambda x :" ".join(x))
new_df['tags']= new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x :" ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']= new_df['tags'].apply(lambda x:x.lower())


In [38]:
new_df.head(2)

Unnamed: 0,temp_index,id,title,tags,vote_average,vote_count,popularity,wr,description
0,0,19995,Avatar,action adventure fantasy sciencefiction in the...,7,11800,150.437577,6.973607,"[Action, Adventure, Fantasy, ScienceFiction, c..."
1,1,285,Pirates of the Caribbean: At World's End,"adventure fantasy action captain barbossa, lon...",6,4500,139.082615,5.982757,"[Adventure, Fantasy, Action, ocean, drug abuse..."


In [39]:
new_df['description']=new_df['description'].apply(lambda x :" ".join(x))
new_df['description']= new_df['description'].apply(lambda x:x.lower())
new_df['description'] = new_df['description'].str.strip('[]').str.replace(' ',' ').str.replace("'",' ').str.replace('"',' ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['description']=new_df['description'].apply(lambda x :" ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['description']= new_df['description'].apply(lambda x:x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['description'] = new_df['description'].str.strip('[]')

In [40]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
def stemm(text):
    y = []
    for i in text.split():
        y.append(stemmer.stem(i))
        
    return " ".join(y)

new_df['tags'] = new_df['tags'].apply(stemm)
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stemm)


In [41]:
new_df['tags']

0       action adventur fantasi sciencefict in the 22n...
1       adventur fantasi action captain barbossa, long...
2       action adventur crime a cryptic messag from bo...
3       action crime drama thriller follow the death o...
4       action adventur sciencefict john carter is a w...
                              ...                        
4801    action crime thriller el mariachi just want to...
4802    comedi romanc a newlyw coupl honeymoon is upen...
4803    comedi drama romanc tvmovi "signed, sealed, de...
4804    when ambiti new york attorney sam is sent to s...
4805    documentari ever sinc the second grade when he...
Name: tags, Length: 4806, dtype: object

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words = 'english')

tfidf = vectorizer.fit_transform(new_df['tags']).toarray()

In [43]:
tfidf.shape

(4806, 43412)

In [44]:
from sklearn.metrics.pairwise import cosine_similarity
similarity1 = cosine_similarity(tfidf)    

In [45]:
similarity1

array([[1.        , 0.02051937, 0.0159563 , ..., 0.0155875 , 0.        ,
        0.        ],
       [0.02051937, 1.        , 0.01217914, ..., 0.01180416, 0.00745192,
        0.        ],
       [0.0159563 , 0.01217914, 1.        , ..., 0.00924531, 0.        ,
        0.        ],
       ...,
       [0.0155875 , 0.01180416, 0.00924531, ..., 1.        , 0.01038072,
        0.01673488],
       [0.        , 0.00745192, 0.        , ..., 0.01038072, 1.        ,
        0.01248439],
       [0.        , 0.        , 0.        , ..., 0.01673488, 0.01248439,
        1.        ]])

In [46]:
similarity1.shape

(4806, 4806)

In [47]:
new_df['similarity_col'] = similarity1.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['similarity_col'] = similarity1.tolist()


In [48]:
def predictMovie1(name):
    try:
        new_entry = new_df[new_df['title']==name].index[0]
        distance = similarity1[new_entry]
#         return distance
        movie_list = sorted(list(enumerate(distance)), reverse = True, key = lambda x : x[1])[0:20]
#         return movie_list
        for i in movie_list:
            print(str(i[1]) +" - " + str(new_df.iloc[i[0]].temp_index) + "-" + new_df.iloc[i[0]].title)
        
    except IndexError as e:
        print('---')
        

In [49]:
# 1.0000000000000002 - 160-How to Train Your Dragon 2
# 0.3855587197941137 - 92-How to Train Your Dragon
# 0.2703066833838685 - 292-Eragon
# 0.25918264132768054 - 1309-Dragon Nest: Warriors' Dawn
# 0.243885054889215 - 746-Reign of Fire
# 0.24001639110857517 - 2449-Dragon Hunters
# 0.2162190653334646 - 2372-Dragonslayer
# 0.20986105065071606 - 3017-Pete's Dragon
# 0.2077188765260499 - 1530-George and the Dragon
# 0.17364857063928021 - 352-Mulan
# 0.16460923218361104 - 152-Kung Fu Panda 3
# 0.15533280567423 - 1318-Rock Star
# 0.147532764253935 - 2142-Team America: World Police
# 0.1445234641710509 - 22-The Hobbit: The Desolation of Smaug
# 0.12929512563908088 - 234-The Croods
# 0.11665910609041501 - 106-Shrek the Third
# 0.11324386774486173 - 678-Dragon Blade
# 0.11148674984308268 - 66-Up
# 0.1106100920009728 - 942-The Book of Life
# 0.11032201008395734 - 506-Despicable Me 2

In [50]:
predictMovie1('Fifty Shades of Grey')

1.0 - 1157-Fifty Shades of Grey
0.17592880339775513 - 3618-Saved!
0.13510363392895844 - 3928-Real Women Have Curves
0.1296351973534074 - 3780-Red State
0.12220766518524193 - 3613-Fifty Shades of Black
0.09659214442475605 - 3482-College
0.09628016992024693 - 4522-To Save A Life
0.09579323724408781 - 4650-The Canyons
0.09480995763575108 - 4446-Latter Days
0.08892230075683803 - 4208-Grace Unplugged
0.08843778815779996 - 4294-Sex, Lies, and Videotape
0.08212963511231809 - 4422-Steppin: The Movie
0.07825887172550784 - 3800-Mondays in the Sun
0.07744339854381185 - 36-Transformers: Age of Extinction
0.07526706664892824 - 4381-Ajami
0.07140930005698491 - 4176-Nowhere Boy
0.07007434231183866 - 4495-After
0.06962818585153968 - 3070-Black or White
0.06841360821631941 - 650-Eyes Wide Shut
0.06644173709161358 - 4183-Higher Ground


In [51]:
# threshold - 0.08222

In [52]:
new_df.iloc[4312]

temp_index                                                     4312
id                                                            86331
title                                                        Desire
tags              drama romanc in a social context deterior by a...
vote_average                                                      4
vote_count                                                      140
popularity                                                20.422246
wr                                                         5.038133
description       drama romance france female nudity sex sexuali...
similarity_col    [0.006625090163270772, 0.00403640968629574, 0....
Name: 4312, dtype: object

In [53]:
new_df[new_df['description'].str.contains(r"\b{}\b".format('marvel'))]

Unnamed: 0,temp_index,id,title,tags,vote_average,vote_count,popularity,wr,description,similarity_col
5,5,559,Spider-Man 3,fantasi action adventur the seem invinc spider...,5,3576,115.699814,5.040487,fantasy action adventure dual identity amnesia...,"[0.032689948612018196, 0.0420275429486921, 0.0..."
7,7,99861,Avengers: Age of Ultron,action adventur sciencefict when toni stark tr...,7,6767,134.279229,6.954639,action adventure sciencefiction marvel comic s...,"[0.03920364303150096, 0.05101555754194145, 0.0..."
16,16,24428,The Avengers,sciencefict action adventur when an unexpect e...,7,11776,144.448633,6.973555,sciencefiction action adventure new york shiel...,"[0.014976470731779067, 0.04644197820444866, 0...."
20,20,1930,The Amazing Spider-Man,action adventur fantasi peter parker is an out...,6,6586,89.866276,5.98803,action adventure fantasy loss of father vigila...,"[0.022202431870110385, 0.029860471669754692, 0..."
26,26,271110,Captain America: Civil War,adventur action sciencefict follow the event o...,7,7241,198.372395,6.957515,adventure action sciencefiction civil war war ...,"[0.042341211754588444, 0.04105070920536811, 0...."
30,30,558,Spider-Man 2,action adventur fantasi peter parker is go thr...,6,4321,35.149586,5.98208,action adventure fantasy dual identity love of...,"[0.010497778944072002, 0.047271690396223934, 0..."
31,31,68721,Iron Man 3,action adventur sciencefict when toni stark wo...,6,8806,77.68208,5.990969,action adventure sciencefiction terrorist war ...,"[0.038495213400598416, 0.02302555559899858, 0...."
33,33,36668,X-Men: The Last Stand,adventur action sciencefict thriller when a cu...,6,3525,3.857526,5.978287,adventure action sciencefiction thriller mutan...,"[0.012003566064186545, 0.023331081086574054, 0..."
38,38,102382,The Amazing Spider-Man 2,"action adventur fantasi for peter parker, life...",6,4179,89.270217,5.981503,action adventure fantasy obsession marvel comi...,"[0.016830236873404097, 0.021520797294458414, 0..."
46,46,127585,X-Men: Days of Future Past,action adventur fantasi sciencefict the ultim ...,7,6032,118.078691,6.94932,action adventure fantasy sciencefiction 1970s ...,"[0.026398193569356777, 0.029393035275868545, 0..."


In [54]:
new_df[new_df['temp_index'] == 1157]

Unnamed: 0,temp_index,id,title,tags,vote_average,vote_count,popularity,wr,description,similarity_col
1157,1157,216015,Fifty Shades of Grey,drama romanc thriller when colleg senior anast...,5,3254,98.755657,5.044223,drama romance thriller based on novel perversi...,"[0.0027305322826506593, 0.003327214176053737, ..."


In [55]:
# popularMdf = new_df.sort_values('popularity', ascending=False)

In [56]:
# popularMdf.head(5)

In [57]:
for i, j in new_df.iterrows():
    if j['title'] == 'Fifty Shades of Grey':
        print(j['temp_index'])

1157


In [59]:
def predictMovie2(name):
    try:
        
        new_entry = new_df[new_df['title']==name].index[0]
        distance = similarity1[new_entry]
#         return distance
        movie_list = sorted(list(enumerate(distance)), reverse = True, key = lambda x : x[1])[0:20]
#         return movie_list
        for i in movie_list:
            print(str(i[1]) +" - " + str(new_df.iloc[i[0]].temp_index) + "-" + new_df.iloc[i[0]].title)
        
    except IndexError as e:
        name2 = name.lower()
        movie_list = new_df[new_df['description'].str.contains(r"\b{}\b".format(name2))]
        w_c = []
        for index, rows in movie_list.iterrows():
#             print(rows['description'])
            w_c.append((index, rows['description'].count(name2), rows['title'], rows['popularity']))
        w_c = sorted(w_c, reverse=True, key = lambda x : x[1])
            
#         print(w_c)
        ml2 = []
        if len(w_c) == 0:
            print('type again')
        elif len(w_c) > 6:
            for i in range(0, 6):
                ml2.append(w_c[i])
        else:
            ml2 = w_c
#         print(ml2)
        ml2p = sorted(ml2, reverse = True, key = lambda x : x[3])
#         return ml2p
        temp_mov_l = []
        for i in ml2p:
            movie_name = i[2]
#             print(movie_name)
            new_entry = new_df[new_df['title']==movie_name].index[0]
            dist = similarity1[new_entry]
#             print(dist)
            movie_list1 = sorted(list(enumerate(dist)), reverse = True, key = lambda x : x[1])[0:6]
#             print(movie_list1)
            for i in movie_list1:
                temp_mov_l.append((i[1],new_df.iloc[i[0]].temp_index,new_df.iloc[i[0]].title,new_df.iloc[i[0]].popularity))
#         print(temp_mov_l)
        temp_l = []
        fin_mov_l = []
        for i in temp_mov_l:
            if i[1] not in temp_l:
                temp_l.append(i[1])
                fin_mov_l.append(i)
            else:
                continue
        fin_mov_l = sorted(fin_mov_l, reverse=True, key = lambda x : x[0])[0:9]
#         print(fin_mov_l)
        for i in fin_mov_l:
            print(str(i[1])+"-"+str(i[0])+"-"+ i[2])
        
    except:
        print("Type Again")
    
    
    
    
    
    
    
    
    
    
    
    
    
#         ret_m_l = []
#         for i in ml2p:
#             movie_name = i[2]
#             new_entry = new_df[new_df['title']==movie_name].index[0]
#             dist = similarity1[new_entry]
# #             print(dist)
#             movie_list1 = sorted(list(enumerate(dist)), reverse = True, key = lambda x : x[1])[0:6]
# #             print(movie_list1)
#             for i in movie_list1:
# #                 ret_m_l.append(str(i[1]) +" - " + str(new_df.iloc[i[0]].temp_index) + "-" + new_df.iloc[i[0]].title)
#                 ret_m_l.append((i[1],new_df.iloc[i[0]].temp_index,new_df.iloc[i[0]].title,new_df.iloc[i[0]].popularity))
# #                 print(str(i[1]) +" - " + str(new_df.iloc[i[0]].temp_index) + "-" + new_df.iloc[i[0]].title)
# #         print(len(ret_m_l))
#         print(ret_m_l)
# #         f_m_l = sorted(list(set(ret_m_l)), reverse=True, key = lambda x : x[3])
        
# #         for i in f_m_l:
# #             print(str(i[1])+"-"+str(i[0])+"-"+ i[2])
    
    
            
            
            
            
#         print(sorted(w_c, reverse=True, key = lambda x : x[1]))
#         return movie_list
        

In [66]:
#0.0788

In [69]:
predictMovie2("Titanic")

0.9999999999999999 - 25-Titanic
0.11847121034236179 - 2149-Ghost Ship
0.09694709777199711 - 2293-I Can Do Bad All By Myself
0.09080047406680264 - 3695-Dear Frankie
0.09034899743292288 - 104-Poseidon
0.08476379289612435 - 3501-The Greatest
0.08338604840728774 - 3216-The Rose
0.08132473974249226 - 17-Pirates of the Caribbean: On Stranger Tides
0.08043494279876243 - 310-In the Heart of the Sea
0.07937810129707813 - 775-Supernova
0.07598749093918332 - 2135-The Black Hole
0.0722878169729196 - 2906-Triangle
0.07034994567713962 - 2302-The Bounty
0.06960781971395671 - 57-WALL·E
0.06927297987084903 - 4613-Pieces of April
0.06896888370375284 - 770-Event Horizon
0.06551305304851172 - 4291-Niagara
0.06527337907639058 - 1-Pirates of the Caribbean: At World's End
0.06493554802911117 - 3701-The Blue Lagoon
0.06434473621401081 - 1565-The Notebook


In [396]:
new_df[new_df['title'] == "Fifty Shades of Grey"]['similarity_col']

1157    [0.0027305322826506593, 0.003327214176053737, ...
Name: similarity_col, dtype: object

In [364]:
PopularMdf = movies.sort_values('popularity', ascending=False)

In [365]:
PopularMdf.head(4)

Unnamed: 0,genres,id,keywords,overview,popularity,tagline,title,vote_average,vote_count,cast,crew,wr,actor,character,sub_title,keywords_conv,tags,temp_index
546,"[Family, Animation, Adventure, Comedy]",211672,"[assistant, aftercreditsstinger, duringcredits...","[Minions, Stuart,, Kevin, and, Bob, are, recru...",875.581305,"[Before, Gru,, they, had, a, history, of, bad,...",Minions,6,4571,"[{""cast_id"": 22, ""character"": ""Scarlet Overkil...",[KyleBalda],5.983012,"[SandraBullock, JonHamm, MichaelKeaton]","[ScarletOverkill(voice), HerbOverkill(voice), ...",[Minions],"[assistant, aftercreditsstinger, duringcredits...","[Family, Animation, Adventure, Comedy, Minions...",546
95,"[Adventure, Drama, ScienceFiction]",157336,"[saving the world, artificial intelligence, fa...","[Interstellar, chronicles, the, adventures, of...",724.247784,"[Mankind, was, born, on, Earth., It, was, neve...",Interstellar,8,10867,"[{""cast_id"": 9, ""character"": ""Joseph Cooper"", ...",[ChristopherNolan],7.950134,"[MatthewMcConaughey, JessicaChastain, AnneHath...","[JosephCooper, MurphCooper, Dr.AmeliaBrand]",[Interstellar],"[savingtheworld, artificialintelligence, fathe...","[Adventure, Drama, ScienceFiction, Interstella...",95
788,"[Action, Adventure, Comedy]",293660,"[anti hero, mercenary, marvel comic, superhero...","[Deadpool, tells, the, origin, story, of, form...",514.569956,"[Witness, the, beginning, of, a, happy, ending]",Deadpool,7,10995,"[{""cast_id"": 99, ""character"": ""Wade Wilson / D...",[TimMiller],6.971716,"[RyanReynolds, MorenaBaccarin, EdSkrein]","[WadeWilson/Deadpool, VanessaCarlysle/Copycat,...",[Deadpool],"[antihero, mercenary, marvelcomic, superhero, ...","[Action, Adventure, Comedy, Deadpool, tells, t...",788
94,"[Action, ScienceFiction, Adventure]",118340,"[marvel comic, spaceship, space, outer space, ...","[Light, years, from, Earth,, 26, years, after,...",481.098624,"[All, heroes, start, somewhere.]",Guardians of the Galaxy,7,9742,"[{""cast_id"": 1, ""character"": ""Peter Quill / St...",[JamesGunn],6.968164,"[ChrisPratt, ZoeSaldana, DaveBautista]","[PeterQuill/Star-Lord, Gamora, DraxtheDestroyer]","[Guardians, of, the, Galaxy]","[marvelcomic, spaceship, space, outerspace, or...","[Action, ScienceFiction, Adventure, Light, yea...",94


In [366]:
PopularMdf['description'] = PopularMdf['genres'] + PopularMdf['keywords'] + PopularMdf['overview'] + PopularMdf['tagline'] + PopularMdf['crew']+PopularMdf['actor']+PopularMdf['character']+PopularMdf['sub_title']

In [367]:
PopularMdf.head(2)

Unnamed: 0,genres,id,keywords,overview,popularity,tagline,title,vote_average,vote_count,cast,crew,wr,actor,character,sub_title,keywords_conv,tags,temp_index,description
546,"[Family, Animation, Adventure, Comedy]",211672,"[assistant, aftercreditsstinger, duringcredits...","[Minions, Stuart,, Kevin, and, Bob, are, recru...",875.581305,"[Before, Gru,, they, had, a, history, of, bad,...",Minions,6,4571,"[{""cast_id"": 22, ""character"": ""Scarlet Overkil...",[KyleBalda],5.983012,"[SandraBullock, JonHamm, MichaelKeaton]","[ScarletOverkill(voice), HerbOverkill(voice), ...",[Minions],"[assistant, aftercreditsstinger, duringcredits...","[Family, Animation, Adventure, Comedy, Minions...",546,"[Family, Animation, Adventure, Comedy, assista..."
95,"[Adventure, Drama, ScienceFiction]",157336,"[saving the world, artificial intelligence, fa...","[Interstellar, chronicles, the, adventures, of...",724.247784,"[Mankind, was, born, on, Earth., It, was, neve...",Interstellar,8,10867,"[{""cast_id"": 9, ""character"": ""Joseph Cooper"", ...",[ChristopherNolan],7.950134,"[MatthewMcConaughey, JessicaChastain, AnneHath...","[JosephCooper, MurphCooper, Dr.AmeliaBrand]",[Interstellar],"[savingtheworld, artificialintelligence, fathe...","[Adventure, Drama, ScienceFiction, Interstella...",95,"[Adventure, Drama, ScienceFiction, saving the ..."


In [368]:
PopularMdf = PopularMdf[['id','temp_index', 'title','description','wr','popularity','vote_count']]

In [369]:
PopularMdf.head(3)

Unnamed: 0,id,temp_index,title,description,wr,popularity,vote_count
546,211672,546,Minions,"[Family, Animation, Adventure, Comedy, assista...",5.983012,875.581305,4571
95,157336,95,Interstellar,"[Adventure, Drama, ScienceFiction, saving the ...",7.950134,724.247784,10867
788,293660,788,Deadpool,"[Action, Adventure, Comedy, anti hero, mercena...",6.971716,514.569956,10995


In [302]:
# genres
# PopularMdf['description'] = PopularMdf['description'].str.strip('[]').str.replace(' ','').str.replace("'",'')
# PopularMdf['description'] = PopularMdf['description'].str.strip('[]').str.replace(' ','').str.replace("'",'').str.replace('"','')

In [370]:
PopularMdf['description']=PopularMdf['description'].apply(lambda x :" ".join(x))
PopularMdf['description']= PopularMdf['description'].apply(lambda x:x.lower())
PopularMdf['description'] = PopularMdf['description'].str.strip('[]').str.replace(' ',' ').str.replace("'",' ').str.replace('"',' ')

In [371]:
PopularMdf['description'][270]

'drama adventure sciencefiction based on novel mars nasa isolation botanist stranded spaceship space engineering survival astronaut science deep space explorer duringcreditsstinger battle for survival during a manned mission to mars, astronaut mark watney is presumed dead after a fierce storm and left behind by his crew. but watney has survived and finds himself stranded and alone on the hostile planet. with only meager supplies, he must draw upon his ingenuity, wit and spirit to subsist and find a way to signal to earth that he is alive. bring him home ridleyscott mattdamon jessicachastain kristenwiig markwatney melissalewis anniemontrose the martian'

In [372]:
PopularMdf['description'] = PopularMdf['description'].str.strip('[]').str.replace(' ',' ').str.replace("'",' ').str.replace('"',' ')

In [373]:
PopularMdf['description'][29]

'action adventure thriller spy secret agent sociopath killer art gallery british secret service istanbul turkey imax uzi booby trap impersonating a police officer macao when bond s latest assignment goes gravely wrong and agents around the world are exposed, mi6 is attacked forcing m to relocate the agency. these events cause her authority and position to be challenged by gareth mallory, the new chairman of the intelligence and security committee. with mi6 now compromised from both inside and out, m is left with one ally she can trust: bond. 007 takes to the shadows - aided only by field agent, eve - following a trail to the mysterious silva, whose lethal and hidden motives have yet to reveal themselves. think on your sins. sammendes danielcraig judidench javierbardem jamesbond m silva skyfall'

In [59]:
n = 'Mouse'
s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
s1.str.contains(r"\b{}\b".format(n))
# p = PopularMdf[PopularMdf['description'].str.contains(r"\bstr(n)\b")]

0     True
1    False
2    False
3    False
4      NaN
dtype: object

In [436]:
c = []
for i, j in p.iterrows():
    if j['description'].count('jamesbond') > 0:
        c.append((j['title'],j['popularity'],j['description'].count('jamesbond')))
    else:
        continue
print(sorted(c,reverse=True, key = lambda x : x[2]))

[('Quantum of Solace', 107.928811, 1), ('Spectre', 107.376788, 1), ('Skyfall', 93.004993, 1), ('Casino Royale', 88.935165, 1), ('GoldenEye', 59.824565, 1), ('Die Another Day', 54.159392, 1), ('Dr. No', 48.901542, 1), ('Goldfinger', 47.812466, 1), ('Tomorrow Never Dies', 42.887121, 1), ('From Russia with Love', 41.298723, 1), ('The World Is Not Enough', 39.604363, 1), ('Diamonds Are Forever', 34.634181, 1), ('Thunderball', 31.036, 1), ('Live and Let Die', 30.465138, 1), ('The Man with the Golden Gun', 30.214716, 1), ('Moonraker', 29.887404, 1), ('You Only Live Twice', 28.675891, 1), ('Licence to Kill', 28.22119, 1), ('A View to a Kill', 27.230493, 1), ('The Spy Who Loved Me', 27.17347, 1), ('For Your Eyes Only', 26.090746, 1), ('Octopussy', 25.633663, 1), ("On Her Majesty's Secret Service", 25.289665, 1), ('Never Say Never Again', 23.380757, 1), ('The Living Daylights', 23.331459, 1)]


In [322]:
PopularMdf.iloc[95].description

'adventure fantasy family witch magic broom school of witchcraft wizardry apparition teenage crush werewolf as harry begins his sixth year at hogwarts, he discovers an old book marked as  property of the half-blood prince , and begins to learn more about lord voldemort s dark past. dark secrets revealed davidyates danielradcliffe rupertgrint emmawatson harrypotter ronweasley hermionegranger harry potter and the half-blood prince'

In [153]:
np.median(similarity1)

0.004576272915725767

In [152]:
# average(cosine_similarities)+alpha*standard_deviation(cosine_similarities)

In [None]:
t = np.average(similarity1)+alpha*standard_deviation(cosine_similarities)

In [188]:
a = 8
# Avatar
aSim = new_df.iloc[3341].similarity_col

T = np.average(aSim) + a * np.std(aSim)

In [171]:
T

0.1553887512883075

In [199]:
aSim = new_df.iloc[17].similarity_col

In [200]:
(0.078 -np.average(aSim))/np.std(aSim)

3.313075545325979

In [175]:
# 3.15
#3.17
# 3.57
# 3.44
# 3.78
# 3.89
# 3.31