In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
#from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

In [3]:
# Directory containing the cleaned input CSVs; read_file() resolves file names against it.
# NOTE(review): absolute, machine-specific path - the notebook will not run elsewhere
# without editing it; consider a configurable/relative data directory.
input_cleaned_file_path = '/Users/gauridhumal/Development Projects/UOL-PROJECTs/CRS/DS/crs_ds/data/processed/'
# Directory where write_file() saves its output CSVs.
output_files = '/Users/gauridhumal/Development Projects/UOL-PROJECTs/CRS/DS/crs_ds/data/processed/outputs'

In [4]:
def read_file(file_name):
    """Load a CSV from the cleaned-input directory into a DataFrame.

    Parameters
    ----------
    file_name : str
        Path of the CSV relative to ``input_cleaned_file_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    # Use a context manager so the file handle is closed as soon as pandas
    # has finished parsing; the bare open() used before leaked the handle.
    with open(f"{input_cleaned_file_path}/{file_name}", 'r') as csv_handle:
        return pd.read_csv(csv_handle)

In [5]:
def write_file(df, file_name):
    """Save ``df`` as a CSV (without the index column) in the output directory.

    ``file_name`` is resolved against ``output_files``. The value returned by
    ``DataFrame.to_csv`` is passed straight back to the caller.
    """
    destination = f"{output_files}/{file_name}"
    outcome = df.to_csv(destination, index=False)
    return outcome

In [6]:
# Load the movie plot-summary table from the cleaned-input directory.
df_movie_summary = read_file('MovieSummaries/wiki_movie_summary.csv')

In [7]:
df_movie_summary.head()

Unnamed: 0,m_plot_summary,cleaned_m_plot_summary,m_freebase_id,m_title,m_release_date,m_box_off_revenue,m_runtime,m_languages,m_countries,m_genres,m_wikipedia_id
0,"Shlykov, a hard-working taxi driver and Lyosha...","Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,Taxi Blues,1990-09-07,,110.0,"{""/m/06b_j"": ""Russian Language""}","{""/m/0f8l9c"": ""France"", ""/m/05vz3zq"": ""Soviet ...","{""/m/07s9rl0"": ""Drama"", ""/m/03q4nz"": ""World ci...",23890098
1,The nation of Panem consists of a wealthy Capi...,The nation of Panem consists of a wealthy Capi...,/m/0gkz15s,The Hunger Games,2012-03-12,686533290.0,142.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/03btsm8"": ""Action/Adventure"", ""/m/06n90"":...",31186339
2,Poovalli Induchoodan is sentenced for six yea...,Poovalli Induchoodan is sentenced for six yea...,/m/051zjwb,Narasimham,2000,,175.0,"{""/m/0999q"": ""Malayalam Language""}","{""/m/03rk0"": ""India""}","{""/m/04t36"": ""Musical"", ""/m/02kdv5l"": ""Action""...",20663735
3,"The Lemon Drop Kid , a New York City swindler,...","The Lemon Drop Kid , a New York City swindler,...",/m/06xtz3,The Lemon Drop Kid,1951-03-08,2300000.0,91.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06qm3"": ""Screwball comedy"", ""/m/01z4y"": ""...",2231378
4,Seventh-day Adventist Church pastor Michael Ch...,Seventh-day Adventist Church pastor Michael Ch...,/m/02tqm5,A Cry in the Dark,1988-11-03,6908797.0,121.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",595909


In [8]:
# Break m_release_date on '-' and attach the pieces as year / month / day columns.
# Dates that only carry a year (e.g. "2000") leave month and day empty.
release_date_parts = df_movie_summary['m_release_date'].str.split('-', expand=True)
release_date_parts = release_date_parts.rename(columns={0:'year', 1:'month', 2:'day'})
df_movie_summary = df_movie_summary.join(release_date_parts)

In [9]:
df_movie_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42303 entries, 0 to 42302
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   m_plot_summary          42303 non-null  object 
 1   cleaned_m_plot_summary  42303 non-null  object 
 2   m_freebase_id           42204 non-null  object 
 3   m_title                 42204 non-null  object 
 4   m_release_date          39586 non-null  object 
 5   m_box_off_revenue       7587 non-null   float64
 6   m_runtime               35580 non-null  float64
 7   m_languages             42204 non-null  object 
 8   m_countries             42204 non-null  object 
 9   m_genres                42204 non-null  object 
 10  m_wikipedia_id          42303 non-null  int64  
 11  year                    39586 non-null  object 
 12  month                   24322 non-null  object 
 13  day                     22944 non-null  object 
dtypes: float64(2), int64(1), object(11)
me

In [10]:
# Load the cleaned IMDb title-basics table from the cleaned-input directory.
df_imdb_movie = read_file('imdb/cleaned_title_basics.csv')

In [11]:
df_imdb_movie.shape

(10232298, 9)

In [12]:
df_imdb_movie.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [13]:
df_imdb_movie[(df_imdb_movie['originalTitle']=='The Hunger Games')& (df_imdb_movie['titleType'] == 'movie')]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
3110874,tt1392170,movie,The Hunger Games,The Hunger Games,0,2012,\N,142,"Action,Adventure,Sci-Fi"


In [14]:
# Keep only the IMDb rows whose titleType is exactly 'movie'.
is_movie_title = df_imdb_movie['titleType'] == 'movie'
df_imdb_movie_filtered = df_imdb_movie.loc[is_movie_title]

In [15]:
df_imdb_movie_filtered.shape

(659696, 9)

In [16]:
df_imdb_movie_filtered.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama


In [17]:
# How many summary titles (case-insensitive) appear among the IMDb movie primaryTitle values?
(df_movie_summary['m_title'].str.lower().isin(df_imdb_movie_filtered['primaryTitle'].str.lower())).value_counts()

m_title
True     31170
False    11133
Name: count, dtype: int64

In [18]:
# Same coverage check, this time against the IMDb originalTitle values.
(df_movie_summary['m_title'].str.lower().isin(df_imdb_movie_filtered['originalTitle'].str.lower())).value_counts()

m_title
True     29359
False    12944
Name: count, dtype: int64

In [19]:
# Coverage when a summary title may match either the IMDb primaryTitle or the originalTitle.
((df_movie_summary['m_title'].str.lower().isin(df_imdb_movie_filtered['primaryTitle'].str.lower())) | (df_movie_summary['m_title'].str.lower().isin(df_imdb_movie_filtered['originalTitle'].str.lower()))).value_counts()

m_title
True     32643
False     9660
Name: count, dtype: int64

In [20]:
# Select the summaries whose lower-cased title occurs among the IMDb movie
# primaryTitle or originalTitle values (also lower-cased).
# NOTE(review): isin() may treat a missing m_title as matching a missing IMDb
# title - confirm whether rows without a title should be kept here.
summary_titles_lower = df_movie_summary['m_title'].str.lower()
matches_primary_title = summary_titles_lower.isin(df_imdb_movie_filtered['primaryTitle'].str.lower())
matches_original_title = summary_titles_lower.isin(df_imdb_movie_filtered['originalTitle'].str.lower())
df_summary_found = df_movie_summary[matches_primary_title | matches_original_title]

In [21]:
df_summary_found.shape

(32643, 14)

In [22]:
df_summary_found.head()

Unnamed: 0,m_plot_summary,cleaned_m_plot_summary,m_freebase_id,m_title,m_release_date,m_box_off_revenue,m_runtime,m_languages,m_countries,m_genres,m_wikipedia_id,year,month,day
0,"Shlykov, a hard-working taxi driver and Lyosha...","Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,Taxi Blues,1990-09-07,,110.0,"{""/m/06b_j"": ""Russian Language""}","{""/m/0f8l9c"": ""France"", ""/m/05vz3zq"": ""Soviet ...","{""/m/07s9rl0"": ""Drama"", ""/m/03q4nz"": ""World ci...",23890098,1990,9.0,7.0
1,The nation of Panem consists of a wealthy Capi...,The nation of Panem consists of a wealthy Capi...,/m/0gkz15s,The Hunger Games,2012-03-12,686533290.0,142.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/03btsm8"": ""Action/Adventure"", ""/m/06n90"":...",31186339,2012,3.0,12.0
2,Poovalli Induchoodan is sentenced for six yea...,Poovalli Induchoodan is sentenced for six yea...,/m/051zjwb,Narasimham,2000,,175.0,"{""/m/0999q"": ""Malayalam Language""}","{""/m/03rk0"": ""India""}","{""/m/04t36"": ""Musical"", ""/m/02kdv5l"": ""Action""...",20663735,2000,,
3,"The Lemon Drop Kid , a New York City swindler,...","The Lemon Drop Kid , a New York City swindler,...",/m/06xtz3,The Lemon Drop Kid,1951-03-08,2300000.0,91.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06qm3"": ""Screwball comedy"", ""/m/01z4y"": ""...",2231378,1951,3.0,8.0
4,Seventh-day Adventist Church pastor Michael Ch...,Seventh-day Adventist Church pastor Michael Ch...,/m/02tqm5,A Cry in the Dark,1988-11-03,6908797.0,121.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",595909,1988,11.0,3.0


In [23]:
def find_imdb_id(title,year):
    """Return the IMDb ids (tconst) of movies matching a title and start year.

    The title comparison is case-insensitive and succeeds when either the
    IMDb ``originalTitle`` or ``primaryTitle`` equals ``title``. The year is
    compared, as a string, against ``startYear``.

    Parameters
    ----------
    title : str
        Movie title to look for.
    year : str or int
        Release year; converted with ``str()`` before comparing.

    Returns
    -------
    numpy.ndarray
        Object array of matching ``tconst`` values, in the row order of
        ``df_imdb_movie_filtered``; empty when nothing matches or when
        ``title`` / ``year`` is missing.
    """
    # A missing title would be stringified to 'nan' and could wrongly match a
    # movie literally titled "Nan"; a missing year can never match. Bail out
    # early instead of scanning the whole IMDb table.
    if pd.isna(title) or pd.isna(year):
        return np.array([], dtype=object)

    title = str(title).lower()
    year = str(year)
    same_title = (
        (df_imdb_movie_filtered['originalTitle'].str.lower() == title)
        | (df_imdb_movie_filtered['primaryTitle'].str.lower() == title)
    )
    same_year = df_imdb_movie_filtered['startYear'] == year
    return df_imdb_movie_filtered.loc[same_title & same_year, 'tconst'].values



In [24]:
title = "Taxi Blues"
year = "1990"
find_imdb_id(title,year)


array(['tt0100757'], dtype=object)

In [25]:
title = "Destination Meatball"
year = "1951"

In [26]:
find_imdb_id(title,year)

array([], dtype=object)

In [27]:
df_movie_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42303 entries, 0 to 42302
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   m_plot_summary          42303 non-null  object 
 1   cleaned_m_plot_summary  42303 non-null  object 
 2   m_freebase_id           42204 non-null  object 
 3   m_title                 42204 non-null  object 
 4   m_release_date          39586 non-null  object 
 5   m_box_off_revenue       7587 non-null   float64
 6   m_runtime               35580 non-null  float64
 7   m_languages             42204 non-null  object 
 8   m_countries             42204 non-null  object 
 9   m_genres                42204 non-null  object 
 10  m_wikipedia_id          42303 non-null  int64  
 11  year                    39586 non-null  object 
 12  month                   24322 non-null  object 
 13  day                     22944 non-null  object 
dtypes: float64(2), int64(1), object(11)
me

In [28]:
s = df_movie_summary.shape

In [29]:
s[0]

42303

In [30]:
# Add an empty imdb_id column. df_summary_found is a boolean-mask selection of
# df_movie_summary, so build a new frame with assign() instead of setting the
# column on the selection (which triggers pandas' SettingWithCopy behaviour).
df_summary_found = df_summary_found.assign(imdb_id="")

In [31]:
df_movie_summary.loc[9]['m_title']

'Destination Meatball'

In [32]:
# Attach an IMDb id to every summary row whose (title, year) matches an IMDb movie.
#
# The IMDb table is indexed once as (lower-cased title, startYear) -> tconst
# rather than being rescanned (and re-lower-cased) for every summary row.
# Rows are visited in file order and setdefault() keeps the first hit, so a
# summary matching several IMDb rows receives the same id as taking the first
# element of a per-row search would give.
imdb_id_by_title_year = {}
imdb_rows = zip(
    df_imdb_movie_filtered['originalTitle'].str.lower(),
    df_imdb_movie_filtered['primaryTitle'].str.lower(),
    df_imdb_movie_filtered['startYear'],
    df_imdb_movie_filtered['tconst'],
)
for original_title, primary_title, start_year, tconst in imdb_rows:
    # Either IMDb title may identify the movie; missing titles are not strings
    # and are skipped.
    for imdb_title in (original_title, primary_title):
        if isinstance(imdb_title, str):
            imdb_id_by_title_year.setdefault((imdb_title, start_year), tconst)

# Iterate over the frame itself instead of a hard-coded row count. Rows with a
# missing title or year are left without an id (previously a missing title was
# stringified to 'nan' and could match a movie literally titled "Nan").
df_movie_summary['imdb_id'] = [
    np.nan
    if pd.isna(summary_title) or pd.isna(summary_year)
    else imdb_id_by_title_year.get((str(summary_title).lower(), str(summary_year)), np.nan)
    for summary_title, summary_year in zip(df_movie_summary['m_title'], df_movie_summary['year'])
]

In [None]:
## Save movies summary with IMDB id
# Writes df_movie_summary to 'movie_summary_with_imdb_id.csv' in the outputs directory.
# NOTE(review): the recorded run of this cell failed with a NameError because
# write_file was not defined in the kernel - run the cell that defines it first.
write_file(df_movie_summary,'movie_summary_with_imdb_id.csv')

NameError: name 'write_file' is not defined