# Movie Information ChatBot

## Set-up environment

In [1]:
import os
import pandas as pd

In [2]:
import langdetect
from langdetect import DetectorFactory, detect, detect_langs

In [4]:
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

In [5]:
# Embedding facilities
from langchain.embeddings import HuggingFaceEmbeddings
# Pipelines
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

## Data Collection

In [6]:
tsv_file_paths = ['Data/name.basics.tsv', 'Data/title.basics.tsv', 'Data/title.ratings.tsv']

In [7]:
dfs = []
for file_path in tsv_file_paths:
    df = pd.read_csv(file_path, sep='\t')
    dfs.append(df)

  df = pd.read_csv(file_path, sep='\t')


In [8]:
df1 = dfs[0]
df2 = dfs[1]
df3 = dfs[2]

In [9]:
df4 = pd.read_json('Data/IMDB_reviews.json', lines=True)
df5 = pd.read_json('Data/IMDB_movie_details.json', lines=True)

In [10]:
df1.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0053137,tt0027125"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0117057,tt0038355"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0083922,tt0050976,tt0069467"


In [12]:
df1 = df1.assign(knownForTitles=df1['knownForTitles'].str.split(',')).explode('knownForTitles')
df1.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0072308
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0050419
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0053137
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0027125
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage",tt0037382


In [18]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22192074 entries, 0 to 13510203
Data columns (total 6 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   nconst             object
 1   primaryName        object
 2   birthYear          object
 3   deathYear          object
 4   primaryProfession  object
 5   knownForTitles     object
dtypes: object(6)
memory usage: 1.2+ GB


In [13]:
df1.replace('\\N', pd.NA, inplace=True)

In [15]:
df1 = df1.dropna(subset=['knownForTitles'])

In [16]:
df1.isnull().sum()

nconst                      0
primaryName                 8
birthYear            20180253
deathYear            21488173
primaryProfession     1850675
knownForTitles              0
dtype: int64

In [20]:
df1 = df1.drop(columns=['nconst'])
df1.head()

Unnamed: 0,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0072308
0,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0050419
0,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0053137
0,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0027125
1,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage",tt0037382


In [17]:
df2.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [23]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10790736 entries, 0 to 10790735
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 740.9+ MB


In [21]:
df2['isAdult'] = df2['isAdult'].replace({0: False, 1: True})

In [22]:
df2.replace('\\N', pd.NA, inplace=True)

In [24]:
df2.isnull().sum()

tconst                   0
titleType                0
primaryTitle            19
originalTitle           19
isAdult                  1
startYear          1406804
endYear           10666898
runtimeMinutes     7443113
genres              480417
dtype: int64

In [25]:
df3.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2058
1,tt0000002,5.7,276
2,tt0000003,6.5,2015
3,tt0000004,5.4,179
4,tt0000005,6.2,2784


In [30]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1440380 entries, 0 to 1440379
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1440380 non-null  object 
 1   averageRating  1440380 non-null  float64
 2   numVotes       1440380 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 33.0+ MB


In [31]:
df3.replace('\\N', pd.NA, inplace=True)

In [32]:
df3.isnull().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [27]:
merging_df = pd.merge(df1, df2, left_on='knownForTitles', right_on='tconst')
merging_df = merging_df.drop(columns=['knownForTitles'])
merging_df

Unnamed: 0,primaryName,birthYear,deathYear,primaryProfession,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0072308,movie,The Towering Inferno,The Towering Inferno,False,1974,,165,"Action,Drama,Thriller"
1,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0050419,movie,Funny Face,Funny Face,False,1957,,103,"Comedy,Musical,Romance"
2,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0053137,movie,On the Beach,On the Beach,False,1959,,134,"Drama,Romance,Sci-Fi"
3,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0027125,movie,Top Hat,Top Hat,False,1935,,101,"Comedy,Musical,Romance"
4,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage",tt0037382,movie,To Have and Have Not,To Have and Have Not,False,1944,,100,"Adventure,Comedy,Film-Noir"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22192036,Romeo del Rosario,,,"animation_department,art_department",tt11657662,movie,The Witcher: Nightmare of the Wolf,The Witcher: Nightmare of the Wolf,False,2021,,83,"Action,Adventure,Animation"
22192037,Romeo del Rosario,,,"animation_department,art_department",tt14069590,tvSeries,Dota: Dragon's Blood,Dota: Dragon's Blood,False,2021,2022,25,"Action,Adventure,Animation"
22192038,Romeo del Rosario,,,"animation_department,art_department",tt2455546,tvSeries,Avengers Assemble,Avengers Assemble,False,2012,2019,23,"Action,Adventure,Animation"
22192039,Harikrishnan Rajan,,,cinematographer,tt8736744,short,Mirage The Last Men,Mirage The Last Men,False,2018,,,"Drama,Short"


In [28]:
final_df = pd.merge(merging_df, df3, on='tconst')

In [29]:
final_df.head()

Unnamed: 0,primaryName,birthYear,deathYear,primaryProfession,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0072308,movie,The Towering Inferno,The Towering Inferno,False,1974,,165,"Action,Drama,Thriller",7.0,48391
1,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0050419,movie,Funny Face,Funny Face,False,1957,,103,"Comedy,Musical,Romance",7.0,33119
2,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0053137,movie,On the Beach,On the Beach,False,1959,,134,"Drama,Romance,Sci-Fi",7.1,14451
3,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0027125,movie,Top Hat,Top Hat,False,1935,,101,"Comedy,Musical,Romance",7.7,20748
4,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage",tt0037382,movie,To Have and Have Not,To Have and Have Not,False,1944,,100,"Adventure,Comedy,Film-Noir",7.8,38063


In [38]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14144467 entries, 0 to 14144466
Data columns (total 15 columns):
 #   Column             Dtype  
---  ------             -----  
 0   primaryName        object 
 1   birthYear          object 
 2   deathYear          object 
 3   primaryProfession  object 
 4   tconst             object 
 5   titleType          object 
 6   primaryTitle       object 
 7   originalTitle      object 
 8   isAdult            object 
 9   startYear          object 
 10  endYear            object 
 11  runtimeMinutes     object 
 12  genres             object 
 13  averageRating      float64
 14  numVotes           int64  
dtypes: float64(1), int64(1), object(13)
memory usage: 1.6+ GB


In [39]:
final_df.isnull().sum()

primaryName                 0
birthYear            12575731
deathYear            13608432
primaryProfession     1155151
tconst                      0
titleType                   0
primaryTitle                0
originalTitle               0
isAdult                     0
startYear                1279
endYear              11806724
runtimeMinutes        2334748
genres                 156925
averageRating               0
numVotes                    0
dtype: int64

In [33]:
df4.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"


In [35]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 573913 entries, 0 to 573912
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   review_date     573913 non-null  object
 1   movie_id        573913 non-null  object
 2   user_id         573913 non-null  object
 3   is_spoiler      573913 non-null  bool  
 4   review_text     573913 non-null  object
 5   rating          573913 non-null  int64 
 6   review_summary  573913 non-null  object
dtypes: bool(1), int64(1), object(5)
memory usage: 31.2+ MB


In [42]:
df4.rename(columns={'rating': 'review_rating'}, inplace=True)

In [36]:
df5.head()

Unnamed: 0,movie_id,plot_summary,duration,genre,rating,release_date,plot_synopsis
0,tt0105112,"Former CIA analyst, Jack Ryan is in England wi...",1h 57min,"[Action, Thriller]",6.9,1992-06-05,"Jack Ryan (Ford) is on a ""working vacation"" in..."
1,tt1204975,"Billy (Michael Douglas), Paddy (Robert De Niro...",1h 45min,[Comedy],6.6,2013-11-01,Four boys around the age of 10 are friends in ...
2,tt0243655,"The setting is Camp Firewood, the year 1981. I...",1h 37min,"[Comedy, Romance]",6.7,2002-04-11,
3,tt0040897,"Fred C. Dobbs and Bob Curtin, both down on the...",2h 6min,"[Adventure, Drama, Western]",8.3,1948-01-24,Fred Dobbs (Humphrey Bogart) and Bob Curtin (T...
4,tt0126886,Tracy Flick is running unopposed for this year...,1h 43min,"[Comedy, Drama, Romance]",7.3,1999-05-07,Jim McAllister (Matthew Broderick) is a much-a...


In [37]:
df5.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1572 entries, 0 to 1571
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   movie_id       1572 non-null   object 
 1   plot_summary   1572 non-null   object 
 2   duration       1572 non-null   object 
 3   genre          1572 non-null   object 
 4   rating         1572 non-null   float64
 5   release_date   1572 non-null   object 
 6   plot_synopsis  1572 non-null   object 
dtypes: float64(1), object(6)
memory usage: 98.2+ KB


In [43]:
df5.rename(columns={'rating': 'overall_rating'}, inplace=True)

In [44]:
reviews_df = pd.merge(df4, df5, on='movie_id')
reviews_df.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,review_rating,review_summary,plot_summary,duration,genre,overall_rating,release_date,plot_synopsis
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.,Chronicles the experiences of a formerly succe...,2h 22min,"[Crime, Drama]",9.3,1994-10-14,"In 1947, Andy Dufresne (Tim Robbins), a banker..."
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.,Chronicles the experiences of a formerly succe...,2h 22min,"[Crime, Drama]",9.3,1994-10-14,"In 1947, Andy Dufresne (Tim Robbins), a banker..."
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film,Chronicles the experiences of a formerly succe...,2h 22min,"[Crime, Drama]",9.3,1994-10-14,"In 1947, Andy Dufresne (Tim Robbins), a banker..."
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?,Chronicles the experiences of a formerly succe...,2h 22min,"[Crime, Drama]",9.3,1994-10-14,"In 1947, Andy Dufresne (Tim Robbins), a banker..."
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted",Chronicles the experiences of a formerly succe...,2h 22min,"[Crime, Drama]",9.3,1994-10-14,"In 1947, Andy Dufresne (Tim Robbins), a banker..."


In [46]:
reviews_df['genres'] = reviews_df['genre'].apply(lambda x: ', '.join(x))
reviews_df = reviews_df.drop(columns=['genre'])
reviews_df.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,review_rating,review_summary,plot_summary,duration,overall_rating,release_date,plot_synopsis,genres
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.,Chronicles the experiences of a formerly succe...,2h 22min,9.3,1994-10-14,"In 1947, Andy Dufresne (Tim Robbins), a banker...","Crime, Drama"
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.,Chronicles the experiences of a formerly succe...,2h 22min,9.3,1994-10-14,"In 1947, Andy Dufresne (Tim Robbins), a banker...","Crime, Drama"
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film,Chronicles the experiences of a formerly succe...,2h 22min,9.3,1994-10-14,"In 1947, Andy Dufresne (Tim Robbins), a banker...","Crime, Drama"
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?,Chronicles the experiences of a formerly succe...,2h 22min,9.3,1994-10-14,"In 1947, Andy Dufresne (Tim Robbins), a banker...","Crime, Drama"
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted",Chronicles the experiences of a formerly succe...,2h 22min,9.3,1994-10-14,"In 1947, Andy Dufresne (Tim Robbins), a banker...","Crime, Drama"


In [50]:
reviews_df.rename(columns={'movie_id': 'title_id'}, inplace=True)
reviews_df.head()

Unnamed: 0,review_date,title_id,user_id,is_spoiler,review_text,review_rating,review_summary,plot_summary,duration,overall_rating,release_date,plot_synopsis,genres
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.,Chronicles the experiences of a formerly succe...,2h 22min,9.3,1994-10-14,"In 1947, Andy Dufresne (Tim Robbins), a banker...","Crime, Drama"
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.,Chronicles the experiences of a formerly succe...,2h 22min,9.3,1994-10-14,"In 1947, Andy Dufresne (Tim Robbins), a banker...","Crime, Drama"
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film,Chronicles the experiences of a formerly succe...,2h 22min,9.3,1994-10-14,"In 1947, Andy Dufresne (Tim Robbins), a banker...","Crime, Drama"
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?,Chronicles the experiences of a formerly succe...,2h 22min,9.3,1994-10-14,"In 1947, Andy Dufresne (Tim Robbins), a banker...","Crime, Drama"
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted",Chronicles the experiences of a formerly succe...,2h 22min,9.3,1994-10-14,"In 1947, Andy Dufresne (Tim Robbins), a banker...","Crime, Drama"


In [49]:
final_df.rename(columns={'movie_id': 'title_id'}, inplace=True)
final_df

Unnamed: 0,primaryName,birthYear,deathYear,primaryProfession,title_id,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0072308,movie,The Towering Inferno,The Towering Inferno,False,1974,,165,"Action,Drama,Thriller",7.0,48391
1,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0050419,movie,Funny Face,Funny Face,False,1957,,103,"Comedy,Musical,Romance",7.0,33119
2,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0053137,movie,On the Beach,On the Beach,False,1959,,134,"Drama,Romance,Sci-Fi",7.1,14451
3,Fred Astaire,1899,1987,"actor,miscellaneous,producer",tt0027125,movie,Top Hat,Top Hat,False,1935,,101,"Comedy,Musical,Romance",7.7,20748
4,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage",tt0037382,movie,To Have and Have Not,To Have and Have Not,False,1944,,100,"Adventure,Comedy,Film-Noir",7.8,38063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14144462,Sambit Mishra,,,"writer,producer",tt10709066,tvSeries,Tere Liye Bro,Tere Liye Bro,False,2017,2018,22,Drama,8.4,18
14144463,Sambit Mishra,,,"writer,producer",tt15134202,tvSeries,Cartel,Cartel,False,2021,2021,,"Action,Drama",8.0,4272
14144464,Romeo del Rosario,,,"animation_department,art_department",tt11657662,movie,The Witcher: Nightmare of the Wolf,The Witcher: Nightmare of the Wolf,False,2021,,83,"Action,Adventure,Animation",7.2,50417
14144465,Romeo del Rosario,,,"animation_department,art_department",tt14069590,tvSeries,Dota: Dragon's Blood,Dota: Dragon's Blood,False,2021,2022,25,"Action,Adventure,Animation",7.7,21791


In [51]:
movie_df = pd.merge(reviews_df, final_df, on=['title_id', 'genres'])
movie_df

Unnamed: 0,review_date,title_id,user_id,is_spoiler,review_text,review_rating,review_summary,plot_summary,duration,overall_rating,...,primaryProfession,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,averageRating,numVotes
0,11 June 2001,tt0137523,ur0282760,True,Please.This is a dismal movie. I'm going to s...,1,The comparison to Clockwork Orange makes me puke,A nameless first person narrator (Edward Norto...,2h 19min,8.8,...,"producer,actor,executive",movie,Fight Club,Fight Club,False,1999,,139,8.8,2331118
1,11 June 2001,tt0137523,ur0282760,True,Please.This is a dismal movie. I'm going to s...,1,The comparison to Clockwork Orange makes me puke,A nameless first person narrator (Edward Norto...,2h 19min,8.8,...,"actress,director,producer",movie,Fight Club,Fight Club,False,1999,,139,8.8,2331118
2,11 June 2001,tt0137523,ur0282760,True,Please.This is a dismal movie. I'm going to s...,1,The comparison to Clockwork Orange makes me puke,A nameless first person narrator (Edward Norto...,2h 19min,8.8,...,"music_artist,actor,producer",movie,Fight Club,Fight Club,False,1999,,139,8.8,2331118
3,11 June 2001,tt0137523,ur0282760,True,Please.This is a dismal movie. I'm going to s...,1,The comparison to Clockwork Orange makes me puke,A nameless first person narrator (Edward Norto...,2h 19min,8.8,...,"actor,producer,writer",movie,Fight Club,Fight Club,False,1999,,139,8.8,2331118
4,11 June 2001,tt0137523,ur0282760,True,Please.This is a dismal movie. I'm going to s...,1,The comparison to Clockwork Orange makes me puke,A nameless first person narrator (Edward Norto...,2h 19min,8.8,...,"visual_effects,animation_department,art_depart...",movie,Fight Club,Fight Club,False,1999,,139,8.8,2331118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4747582,29 December 2013,tt0201265,ur0311809,False,It's a well known fact that Christmas party's ...,8,Modern society masked as Christmas catastrophe,In what has to be one of the worst ideas in Ch...,1h 35min,6.8,...,miscellaneous,movie,In Bed with Santa,Tomten är far till alla barnen,False,1999,,95,6.9,8673
4747583,29 December 2013,tt0201265,ur0311809,False,It's a well known fact that Christmas party's ...,8,Modern society masked as Christmas catastrophe,In what has to be one of the worst ideas in Ch...,1h 35min,6.8,...,actress,movie,In Bed with Santa,Tomten är far till alla barnen,False,1999,,95,6.9,8673
4747584,29 December 2013,tt0201265,ur0311809,False,It's a well known fact that Christmas party's ...,8,Modern society masked as Christmas catastrophe,In what has to be one of the worst ideas in Ch...,1h 35min,6.8,...,actor,movie,In Bed with Santa,Tomten är far till alla barnen,False,1999,,95,6.9,8673
4747585,29 December 2013,tt0201265,ur0311809,False,It's a well known fact that Christmas party's ...,8,Modern society masked as Christmas catastrophe,In what has to be one of the worst ideas in Ch...,1h 35min,6.8,...,actor,movie,In Bed with Santa,Tomten är far till alla barnen,False,1999,,95,6.9,8673


In [52]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4747587 entries, 0 to 4747586
Data columns (total 26 columns):
 #   Column             Dtype  
---  ------             -----  
 0   review_date        object 
 1   title_id           object 
 2   user_id            object 
 3   is_spoiler         bool   
 4   review_text        object 
 5   review_rating      int64  
 6   review_summary     object 
 7   plot_summary       object 
 8   duration           object 
 9   overall_rating     float64
 10  release_date       object 
 11  plot_synopsis      object 
 12  genres             object 
 13  primaryName        object 
 14  birthYear          object 
 15  deathYear          object 
 16  primaryProfession  object 
 17  titleType          object 
 18  primaryTitle       object 
 19  originalTitle      object 
 20  isAdult            object 
 21  startYear          object 
 22  endYear            object 
 23  runtimeMinutes     object 
 24  averageRating      float64
 25  numVotes          

In [53]:
movie_df.isnull().sum()

review_date                0
title_id                   0
user_id                    0
is_spoiler                 0
review_text                0
review_rating              0
review_summary             0
plot_summary               0
duration                   0
overall_rating             0
release_date               0
plot_synopsis              0
genres                     0
primaryName                0
birthYear            3712112
deathYear            4506705
primaryProfession        420
titleType                  0
primaryTitle               0
originalTitle              0
isAdult                    0
startYear                  0
endYear              4747587
runtimeMinutes             0
averageRating              0
numVotes                   0
dtype: int64

## Chunking

In [None]:
!pip3 uninstall torch

In [64]:
import utils
from utils import chunkDocs, langDetect, wordCloud

OSError: [WinError 126] The specified module could not be found. Error loading "C:\Users\Caroline\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\torch\lib\shm.dll" or one of its dependencies.