In [15]:
# importing pandas
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import numpy as np

import findspark
findspark.init()
from pyspark.sql import SparkSession
import time



In [16]:
# Read in the CSV file as a Pandas DataFrame

movieTitles_df = pd.read_csv(
    Path("Data/titles.csv")
)

movieTitles_df


Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,['documentation'],['US'],1.0,,,,0.600,
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],,tt0075314,8.2,808582.0,40.965,8.179
2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],,tt0068473,7.7,107673.0,10.010,7.300
3,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],,tt0071853,8.2,534486.0,15.461,7.811
4,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"['war', 'action']","['GB', 'US']",,tt0061578,7.7,72662.0,20.398,7.600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5845,tm1014599,Fine Wine,MOVIE,A beautiful love story that can happen between...,2021,,100,"['romance', 'drama']",['NG'],,tt13857480,6.8,45.0,1.466,
5846,tm898842,C/O Kaadhal,MOVIE,A heart warming film that explores the concept...,2021,,134,['drama'],[],,tt11803618,7.7,348.0,,
5847,tm1059008,Lokillo,MOVIE,A controversial TV host and comedian who has b...,2021,,90,['comedy'],['CO'],,tt14585902,3.8,68.0,26.005,6.300
5848,tm1035612,Dad Stop Embarrassing Me - The Afterparty,MOVIE,"Jamie Foxx, David Alan Grier and more from the...",2021,PG-13,37,[],['US'],,,,,1.296,10.000


In [17]:
movieTitles_df.columns

Index(['id', 'title', 'type', 'description', 'release_year',
       'age_certification', 'runtime', 'genres', 'production_countries',
       'seasons', 'imdb_id', 'imdb_score', 'imdb_votes', 'tmdb_popularity',
       'tmdb_score'],
      dtype='object')

In [18]:
# Drop columns not needed for this project "seasons", "tmdb_popularity", "tmdb_score"
movieTitles_df = movieTitles_df.drop(columns=["seasons", "tmdb_popularity", "tmdb_score"])
movieTitles_df

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,imdb_id,imdb_score,imdb_votes
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,['documentation'],['US'],,,
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],tt0075314,8.2,808582.0
2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],tt0068473,7.7,107673.0
3,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],tt0071853,8.2,534486.0
4,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"['war', 'action']","['GB', 'US']",tt0061578,7.7,72662.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5845,tm1014599,Fine Wine,MOVIE,A beautiful love story that can happen between...,2021,,100,"['romance', 'drama']",['NG'],tt13857480,6.8,45.0
5846,tm898842,C/O Kaadhal,MOVIE,A heart warming film that explores the concept...,2021,,134,['drama'],[],tt11803618,7.7,348.0
5847,tm1059008,Lokillo,MOVIE,A controversial TV host and comedian who has b...,2021,,90,['comedy'],['CO'],tt14585902,3.8,68.0
5848,tm1035612,Dad Stop Embarrassing Me - The Afterparty,MOVIE,"Jamie Foxx, David Alan Grier and more from the...",2021,PG-13,37,[],['US'],,,


In [20]:
# Create a SparkSession
spark = SparkSession.builder\
    .appName("panda to spark")\
    .config("spark.sql.debug.maxToStringFields", 2000)\
    .config("spark.driver.memory", "2g")\
    .getOrCreate()

In [21]:
# Copy Pandas dataframe to sparks
movieTitles_spdf = spark.createDataFrame(movieTitles_df)
movieTitles_spdf.show()


+--------+--------------------+-----+--------------------+------------+-----------------+-------+--------------------+--------------------+---------+----------+----------+
|      id|               title| type|         description|release_year|age_certification|runtime|              genres|production_countries|  imdb_id|imdb_score|imdb_votes|
+--------+--------------------+-----+--------------------+------------+-----------------+-------+--------------------+--------------------+---------+----------+----------+
|ts300399|Five Came Back: T...| SHOW|This collection i...|        1945|            TV-MA|     51|   ['documentation']|              ['US']|      NaN|       NaN|       NaN|
| tm84618|         Taxi Driver|MOVIE|A mentally unstab...|        1976|                R|    114|  ['drama', 'crime']|              ['US']|tt0075314|       8.2|  808582.0|
|tm154986|         Deliverance|MOVIE|Intent on seeing ...|        1972|                R|    109|['drama', 'action...|              ['US']|t

In [22]:
# Create a temporary view of the DataFrame.
movieTitles_spdf.createOrReplaceTempView('titles_data')
movieTitles_spdf.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- type: string (nullable = true)
 |-- description: string (nullable = true)
 |-- release_year: long (nullable = true)
 |-- age_certification: string (nullable = true)
 |-- runtime: long (nullable = true)
 |-- genres: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- imdb_score: double (nullable = true)
 |-- imdb_votes: double (nullable = true)



In [52]:
# select just the movies
start_time = time.time()
movies_only_spdf = spark.sql("""
SELECT td.*
FROM titles_data     td
WHERE type == 'MOVIE'
""")
movies_only_spdf.show()

+--------+--------------------+-----+--------------------+------------+-----------------+-------+--------------------+--------------------+---------+----------+----------+
|      id|               title| type|         description|release_year|age_certification|runtime|              genres|production_countries|  imdb_id|imdb_score|imdb_votes|
+--------+--------------------+-----+--------------------+------------+-----------------+-------+--------------------+--------------------+---------+----------+----------+
| tm84618|         Taxi Driver|MOVIE|A mentally unstab...|        1976|                R|    114|  ['drama', 'crime']|              ['US']|tt0075314|       8.2|  808582.0|
|tm154986|         Deliverance|MOVIE|Intent on seeing ...|        1972|                R|    109|['drama', 'action...|              ['US']|tt0068473|       7.7|  107673.0|
|tm127384|Monty Python and ...|MOVIE|King Arthur, acco...|        1975|               PG|     91|['fantasy', 'acti...|              ['GB']|t

In [50]:
# convert the Spars Data frame back to pandas
movies_only_df = movies_only_spdf.toPandas()


In [51]:
 # Look at APPLICATION_TYPE value counts for binning
# imdb_cnt = movies_only_df['imdb_score'].value_counts()
# imdb_cnt
null_count = movies_only_df['imdb_score'].isna().sum()

print("Number of NaN values in 'imdb_score' column:", null_count)

Number of NaN values in 'imdb_score' column: 315


In [67]:
# remove [ ] around genres
def remove_brackets(text):
    return text.replace('[', '').replace(']', '').replace("'", "").replace(" ", "")

# Apply the function to the column containing the brackets
movies_only_df['genres'] = movies_only_df['genres'].apply(remove_brackets)
movies_only_df['production_countries'] = movies_only_df['production_countries'].apply(remove_brackets)
movies_only_df

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,imdb_id,imdb_score,imdb_votes
0,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"drama,crime",US,tt0075314,8.2,808582.0
1,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"drama,action,thriller,european",US,tt0068473,7.7,107673.0
2,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"fantasy,action,comedy",GB,tt0071853,8.2,534486.0
3,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"war,action","GB,US",tt0061578,7.7,72662.0
4,tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, bu...",1979,R,94,comedy,GB,tt0079470,8.0,395024.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3737,tm1066324,Super Monsters: Once Upon a Rhyme,MOVIE,The Super Monsters rethink exemplary fantasies...,2021,,25,"animation,family",,tt14586752,5.6,38.0
3738,tm1097142,My Bride,MOVIE,The story follows a young man and woman who go...,2021,,93,"romance,comedy,drama",EG,tt14216488,5.0,327.0
3740,tm1014599,Fine Wine,MOVIE,A beautiful love story that can happen between...,2021,,100,"romance,drama",NG,tt13857480,6.8,45.0
3741,tm898842,C/O Kaadhal,MOVIE,A heart warming film that explores the concept...,2021,,134,drama,,tt11803618,7.7,348.0


In [68]:
# Remove rows with null values in the 'imdb_score' column
movies_only_df = movies_only_df.dropna(subset=['imdb_score'])

# Show the resulting DataFrame
movies_only_df 

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,imdb_id,imdb_score,imdb_votes
0,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"drama,crime",US,tt0075314,8.2,808582.0
1,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"drama,action,thriller,european",US,tt0068473,7.7,107673.0
2,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"fantasy,action,comedy",GB,tt0071853,8.2,534486.0
3,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"war,action","GB,US",tt0061578,7.7,72662.0
4,tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, bu...",1979,R,94,comedy,GB,tt0079470,8.0,395024.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3737,tm1066324,Super Monsters: Once Upon a Rhyme,MOVIE,The Super Monsters rethink exemplary fantasies...,2021,,25,"animation,family",,tt14586752,5.6,38.0
3738,tm1097142,My Bride,MOVIE,The story follows a young man and woman who go...,2021,,93,"romance,comedy,drama",EG,tt14216488,5.0,327.0
3740,tm1014599,Fine Wine,MOVIE,A beautiful love story that can happen between...,2021,,100,"romance,drama",NG,tt13857480,6.8,45.0
3741,tm898842,C/O Kaadhal,MOVIE,A heart warming film that explores the concept...,2021,,134,drama,,tt11803618,7.7,348.0


In [69]:
# Convert categorical data to numeric with `pd.get_dummies`
# Split the 'genres' column based on ','
split_genres = movies_only_df['genres'].str.get_dummies(sep=',')

# Display the split DataFrame
display(split_genres)

Unnamed: 0,action,animation,comedy,crime,documentation,drama,european,family,fantasy,history,horror,music,reality,romance,scifi,sport,thriller,war,western
0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0
2,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3737,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3738,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
3740,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
3741,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [70]:
# Concatenate split genres DataFrame with original DataFrame and drop the original 'genres' column
film_titles_df = pd.concat([movies_only_df, split_genres], axis=1)
pd.set_option("display.max_columns", 50)
# Display the updated DataFrame
film_titles_df

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,imdb_id,imdb_score,imdb_votes,action,animation,comedy,crime,documentation,drama,european,family,fantasy,history,horror,music,reality,romance,scifi,sport,thriller,war,western
0,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"drama,crime",US,tt0075314,8.2,808582.0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"drama,action,thriller,european",US,tt0068473,7.7,107673.0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0
2,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"fantasy,action,comedy",GB,tt0071853,8.2,534486.0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"war,action","GB,US",tt0061578,7.7,72662.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, bu...",1979,R,94,comedy,GB,tt0079470,8.0,395024.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3737,tm1066324,Super Monsters: Once Upon a Rhyme,MOVIE,The Super Monsters rethink exemplary fantasies...,2021,,25,"animation,family",,tt14586752,5.6,38.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3738,tm1097142,My Bride,MOVIE,The story follows a young man and woman who go...,2021,,93,"romance,comedy,drama",EG,tt14216488,5.0,327.0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
3740,tm1014599,Fine Wine,MOVIE,A beautiful love story that can happen between...,2021,,100,"romance,drama",NG,tt13857480,6.8,45.0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
3741,tm898842,C/O Kaadhal,MOVIE,A heart warming film that explores the concept...,2021,,134,drama,,tt11803618,7.7,348.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [71]:
# Drop columns not needed for this project "genres"
film_titles_df = film_titles_df.drop(columns=["genres"])
film_titles_df

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,production_countries,imdb_id,imdb_score,imdb_votes,action,animation,comedy,crime,documentation,drama,european,family,fantasy,history,horror,music,reality,romance,scifi,sport,thriller,war,western
0,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,US,tt0075314,8.2,808582.0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,US,tt0068473,7.7,107673.0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0
2,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,GB,tt0071853,8.2,534486.0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"GB,US",tt0061578,7.7,72662.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, bu...",1979,R,94,GB,tt0079470,8.0,395024.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3737,tm1066324,Super Monsters: Once Upon a Rhyme,MOVIE,The Super Monsters rethink exemplary fantasies...,2021,,25,,tt14586752,5.6,38.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3738,tm1097142,My Bride,MOVIE,The story follows a young man and woman who go...,2021,,93,EG,tt14216488,5.0,327.0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
3740,tm1014599,Fine Wine,MOVIE,A beautiful love story that can happen between...,2021,,100,NG,tt13857480,6.8,45.0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
3741,tm898842,C/O Kaadhal,MOVIE,A heart warming film that explores the concept...,2021,,134,,tt11803618,7.7,348.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [72]:

film_titles_df.to_csv("Data/filmtitles.csv", index=False)