# Import Dependencies and CSV Files

In [1]:
import pandas as pd
import numpy as np
import re
import time

from pathlib import Path
from collections import Counter

In [2]:
# Location of each raw CSV input, relative to the notebook's working directory.
credits, links, movies = "Data/credits.csv", "Data/links.csv", "Data/movies.csv"
metadata, ratings = "Data/movies_metadata.csv", "Data/ratings.csv"

# Load the five files in the same order as the paths above. `low_memory=False`
# is passed to every read so pandas does not parse the file in chunks.
credit_df, link_df, movie_df, meta_df, ratings_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (credits, links, movies, metadata, ratings)
)

In [3]:
# Preview the first rows of the raw movies table (movieId, title, genres).
movie_df.head(20)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


# Split Movie Title and Year, Create Separate Columns

In [4]:
# Capture the title text (group 0) and a trailing " (YYYY)" release year
# (group 1). The pattern is a raw string so that `\(`, `\d`, `\s` and `\w`
# reach the regex engine untouched instead of being treated as (invalid)
# Python string escapes.
title_year_pattern = r'(.*\w*)(?:\s\((\d{4})\))'
title_parts = movie_df["title"].str.extract(title_year_pattern, expand=True)

# Rows whose title carries no "(YYYY)" suffix do not match the pattern at all,
# which would leave Title as NaN. Fall back to the original (stripped) title
# for those rows so the movie name is never lost; Year stays NaN for them.
movie_df["Title"] = title_parts[0].fillna(movie_df["title"].str.strip())
movie_df["Year"] = title_parts[1]
movie_df.head(20)

Unnamed: 0,movieId,title,genres,Title,Year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995
5,6,Heat (1995),Action|Crime|Thriller,Heat,1995
6,7,Sabrina (1995),Comedy|Romance,Sabrina,1995
7,8,Tom and Huck (1995),Adventure|Children,Tom and Huck,1995
8,9,Sudden Death (1995),Action,Sudden Death,1995
9,10,GoldenEye (1995),Action|Adventure|Thriller,GoldenEye,1995


# Separate Genres Into Lists

In [5]:
# movie_df.reset_index

movies_df = movie_df.set_index("movieId")


movies_df["movieId"] = movie_df["movieId"]

movies_df.head()
# (Genres[0].unique(),
#  Genres[1].unique(),
#  Genres[2].unique(),
#  Genres[3].unique(),
#  Genres[4].unique(),
#  Genres[5].unique(),
#  Genres[6].unique(),
#  Genres[7].unique(),
#  Genres[8].unique(),
#  Genres[9].unique())
# movies_df = movies_df.loc[:, ['Title', 'Year', 'genres']]

Unnamed: 0_level_0,title,genres,Title,Year,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,2.0
2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995,3.0
3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995,4.0
4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995,5.0
5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995,6.0


In [6]:
# Turn the pipe-delimited genre string into a list of genre names and drop
# the raw "title" column in a single chained expression.
movies_df = movies_df.assign(
    genres=movies_df["genres"].str.split('|', expand=False)
).drop(columns="title")

# Plain Python list holding one genre list per movie, in row order.
Genres = movies_df["genres"].to_list()

# Constructing Function To Split Genre List Into Separate Columns With Boolean Entries

In [7]:
# movies_df["Adventure"] = ""
# movies_df["Action"] = ""
# movies_df["Animation"] = ""
# movies_df["Children"] = ""
# movies_df["Comedy"] = ""
# movies_df["Crime"] = ""
# movies_df["Documentary"] = ""
# movies_df["Drama"] = ""
# movies_df["Fantasy"] = []
# movies_df["Film-Noir"] = []
# movies_df["Horror"] = []
# movies_df["IMAX"] = []
# movies_df["Musical"] = []
# movies_df["Mystery"] = []
# movies_df["Romance"] = []
# movies_df["Sci-Fi"] = []
# movies_df["Thriller"] = []
# movies_df["War"] = []
# movies_df["Western"] = []
# movies_df["No Listed Genre"] = []

In [8]:
# One indicator column per genre, plus a catch-all for movies without a
# recognised genre.
genre_columns = [
    "Adventure", "Action", "Animation", "Children", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "IMAX",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
    "No Listed Genre",
]

# Create the indicator frame already filled with integer zeros, one row per
# movie. Building it empty and calling fillna(0) afterwards produces object
# columns that only become integers through fillna's implicit downcasting
# (deprecated in recent pandas), and would also replace a missing "Genres"
# entry with 0.
Genres_df = pd.DataFrame(0, index=movies_df.index, columns=genre_columns)

# Keep the original genre lists next to the indicators for reference.
Genres_df["Genres"] = movies_df["genres"]
Genres_df.head()

Unnamed: 0_level_0,Adventure,Action,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,No Listed Genre,Genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[Adventure, Animation, Children, Comedy, Fantasy]"
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[Adventure, Children, Fantasy]"
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[Comedy, Romance]"
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[Comedy, Drama, Romance]"
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[Comedy]


# Function: Flag Each Movie's Genres In The Indicator Columns

In [9]:
def genre_split(column, df_2):
    """Flag each movie's genres in the 0/1 indicator columns of ``df_2``.

    Rows are matched by index label: for every entry of ``column["genres"]``
    the row of ``df_2`` carrying the same label is updated. A running row
    counter is deliberately not used, because movie ids are not guaranteed
    to be contiguous and a counter would write flags into the wrong rows.

    Parameters
    ----------
    column : pandas.DataFrame
        Frame with a ``"genres"`` column whose entries are lists of genre
        names. A pipe-delimited string is also accepted and split on ``"|"``.
    df_2 : pandas.DataFrame
        Indicator frame sharing ``column``'s index labels and containing one
        column per known genre plus ``"No Listed Genre"``. It is modified in
        place.

    Returns
    -------
    pandas.DataFrame
        ``df_2.head()``, i.e. a preview of the updated frame. The full result
        is available through ``df_2`` itself.

    Notes
    -----
    * ``"(no genres listed)"``, any unrecognised genre name, and a missing
      (non-list) genres entry all set ``"No Listed Genre"``.
    * Labels that do not exist in ``df_2`` are skipped so that no new rows
      are silently appended to it.
    """
    known_genres = frozenset((
        "Adventure", "Action", "Animation", "Children", "Comedy", "Crime",
        "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "IMAX",
        "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War",
        "Western",
    ))
    fallback = "No Listed Genre"

    for movie_id, genre_list in column["genres"].items():
        if movie_id not in df_2.index:
            continue

        if isinstance(genre_list, str):
            # Tolerate a raw, not-yet-split "A|B|C" genre string.
            genre_list = genre_list.split("|")
        elif not isinstance(genre_list, (list, tuple)):
            # NaN / None: the movie has no usable genre information.
            df_2.at[movie_id, fallback] = 1
            continue

        for genre in genre_list:
            target = genre if genre in known_genres else fallback
            df_2.at[movie_id, target] = 1

    return df_2.head()

In [10]:
# Fill the indicator columns of Genres_df in place. The value displayed below
# is only the head() preview that genre_split returns.
genre_split(movies_df, Genres_df)

Unnamed: 0_level_0,Adventure,Action,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,No Listed Genre,Genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,"[Adventure, Animation, Children, Comedy, Fantasy]"
2,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,"[Adventure, Children, Fantasy]"
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,"[Comedy, Romance]"
4,0,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,"[Comedy, Drama, Romance]"
5,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[Comedy]


In [11]:
# Inspect more rows of the updated indicator frame than the preview above.
Genres_df.head(20)

Unnamed: 0_level_0,Adventure,Action,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,No Listed Genre,Genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,"[Adventure, Animation, Children, Comedy, Fantasy]"
2,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,"[Adventure, Children, Fantasy]"
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,"[Comedy, Romance]"
4,0,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,"[Comedy, Drama, Romance]"
5,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[Comedy]
6,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,"[Action, Crime, Thriller]"
7,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,"[Comedy, Romance]"
8,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[Adventure, Children]"
9,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[Action]
10,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,"[Action, Adventure, Thriller]"


In [12]:
# Attach the genre indicator columns to the movie details. Both frames are
# indexed by movieId (Genres_df was created with movies_df's index), and
# how="right" keeps every row of Genres_df. The result carries both the
# lower-case "genres" column from movies_df and the "Genres" column from
# Genres_df.
film_df = movies_df.join(Genres_df, how="right")
film_df.head()

Unnamed: 0_level_0,genres,Title,Year,movieId,Adventure,Action,Animation,Children,Comedy,Crime,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,No Listed Genre,Genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,"[Adventure, Animation, Children, Comedy, Fantasy]",Toy Story,1995,2.0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,"[Adventure, Animation, Children, Comedy, Fantasy]"
2,"[Adventure, Children, Fantasy]",Jumanji,1995,3.0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,"[Adventure, Children, Fantasy]"
3,"[Comedy, Romance]",Grumpier Old Men,1995,4.0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,"[Comedy, Romance]"
4,"[Comedy, Drama, Romance]",Waiting to Exhale,1995,5.0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,"[Comedy, Drama, Romance]"
5,[Comedy],Father of the Bride Part II,1995,6.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,[Comedy]


# Reorder Columns In Final DataFrame

In [13]:
# Columns kept in the final table, in output order: the cleaned title and
# year first, followed by one indicator column per genre.
final_columns = [
    'Title', 'Year',
    'Adventure', 'Action', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    'No Listed Genre',
]

# Note: this rebinds movie_df (previously the raw movies table) to the
# cleaned result; the list-valued genre columns and movieId column are left out.
movie_df = film_df[final_columns]
movie_df.head(20)

Unnamed: 0_level_0,Title,Year,Adventure,Action,Animation,Children,Comedy,Crime,Documentary,Drama,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,No Listed Genre
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story,1995,1,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Jumanji,1995,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Grumpier Old Men,1995,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,Waiting to Exhale,1995,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Heat,1995,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
7,Sabrina,1995,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,Tom and Huck,1995,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Sudden Death,1995,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,GoldenEye,1995,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


# Write DataFrame To CSV File

In [14]:
# Persist the cleaned table. The index is written too (to_csv's default), so
# movieId becomes the first column of the file.
movie_df.to_csv("Data/movies_clean.csv")