In [1]:
import ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Show every column when a DataFrame is rendered (None removes the column limit).
pd.options.display.max_columns = None

In [3]:
# Credits table: merged with the 2017 movies on `id` below and supplies the
# `cast` and `crew` columns. The path is relative, so it is resolved against the
# kernel's working directory; the recorded run of this cell failed with
# FileNotFoundError, so check that ../datasets/credits.csv exists before running.
credits_data = pd.read_csv("../datasets/credits.csv")

FileNotFoundError: [Errno 2] No such file or directory: '../datasets/credits.csv'

In [None]:
credits_data.head()

In [None]:
credits_data.shape

In [None]:
credits_data.dtypes

### The "movie_45k_data" dataset is missing the names of directors and cast members, so we use the "credits_data" dataset to fill them in.

In [None]:
# Movie metadata table; the cells below use its `release_date`, `genres`, `id`
# and `title` columns. The path is relative to the kernel's working directory.
movie_45k_data = pd.read_csv("../datasets/45k_movies_metadata.csv")

In [None]:
movie_45k_data.head()

In [None]:
movie_45k_data.shape

In [None]:
movie_45k_data.dtypes

#### We want to add movies that will supplement our first preprocessing step - from 2017 - 2021

In [None]:
movie_45k_data[movie_45k_data["release_date"].isna()]

In [None]:
# errors="coerce" turns release dates that cannot be parsed into NaT instead of
# raising, so malformed rows survive this step with a missing date.
movie_45k_data["release_date"] = pd.to_datetime(movie_45k_data["release_date"], errors="coerce")

In [None]:
movie_45k_data["year"] = movie_45k_data["release_date"].dt.year

In [None]:
movie_45k_data["year"].value_counts().sort_index()

#### To further supplement our dataset from prepro_1, we'll be taking only movies from 2017

In [None]:
features = ["genres", "id", "title", "year"]

In [None]:
# Keep only the rows released in 2017, restricted to the selected feature columns.
# Rows whose year is missing compare as False and are excluded.
movies_2017 = movie_45k_data.loc[movie_45k_data["year"].eq(2017), features]

In [None]:
movies_2017

In [None]:
movies_2017.dtypes

#### Now we can merge "credits" with "movies_2017" by the id 

In [None]:
# Cast `id` to int before merging on it with credits_data.
# assign() returns a new frame rather than writing into the existing one.
movies_2017 = movies_2017.assign(id=movies_2017["id"].astype(int))

In [None]:
# Inner join (the default): only movies whose `id` also appears in credits_data remain.
data = movies_2017.merge(credits_data, on="id")

In [None]:
data

#### We can't get the values from the dictionaries in "genres", "cast" and "crew" columns because they are strings

In [None]:
type(data["genres"][1])

#### Thus, we convert them into lists so that we can manipulate them

In [None]:
# Parse the stringified Python literals in these columns into real Python objects.
# Only string values are parsed: after the first run the values are no longer
# strings, and ast.literal_eval raises ValueError on anything that is not a
# string or AST node, so the guard keeps this cell safe to re-run.
# Non-string values pass through unchanged.
for column in ("genres", "cast", "crew"):
    data[column] = data[column].map(
        lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )

In [None]:
type(data["genres"][1])

In [None]:
def create_genre_list(x):
    """Join the genre names of one movie into a single space-separated string.

    Parameters
    ----------
    x : iterable of dict
        Genre entries; the value stored under each entry's ``"name"`` key is used.

    Returns
    -------
    str or float
        The names joined by single spaces, with "Science Fiction" replaced by
        "Sci-Fi"; ``np.nan`` when ``x`` contains no entries.
    """
    # NOTE(review): the "Sci-Fi" spelling presumably keeps the genre as one
    # token for the later text vectorisation - confirm against prepro_1.
    names = [
        "Sci-Fi" if entry.get("name") == "Science Fiction" else entry.get("name")
        for entry in x
    ]
    if not names:
        # np.nan, not the np.NaN alias: the alias was removed in NumPy 2.0.
        return np.nan
    return " ".join(names)

In [None]:
# Collapse each movie's genre entries into one space-separated string.
data["genre_list"] = data["genres"].map(create_genre_list)

In [None]:
data["genre_list"]

#### We want the names of the first 3 cast members

In [None]:
#Name 1
def get_actor1(x):
    """Return the name of the first cast entry.

    Parameters
    ----------
    x : iterable of dict
        Cast entries; the value stored under each entry's ``"name"`` key is used.

    Returns
    -------
    str or float
        The first entry's name, or ``np.nan`` when there are no entries.
    """
    names = [member.get("name") for member in x]
    if not names:
        # np.nan, not the np.NaN alias: the alias was removed in NumPy 2.0.
        return np.nan
    return names[0]

In [None]:
# First cast entry's name for every movie (NaN when the cast list is empty).
data["actor_1_name"] = data["cast"].map(get_actor1)

In [None]:
data["actor_1_name"]

In [None]:
#Name 2
def get_actor2(x):
    """Return the name of the second cast entry.

    Parameters
    ----------
    x : iterable of dict
        Cast entries; the value stored under each entry's ``"name"`` key is used.

    Returns
    -------
    str or float
        The second entry's name, or ``np.nan`` when there are fewer than two entries.
    """
    names = [member.get("name") for member in x]
    if len(names) < 2:
        # np.nan, not the np.NaN alias: the alias was removed in NumPy 2.0.
        return np.nan
    return names[1]

In [None]:
# Second cast entry's name for every movie (NaN when fewer than two entries).
data["actor_2_name"] = data["cast"].map(get_actor2)

In [None]:
data["actor_2_name"]

In [None]:
#Name 3
def get_actor3(x):
    """Return the name of the third cast entry.

    Parameters
    ----------
    x : iterable of dict
        Cast entries; the value stored under each entry's ``"name"`` key is used.

    Returns
    -------
    str or float
        The third entry's name, or ``np.nan`` when there are fewer than three entries.
    """
    names = [member.get("name") for member in x]
    if len(names) < 3:
        # np.nan, not the np.NaN alias: the alias was removed in NumPy 2.0.
        return np.nan
    return names[2]

In [None]:
# Third cast entry's name for every movie (NaN when fewer than three entries).
data["actor_3_name"] = data["cast"].map(get_actor3)

In [None]:
data["actor_3_name"]

#### Next, we'll extract the directors' names

In [None]:
def get_director(x):
    """Return the names of all crew entries whose job is "Director".

    Parameters
    ----------
    x : iterable of dict
        Crew entries; an entry counts as a director when the value under its
        ``"job"`` key equals ``"Director"``.

    Returns
    -------
    str or float
        The director names joined by single spaces (several directors end up
        in one string), or ``np.nan`` when no entry has the job "Director".
    """
    directors = [member.get("name") for member in x if member.get("job") == "Director"]
    if not directors:
        # np.nan, not the np.NaN alias: the alias was removed in NumPy 2.0.
        return np.nan
    return " ".join(directors)

In [None]:
# Director name(s) for every movie (NaN when the crew lists no director).
data["director_name"] = data["crew"].map(get_director)

In [None]:
data["director_name"]

### Feature selection

In [None]:
data.columns

In [None]:
features = ["director_name", "actor_1_name", "actor_2_name", "actor_3_name", "genre_list", "title"]

In [None]:
# Keep only the selected feature columns; note that this rebinds the
# movie_45k_data name to the trimmed frame.
movie_45k_data = data[features]
movie_45k_data.head()

In [None]:
movie_45k_data.shape

### Dropping missing values

In [None]:
movie_45k_data.isnull().sum()

In [None]:
movie_45k_data[movie_45k_data["genre_list"].isnull()]

In [None]:
movie_45k_data = movie_45k_data.dropna()

In [None]:
movie_45k_data.isnull().sum()

### Renaming column names to be consistent with the final dataset in prepro_1

In [None]:
renamed_cols = {"genre_list": "genres", "title": "movie_title"}

In [None]:
movie_45k_data = movie_45k_data.rename(columns=renamed_cols)

In [None]:
movie_45k_data["movie_title"] = movie_45k_data["movie_title"].str.lower()

### For dataset in prepro_1 and current dataset, combine cast and genre columns for future steps in Tfidf vectorizer

#### Current dataset

In [None]:
# One text field per movie: director, the three cast names, then the genres,
# each separated by a single space.
movie_45k_data["combined_info"] = (
    movie_45k_data["director_name"]
    + " "
    + movie_45k_data["actor_1_name"]
    + " "
    + movie_45k_data["actor_2_name"]
    + " "
    + movie_45k_data["actor_3_name"]
    + " "
    + movie_45k_data["genres"]
)

In [None]:
movie_45k_data

#### Previous dataset from prepro_1

In [None]:
movie_5k_data = pd.read_csv("../datasets/movie_5k_data.csv")
movie_5k_data.head()

In [None]:
# Build the text field from movie_5k_data's OWN columns. Building it from
# movie_45k_data (as this cell previously did) aligns on index labels, so rows
# here would receive the text of unrelated movies, or NaN where no label matches.
# NOTE(review): assumes the prepro_1 output has these five columns, as the
# renaming section of this notebook implies - confirm against movie_5k_data.columns.
movie_5k_data["combined_info"] = (
    movie_5k_data["director_name"]
    + " "
    + movie_5k_data["actor_1_name"]
    + " "
    + movie_5k_data["actor_2_name"]
    + " "
    + movie_5k_data["actor_3_name"]
    + " "
    + movie_5k_data["genres"]
)

In [None]:
movie_5k_data

In [None]:
# Stack the two datasets row-wise. DataFrame.append was removed in pandas 2.0;
# pd.concat with its defaults gives the same result (original index labels kept,
# movie_5k_data rows first, movie_45k_data rows last).
combined_metadata = pd.concat([movie_5k_data, movie_45k_data])
combined_metadata

### Drop duplicates from the new combined dataset based on the movie_title

In [None]:
# keep="last": when a title occurs more than once, the row nearest the end of
# the combined frame is the one retained.
combined_metadata = combined_metadata.drop_duplicates(subset="movie_title", keep="last")
combined_metadata

#### Note: We've dropped 137 duplicates

#### Make new dataset after combining both datasets and cleaning it

In [None]:
combined_metadata.to_csv("../datasets/combined_metadata.csv", index=False)