In [121]:
#importing pandas
import pandas as pd
import json
import warnings

# Silence every FutureWarning for the rest of the session.
# NOTE(review): this is global, so it also hides deprecation notices that
# pandas emits as FutureWarning -- consider removing it once those are fixed.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [122]:
# Locations of the two TMDB CSV files, relative to this notebook
movies_path = '../data/tmdb_5000_movies.csv'
credits_path = '../data/tmdb_5000_credits.csv'

# Read both files into DataFrames in one step (movies first, then credits)
movie_df, credits_df = map(pd.read_csv, (movies_path, credits_path))

In [123]:
# Inspect column names, non-null counts and dtypes of the freshly loaded movies table
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [124]:
# Preview the first rows of the freshly loaded movies table
movie_df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [125]:
# Filling missing values
# Each filled Series is assigned back to its column. Calling
# fillna(inplace=True) on a column selection (movie_df[col]) is chained
# assignment: pandas deprecates it with a FutureWarning and, with
# Copy-on-Write enabled, it no longer updates movie_df at all.

# Missing runtimes are replaced by the mean runtime of the other movies
movie_df['runtime'] = movie_df['runtime'].fillna(movie_df['runtime'].mean())
# Placeholder date for movies without a release date
movie_df['release_date'] = movie_df['release_date'].fillna('1900-01-01')
# Missing free-text fields become empty strings
movie_df['overview'] = movie_df['overview'].fillna('')
movie_df['tagline'] = movie_df['tagline'].fillna('')

In [126]:
# Split release_date into numeric year / month / day features.
# pop() removes the original 'release_date' column from movie_df and hands
# it back, so the parsed dates only live in a temporary Series.
release_dates = pd.to_datetime(movie_df.pop('release_date'))

movie_df['release_year'] = release_dates.dt.year.astype(int)
movie_df['release_month'] = release_dates.dt.month.astype(int)
movie_df['release_day'] = release_dates.dt.day.astype(int)

# Display the new 'release_year' column
movie_df['release_year']

0       2009
1       2007
2       2015
3       2012
4       2012
        ... 
4798    1992
4799    2011
4800    2013
4801    2012
4802    2005
Name: release_year, Length: 4803, dtype: int32

In [127]:
def extract_unique_features_from_json(json_column_name : str, df: pd.DataFrame) -> set:
    """
    Extracts the unique 'name' values from a column of JSON-encoded lists in a DataFrame.

    Each cell of the column is expected to hold a JSON string that decodes to a
    list of objects (e.g. '[{"id": 28, "name": "Action"}]'). Objects without a
    'name' key, or with an empty name, are ignored.

    Parameters:
    - json_column_name (str): Name of the JSON column in the DataFrame.
    - df (pandas.DataFrame): DataFrame containing the JSON column.

    Returns:
    - set: A set containing the unique feature names found in the column.

    Raises:
    - ValueError: If json_column_name is not a column of df.
    """
    if json_column_name not in df.columns:
        raise ValueError(f"Column '{json_column_name}' not found in the DataFrame.")

    unique_features = set()

    # Read from the DataFrame that was passed in (not a notebook-level global),
    # so the function works for any frame and any JSON column.
    for features_json in df[json_column_name]:
        # Decode the JSON string into a list of feature objects
        for feature_info in json.loads(features_json):
            feature_name = feature_info.get('name')

            # Skip objects that carry no usable name
            if feature_name:
                unique_features.add(feature_name)

    return unique_features


In [128]:
# Collect every distinct genre name that occurs in the movies table and show it
genres = extract_unique_features_from_json(json_column_name='genres', df=movie_df)
print(genres)

{'Thriller', 'Adventure', 'TV Movie', 'Action', 'Music', 'Family', 'Drama', 'Romance', 'Comedy', 'Animation', 'Science Fiction', 'War', 'Crime', 'Western', 'Fantasy', 'Mystery', 'Horror', 'History', 'Foreign', 'Documentary'}


In [137]:
# Custom one-hot encoding of genres

# Parse each movie's JSON genre list once into the set of genre names it contains
genre_names_per_movie = movie_df['genres'].map(
    lambda genres_json: {genre_info.get('name') for genre_info in json.loads(genres_json)}
)

# Add one 0/1 indicator column per genre. The columns are derived from a
# Series that shares movie_df's index, so rows line up by label and the cell
# does not depend on the frame having a default 0..n-1 index.
for genre in genres:
    movie_df[genre] = genre_names_per_movie.map(
        lambda names: int(genre in names)
    ).astype('int64')
            

In [144]:
# Preview the frame with the genre indicator columns.
# NOTE(review): the execution counts around this cell are not sequential, so
# the stored output may come from a different frame state -- re-run the
# notebook top to bottom to confirm.
movie_df.head()

Unnamed: 0,budget,popularity,revenue,runtime,vote_average,vote_count,release_year,release_month,release_day,genre,...,Science Fiction,War,Crime,Western,Fantasy,Mystery,Horror,History,Foreign,Documentary
0,237000000,150.437577,2787965087,162.0,7.2,11800,2009,12,10,0,...,1,0,0,0,1,0,0,0,0,0
1,300000000,139.082615,961000000,169.0,6.9,4500,2007,5,19,0,...,0,0,0,0,1,0,0,0,0,0
2,245000000,107.376788,880674609,148.0,6.3,4466,2015,10,26,0,...,0,0,1,0,0,0,0,0,0,0
3,250000000,112.31295,1084939099,165.0,7.6,9106,2012,7,16,0,...,0,0,1,0,0,0,0,0,0,0
4,260000000,43.926995,284139100,132.0,6.1,2124,2012,3,7,0,...,1,0,0,0,0,0,0,0,0,0


In [142]:
# Remove unnecessary columns: the raw JSON / free-text / identifier fields
# listed here are dropped from movie_df in place.
unneeded_columns = [
    'genres',
    'homepage',
    'id',
    'keywords',
    'original_language',
    'original_title',
    'overview',
    'production_companies',
    'production_countries',
    'title',
    'spoken_languages',
    'status',
    'tagline',
]
movie_df.drop(columns=unneeded_columns, inplace=True)

In [143]:
# Check which columns remain, with their non-null counts and dtypes
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 30 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   budget           4803 non-null   int64  
 1   popularity       4803 non-null   float64
 2   revenue          4803 non-null   int64  
 3   runtime          4803 non-null   float64
 4   vote_average     4803 non-null   float64
 5   vote_count       4803 non-null   int64  
 6   release_year     4803 non-null   int32  
 7   release_month    4803 non-null   int32  
 8   release_day      4803 non-null   int32  
 9   genre            4803 non-null   int64  
 10  Thriller         4803 non-null   int64  
 11  Adventure        4803 non-null   int64  
 12  TV Movie         4803 non-null   int64  
 13  Action           4803 non-null   int64  
 14  Music            4803 non-null   int64  
 15  Family           4803 non-null   int64  
 16  Drama            4803 non-null   int64  
 17  Romance       