In [2]:
import pandas as pd
import numpy as np
import ast

from typing import Literal, LiteralString

In [63]:
df = pd.read_csv('dataset/TMDB_movie_dataset_v11.csv')

In [64]:
print(f'Original Row Count: {len(df)}')

Original Row Count: 1331123


# preprocessing

## Filter movies

In [65]:
lost_movies = df[(df['vote_count'] >= 10) & (df['vote_count'] < 50)]

print(f"Movies with 10-49 votes: {len(lost_movies)}")
print("\n--- Examples of movies you are DROPPING ---")
print(lost_movies[['title', 'vote_count']].sample(20))

Movies with 10-49 votes: 50871

--- Examples of movies you are DROPPING ---
                                           title  vote_count
62432                         For Love and Honor          14
56113                             Change of Life          16
35972                           Killer Mosquitos          33
56377                     Louis de Funès Forever          16
58009                                   The Boys          16
76842                Witness for the Prosecution          10
47385                                     Victor          22
47868  Trailer Park Boys: Live at the North Pole          21
75095                     The Key Is in the Door          10
78735                             Tarzan's Peril          10
58174   Scenes from Under Childhood, Section One          16
56776                        Dieudonné - Foxtrot          16
29749                                  Off Track          45
49024                      Antarctica: Ice & Sky          20
50112    

In [66]:
filtered_df = df[df['vote_count'] > 50].copy()

In [67]:
filtered_df = filtered_df[filtered_df['status'] == 'Released'].copy()

In [68]:
print(f"Rows after filtering for quality: {len(filtered_df)}")

Rows after filtering for quality: 27689


In [69]:
df = filtered_df

Movies with voting count >= 50 are good for the bot

## Cleaning

### Drop useless cols

In [None]:
df = df.drop(columns=['homepage', 'poster_path', 'backdrop_path', 'imdb_id'], errors='ignore')

### Handle Duplication

In [None]:
df.title.duplicated().sum()

np.int64(0)

In [75]:
df.drop_duplicates(subset=['title'], inplace=True)

### Handle Nulls

In [78]:
df.isnull().sum()

id                         0
title                      0
vote_average               0
vote_count                 0
status                     0
release_date               0
revenue                    0
runtime                    0
adult                      0
budget                     0
original_language          0
original_title             0
overview                 108
popularity                 0
tagline                 9324
genres                    17
production_companies     818
production_countries     233
spoken_languages          64
keywords                3719
dtype: int64

less nulls now after taking only the high quality movies

In [79]:
df['overview'] = df['overview'].fillna('')
df['genres'] = df['genres'].fillna('unknown')
df['keywords'] = df['keywords'].fillna('')
df['tagline'] = df['tagline'].fillna('')

## Parsing year

In [80]:
def get_year(y) -> int:
    try:
        return int(str(y).split('-')[0])
    except:
        return 0

In [81]:
df['year'] = df['release_date'].apply(get_year)

In [83]:
print(f"New Count: {len(df)}")

New Count: 26075


## Create Soup

In [87]:
df.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'budget', 'original_language',
       'original_title', 'overview', 'popularity', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords', 'year'],
      dtype='object')

In [94]:
def create_soup(row) -> str:
    return f"""
    Movie Title: {row['title']}
    Genres: {row['genres']}
    Keywords: {row['keywords']}
    Rating: {row['vote_average']}
    Overview: {row['overview']}
    Budget: {row['budget']}
    Status: {row['status']}
    Runtime: {row['runtime']}
    Tagline: {row['tagline']}
    Adult: {row['adult']}
    Year: {row['year']}
    Production Companies: {row['production_companies']}
    """

In [95]:
df['page_content'] = df.apply(create_soup, axis=1)

In [100]:
final_df = df[['id', 'title', 'page_content', 'vote_average', 'genres', 'year']].copy()

In [101]:
final_df.to_csv('dataset/processed_2024_tmdb_movies.csv', index=False)