# Libraries


In [1]:
import pandas as pd
import ast

from typing import Literal, LiteralString

# Load data


In [2]:
movies = pd.read_csv('dataset/tmdb_5000_movies.csv')
credits = pd.read_csv('dataset/tmdb_5000_credits.csv')

In [3]:
df = movies.merge(credits, on='title')

# Prepare data


## Duplicates


In [4]:
df.duplicated(subset='id').sum()

np.int64(6)

In [5]:
df = df.drop_duplicates(subset='id')

## Parsing cols


In [6]:
def parsing(text) -> LiteralString | Literal['']:
    try:
        if not isinstance(text, str) or text == '[]':
            return ''
        
        data = ast.literal_eval(text)
        names = [item['name'] for item in data]
        
        return ', '.join(names)
    
    except Exception as e:
        print(f"Error parsing: {text[:30]}... -> {e}") 
        return ''

In [7]:
def get_top_cast(text) -> LiteralString | Literal['']:
    try:
        data = ast.literal_eval(text)
        names = [item['name'] for item in data]
        
        return ', '.join(names)
    except:
        return ''

In [8]:
from typing import Literal, LiteralString


def get_genres(text) -> LiteralString | Literal['']:
    try:
        return ", ".join([x['name'] for x in ast.literal_eval(text)])
    except:
        return ""

In [9]:
def get_year(date_str) -> int:
    try:
        return int(str(date_str).split('-')[0])
    except:
        return 0

### Apply Parsing


In [10]:
df['genre_names'] = df['genres'].apply(get_genres)
df['keyword_names'] = df['keywords'].apply(parsing)
df['production_companies_names'] = df['production_companies'].apply(parsing)
df['year'] = df['release_date'].apply(get_year)

In [11]:
display(df['genre_names'].head())
display(df['keyword_names'].head())
display(df['production_companies_names'].head())

0    Action, Adventure, Fantasy, Science Fiction
1                     Adventure, Fantasy, Action
2                       Action, Adventure, Crime
3                 Action, Crime, Drama, Thriller
4             Action, Adventure, Science Fiction
Name: genre_names, dtype: object

0    culture clash, future, space war, space colony...
1    ocean, drug abuse, exotic island, east india t...
2    spy, based on novel, secret agent, sequel, mi6...
3    dc comics, crime fighter, terrorist, secret id...
4    based on novel, mars, medallion, space travel,...
Name: keyword_names, dtype: object

0    Ingenious Film Partners, Twentieth Century Fox...
1    Walt Disney Pictures, Jerry Bruckheimer Films,...
2                       Columbia Pictures, Danjaq, B24
3    Legendary Pictures, Warner Bros., DC Entertain...
4                                 Walt Disney Pictures
Name: production_companies_names, dtype: object

In [12]:
df['cast_names'] = df['cast'].apply(get_top_cast)
df['director_name'] = df['crew'].apply(lambda x: next((i['name'] for i in ast.literal_eval(x) if i['job']=='Director'), ""))

In [13]:
display(df.cast_names.head())
display(df.director_name.head())

0    Sam Worthington, Zoe Saldana, Sigourney Weaver...
1    Johnny Depp, Orlando Bloom, Keira Knightley, S...
2    Daniel Craig, Christoph Waltz, Léa Seydoux, Ra...
3    Christian Bale, Michael Caine, Gary Oldman, An...
4    Taylor Kitsch, Lynn Collins, Samantha Morton, ...
Name: cast_names, dtype: object

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4       Andrew Stanton
Name: director_name, dtype: object

## Nulls


In [14]:
df['overview'] = df['overview'].fillna("") 
df['tagline'] = df['tagline'].fillna("")

## Create Soup


In [15]:
def create_soup(row) -> str:
    return f"""
    Movie Title: {row['title']}
    Director: {row['director_name']}
    Cast: {row['cast_names']}
    Genres: {row['genre_names']}
    Keywords: {row['keyword_names']}
    Rating: {row['vote_average']}
    Overview: {row['overview']}
    Budget: {row['budget']}
    Status: {row['status']}
    Runtime: {row['runtime']}
    Tagline: {row['tagline']}
    Year: {row['year']}
    Production Companies Names: {row['production_companies_names']}
    """

In [16]:
df['page_content'] = df.apply(create_soup, axis=1)

In [17]:
final_df = df[['id', 'title', 'page_content', 'vote_average', 'genre_names', 'year']].copy()

In [18]:
final_df.to_csv('dataset/processed_movies.csv', index=False)