In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer

# from src.data_processing.load_data import load_data

# m, r = load_data()

# Input locations, relative to the notebook's working directory
MOVIES_CSV = '../data/preprocessing/movies_metadata.csv'
RATINGS_CSV = '../data/raw/ratings.csv'

# low_memory=False: parse the file in one pass instead of in chunks, which
# avoids mixed-dtype inference on wide columns
movies = pd.read_csv(MOVIES_CSV, low_memory=False)
ratings = pd.read_csv(RATINGS_CSV)

In [24]:
# Peek at the first row to check which columns were loaded
movies.head(n=1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,original_language,overview,popularity,poster_path,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,release_year
0,False,,160000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",http://inceptionmovie.warnerbros.com/,27205,en,"Cobb, a skilled thief who commits corporate es...",29.108149,/qmDpIHrmpJINaRKAfWQfftjCdyi.jpg,...,825532764.0,148.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Your mind is the scene of the crime.,Inception,False,8.1,14075.0,2010.0


# Feature engineering

## Extract Features from release date

In [25]:
# Parse release_date; values that cannot be parsed become NaT (errors='coerce'),
# which in turn yields a missing release_year.
# NOTE(review): the preview of the loaded file already shows a release_year
# column, so this presumably just recomputes it — confirm it is still needed.
release_dates = pd.to_datetime(movies['release_date'], errors='coerce')
movies['release_year'] = release_dates.dt.year

## Process genres column

In [26]:
def _genre_names(raw):
    """Return the genre names held in one raw `genres` cell.

    Accepts either the stringified list of ``{'id': ..., 'name': ...}`` dicts
    read from the CSV, or an already-parsed list, so re-running this cell does
    not wipe the column. Anything else (NaN, unparseable strings, non-list
    literals) yields an empty list.
    """
    if isinstance(raw, str):
        try:
            raw = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            # Malformed cell: treat it like a missing value
            return []
    if not isinstance(raw, (list, tuple)):
        return []
    names = []
    for entry in raw:
        if isinstance(entry, dict) and 'name' in entry:
            names.append(entry['name'])
        elif isinstance(entry, str):
            # Already reduced to a plain name by a previous run of this cell
            names.append(entry)
    return names


# Replace each raw genres cell with a plain list of genre names
movies['genres'] = movies['genres'].apply(_genre_names)

In [27]:
# One-hot encode the genre lists: one 0/1 column per distinct genre name,
# rows aligned with `movies` via its index.
# NOTE(review): the later `movies.head(3)` output lists columns such as
# 'Sentai Filmworks' and 'The Cartel', which look like production companies
# rather than genres — presumably from malformed source rows; verify upstream.
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies['genres'])
genres_dummies = pd.DataFrame(genre_matrix, index=movies.index, columns=mlb.classes_)

In [28]:
# Append the one-hot encoded genres to the movies DataFrame.
# Any genre columns left over from a previous run of this cell are dropped
# first, so re-running does not create duplicate column names.
movies = pd.concat(
    [movies.drop(columns=genres_dummies.columns, errors='ignore'), genres_dummies],
    axis=1,
)

In [29]:
# Show the first rows, now including the one-hot genre columns
movies.head(n=3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,original_language,overview,popularity,poster_path,...,Romance,Science Fiction,Sentai Filmworks,TV Movie,Telescene Film Group Productions,The Cartel,Thriller,Vision View Entertainment,War,Western
0,False,,160000000.0,"[Action, Thriller, Science Fiction, Mystery, A...",http://inceptionmovie.warnerbros.com/,27205,en,"Cobb, a skilled thief who commits corporate es...",29.108149,/qmDpIHrmpJINaRKAfWQfftjCdyi.jpg,...,0,1,0,0,0,0,1,0,0,0
1,False,"{'id': 263, 'name': 'The Dark Knight Collectio...",185000000.0,"[Drama, Action, Crime, Thriller]",http://thedarkknight.warnerbros.com/dvdsite/,155,en,Batman raises the stakes in his war on crime. ...,123.167259,/1hRoyzDtpgMU7Dz4JF22RANzQO7.jpg,...,0,0,0,0,0,0,1,0,0,0
2,False,"{'id': 87096, 'name': 'Avatar Collection', 'po...",237000000.0,"[Action, Adventure, Fantasy, Science Fiction]",http://www.avatarmovie.com/,19995,en,"In the 22nd century, a paraplegic Marine is di...",185.070892,/kmcqlZGaSh20zpTbuoF0Cdn07dT.jpg,...,0,1,0,0,0,0,0,0,0,0


## Create popularity metrics

In [30]:
# Vote-count-damped rating: vote_average * v / (v + 100), where v = vote_count.
# Movies with few votes are pulled toward 0; heavily voted ones keep
# roughly their vote_average.
# NOTE(review): there is no prior-mean term, so the shrinkage is toward 0
# rather than toward a global average rating — confirm this is intended.
votes = movies['vote_count']
movies['popularity_metric'] = movies['vote_average'] * votes / (votes + 100)

# Standardise the metric (zero mean, unit variance)
scaler = StandardScaler()
raw_metric = movies[['popularity_metric']]
movies['popularity_metric'] = scaler.fit_transform(raw_metric)

# Inspect the new feature next to the columns it was derived from
preview_cols = ['title', 'vote_average', 'vote_count', 'popularity_metric']
movies[preview_cols].head()

Unnamed: 0,title,vote_average,vote_count,popularity_metric
0,Inception,8.1,14075.0,4.165992
1,The Dark Knight,8.3,12269.0,4.282292
2,Avatar,7.2,12114.0,3.614106
3,The Avengers,7.4,12000.0,3.73515
4,Deadpool,7.4,11444.0,3.733347


## Handle budget and revenue columns
Standardise both columns (zero mean, unit variance) so they are on a comparable scale for better model performance

In [31]:
# Standardise budget and revenue with their own scaler. Refitting the shared
# `scaler` here would overwrite the parameters it learned for
# popularity_metric, so that scaling could no longer be inverted or reused.
# Note: this overwrites the raw budget/revenue values in `movies`.
budget_revenue_scaler = StandardScaler()
money_cols = ['budget', 'revenue']
movies[money_cols] = budget_revenue_scaler.fit_transform(movies[money_cols])
movies[['title', 'budget', 'revenue']].head(3)

Unnamed: 0,title,budget,revenue
0,Inception,4.031797,5.165205
1,The Dark Knight,4.760158,6.387266
2,Avatar,6.27515,18.561116


## Combine Text features

In [32]:
# Build one free-text field per movie: "<title> <overview> <tagline>".
# Missing pieces are replaced by empty strings, so the separators remain.
title_text, overview_text, tagline_text = (
    movies[col].fillna('') for col in ('title', 'overview', 'tagline')
)
movies['combined_text'] = title_text + ' ' + overview_text + ' ' + tagline_text

# Preview the combined field alongside the title
movies[['title', 'combined_text']].head()


Unnamed: 0,title,combined_text
0,Inception,"Inception Cobb, a skilled thief who commits co..."
1,The Dark Knight,The Dark Knight Batman raises the stakes in hi...
2,Avatar,"Avatar In the 22nd century, a paraplegic Marin..."
3,The Avengers,The Avengers When an unexpected enemy emerges ...
4,Deadpool,Deadpool Deadpool tells the origin story of fo...


## Save

In [34]:
# Write the engineered frame to disk without the row index
PROCESSED_MOVIES_CSV = '../data/processed/movies_metadata.csv'
movies.to_csv(PROCESSED_MOVIES_CSV, index=False)