# CSS 206 Mini-Project Data Preprocess + Model demos

## Load data files

Assume we have already downloaded csv files in a local folder named "data".

In [1]:
import pandas as pd
import numpy as np
import requests

In [2]:
# Load local csv files
df_movies = pd.read_csv("./data/movies_metadata.csv")
df_keywords = pd.read_csv("./data/keywords.csv")
df_ratings = pd.read_csv("./data/ratings.csv")

# Download bechdel test score data
address = "http://bechdeltest.com/api/v1/getAllMovies"
response = requests.get(address)
df_scores = pd.read_json(response.text)

  df_movies = pd.read_csv("./data/movies_metadata.csv")
  df_scores = pd.read_json(response.text)


## Combine dataframes

**Let's have a look at dataframes and process id and imdb_id column**

In [3]:
# movies dataframe
print(df_movies.dtypes)
print(df_movies.shape)

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                        object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object
(45466, 24)


In [4]:
# keywords dataframe
print(df_keywords.dtypes)
print(df_keywords.shape)

id           int64
keywords    object
dtype: object
(46419, 2)


In [5]:
# rating dataframe
print(df_ratings.dtypes)
print(df_ratings.shape)

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object
(26024289, 4)


In [6]:
# scores dataframe
print(df_scores.dtypes)
print(df_scores.shape)

imdbid    object
id         int64
year       int64
title     object
rating     int64
dtype: object
(10493, 5)


In [7]:
# Preprocess "id" and "imdb_id" columns in df_movies

# "id" in df_movies is str, "id" in df_ratings and df_keywords is int64
df_movies["id"] = pd.to_numeric(df_movies["id"], errors="coerce")
df_movies.dropna(subset=["id"], inplace=True)
df_movies["id"] = df_movies["id"].astype("int64")

# imdbid in df_movies and df_scores both are str
# imdbid in df_movies has additonal "tt" prefix, so we remove it
df_movies["imdb_id"] = df_movies["imdb_id"].str[2:]

In [8]:
# Check number of data after intersection
s1_id = set(df_movies["id"])
s1_imdb = set(df_movies["imdb_id"])
s2_id = set(df_keywords["id"])
s3_id = set(df_ratings["movieId"])
s4_imdb = set(df_scores["imdbid"])

print(f"Data in s1 intersect s2: {len(s1_id & s2_id)}")
print(f"Data in s1 intersect s3: {len(s1_id & s3_id)}")
print(f"Data in s1 intersect s4: {len(s1_imdb & s4_imdb)}")

Data in s1 intersect s2: 45432
Data in s1 intersect s3: 7565
Data in s1 intersect s4: 7985


There will be even less data if we combine 4 dataframes together.

**Then, let's merge df_movies with other 3 dataframes**

In [9]:
# Combine df_movies and df_keywords together
# Combine and retain only matching rows
df_movies = df_movies.merge(df_keywords, on="id")

In [10]:
# Combine df_movies and df_ratings together
# Calculate average ratings
df_ratings = df_ratings.groupby("movieId")["rating"].mean().reset_index()
df_ratings.columns = ["id", "rating"]

# Combine and retain only matching rows
df_movies = df_movies.merge(df_ratings, on="id")

In [11]:
# Combine df_movies and df_scores together
df_scores = df_scores[["imdbid", "rating"]]
df_scores.columns = ["imdb_id", "bechdel_score"]

# Combine and retain only matching rows
df_movies = df_movies.merge(df_scores, on="imdb_id")

**There only 2k+ rows data now.**

In [12]:
# If you want to save memory 
del df_keywords
del df_ratings
del df_scores

## Change data type

In [13]:
# Check column names and types in df_movies
print(df_movies.dtypes)

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                         int64
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
keywords                  object
rating                   float64
bechdel_score              int64
dtype: object


In [14]:
# Pick columns that we need to train our model
# You may add more column here
column_names = ["budget", "popularity", "revenue", "runtime", "vote_average", "vote_count", "rating", "bechdel_score"]
df_movies = df_movies[column_names]

In [15]:
# Convert budget and popularity column from string to float
df_movies["budget"] = df_movies["budget"].astype(float)
df_movies["popularity"] = df_movies["popularity"].astype(float)

**You may apply more data preprocessing here.**

In [16]:
# You may apply one-hot encoding to categorical columns like "genres"
# However, in that case there will be a lot of sparse input features.

## Split dataset

In [29]:
# Convert data into numpy array
data = df_movies.values

# Shuffle data, you may change seed
np.random.seed(1)
np.random.shuffle(data)

# split features and label
x, y = data[:, :-1], data[:, -1]

# Change data type
x = x.astype(np.float32)
y = y.astype(np.int32) # We use float in regression and int in classification

In [30]:
# split train dataset and test dataset, you may change the ratio
train_ratio = 0.9
threshold = int(train_ratio * len(x))
x_train, y_train = x[:threshold, :], y[:threshold]
x_test,  y_test  = x[threshold:, :], y[threshold:]

# Feature scaling with Z-score
train_mean, train_std = np.mean(x_train, axis=0), np.std(x_train, axis=0)
x_train = (x_train - train_mean) / train_std
x_test  = (x_test  - train_mean) / train_std

Then you can go training model with (x_train, y_train), and test model with (x_test, y_test)

## Train models (demo)

In [35]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error

# Support vector machine
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

loss = mean_squared_error(y_test, y_pred)
acc = np.mean(y_pred[:] == y_test[:])
print(f"SVM: Loss- {loss}, Accuracy- {acc}")

# Random forest
model = RandomForestClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

loss = mean_squared_error(y_test, y_pred)
acc = np.mean(y_pred[:] == y_test[:])
print(f"RF: Loss- {loss}, Accuracy- {acc}")

# Gaussian naive bayes
model = GaussianNB()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

loss = mean_squared_error(y_test, y_pred)
acc = np.mean(y_pred[:] == y_test[:])
print(f"GNB: Loss- {loss}, Accuracy- {acc}")

SVM: Loss- 1.6221198156682028, Accuracy- 0.5576036866359447
RF: Loss- 1.7788018433179724, Accuracy- 0.511520737327189
GNB: Loss- 1.8387096774193548, Accuracy- 0.5207373271889401
