# CSS 206 Mini-Project Data Preprocess + Model demos

## Load data files

Assume we have already downloaded csv files in a local folder named "data".

In [244]:
import pandas as pd
import numpy as np
import requests
import sklearn
import nltk
from nltk.stem import PorterStemmer

In [141]:
# Load local csv files
df_movies = pd.read_csv("./data/movies_metadata.csv")
df_keywords = pd.read_csv("./data/keywords.csv")
df_ratings = pd.read_csv("./data/ratings.csv")

# Download bechdel test score data
address = "http://bechdeltest.com/api/v1/getAllMovies"
response = requests.get(address)
df_scores = pd.read_json(response.text)

  exec(code_obj, self.user_global_ns, self.user_ns)


## Combine dataframes

**Let's have a look at dataframes and process id and imdb_id column**

In [142]:
# movies dataframe
print(df_movies.dtypes)
print(df_movies.shape)

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                        object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object
(45466, 24)


In [143]:
# keywords dataframe
print(df_keywords.dtypes)
print(df_keywords.shape)

id           int64
keywords    object
dtype: object
(46419, 2)


In [144]:
# rating dataframe
print(df_ratings.dtypes)
print(df_ratings.shape)

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object
(26024289, 4)


In [145]:
# scores dataframe
print(df_scores.dtypes)
print(df_scores.shape)

year       int64
id         int64
rating     int64
imdbid    object
title     object
dtype: object
(10493, 5)


In [146]:
# Preprocess "id" and "imdb_id" columns in df_movies

# "id" in df_movies is str, "id" in df_ratings and df_keywords is int64
df_movies["id"] = pd.to_numeric(df_movies["id"], errors="coerce")
df_movies.dropna(subset=["id"], inplace=True)
df_movies["id"] = df_movies["id"].astype("int64")

# imdbid in df_movies and df_scores both are str
# imdbid in df_movies has additonal "tt" prefix, so we remove it
df_movies["imdb_id"] = df_movies["imdb_id"].str[2:]

In [147]:
# Check number of data after intersection
s1_id = set(df_movies["id"])
s1_imdb = set(df_movies["imdb_id"])
s2_id = set(df_keywords["id"])
s3_id = set(df_ratings["movieId"])
s4_imdb = set(df_scores["imdbid"])

print(f"Data in s1 intersect s2: {len(s1_id & s2_id)}")
print(f"Data in s1 intersect s3: {len(s1_id & s3_id)}")
print(f"Data in s1 intersect s4: {len(s1_imdb & s4_imdb)}")

Data in s1 intersect s2: 45432
Data in s1 intersect s3: 7565
Data in s1 intersect s4: 7985


There will be even less data if we combine 4 dataframes together.

**Then, let's merge df_movies with other 3 dataframes**

In [148]:
# Combine df_movies and df_keywords together
# Combine and retain only matching rows
df_movies = df_movies.merge(df_keywords, on="id")

In [149]:
# Combine df_movies and df_ratings together
# Calculate average ratings
df_ratings = df_ratings.groupby("movieId")["rating"].mean().reset_index()
df_ratings.columns = ["id", "rating"]

# Combine and retain only matching rows
df_movies = df_movies.merge(df_ratings, on="id")

In [150]:
# Combine df_movies and df_scores together
df_scores = df_scores[["imdbid", "rating"]]
df_scores.columns = ["imdb_id", "bechdel_score"]

# Combine and retain only matching rows
df_movies = df_movies.merge(df_scores, on="imdb_id")

**There only 2k+ rows data now.**

In [151]:
# If you want to save memory 
del df_keywords
del df_ratings
del df_scores

## Change data type

In [152]:
# Check column names and types in df_movies
print(df_movies.dtypes)

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                         int64
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
keywords                  object
rating                   float64
bechdel_score              int64
dtype: object


# Add categorical data
`genres`, `spoken_languages`, `original_language`, `production_countries`

Ignoring `production_companies` because there are more unique values than there are rows.

In [154]:
df_movies['spoken_languages']=df_movies['spoken_languages'].apply(eval).apply(lambda l: [x['name'] for x in l])
df_movies['genres']=df_movies['genres'].apply(eval).apply(lambda l: [x['name'] for x in l])
df_movies['production_countries']=df_movies['production_countries'].apply(eval).apply(lambda l: [x['name'] for x in l])

In [155]:
genres = set()
df_movies['genres'].apply(genres.update)
len(genres)

20

In [157]:
original_language = set()
df_movies['original_language'].dropna().apply(original_language.add)
len(original_language)

33

In [158]:
spoken_languages = set()
df_movies['spoken_languages'].apply(spoken_languages.update)
len(spoken_languages)

54

In [160]:
production_countries = set()
df_movies['production_countries'].apply(production_countries.update)
len(production_countries)

73

In [138]:
# production_companies = set()
# df_movies['production_companies'].apply(lambda l: production_companies.update([x['name'] for x in l]))
# len(production_companies)
# 2813

For multi-valued categorical variables, decompose into 1-hot columns

In [177]:
categorical_vars = {
    'genres': genres,
    'spoken_languages': spoken_languages,
    'production_countries': production_countries
}
onehot_cols = []
for varname, values in categorical_vars.items():
    for var in values:
        varcol = varname+'_'+var
        onehot_cols.append(varcol)
        df_movies[varcol]=0
        has_var = df_movies[varname].apply(lambda l: var in l)
        df_movies.loc[has_var, varcol]=1

genre_cols = [col for col in onehot_cols if 'genre' in col]
cntry_cols = [col for col in onehot_cols if 'countries' in col]
lang_cols = [col for col in onehot_cols if 'spoken' in col]

df_movies.head()

  df_movies[varcol]=0


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,production_countries_Netherlands Antilles,production_countries_Bulgaria,production_countries_Peru,production_countries_Bahamas,production_countries_Canada,production_countries_Finland,production_countries_India,production_countries_United States of America,production_countries_Hong Kong,production_countries_Chile
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,0,0,0,0,0,0,0,1,0,0
1,False,,65000000.0,"[Adventure, Fantasy, Family]",,8844,113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,0,0,0,0,0,0,0,1,0,0
2,False,,60000000.0,"[Action, Crime, Drama, Thriller]",,949,113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,0,0,0,0,0,0,0,1,0,0
3,False,"{'id': 645, 'name': 'James Bond Collection', '...",58000000.0,"[Adventure, Action, Thriller]",http://www.mgm.com/view/movie/757/Goldeneye/,710,113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,...,0,0,0,0,0,0,0,1,0,0
4,False,,98000000.0,"[Action, Adventure]",,1408,112760,en,Cutthroat Island,"Morgan Adams and her slave, William Shaw, are ...",...,0,0,0,0,0,0,0,1,0,0


# Add bag of word columns

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import BernoulliNB
vectorizer = CountVectorizer(stop_words='english')

stemmer = PorterStemmer()
df_movies['keyword_str'] = df_movies['keywords'].apply(eval).apply(lambda l: ' '.join(x['name'] for x in l))
word_columns = ["overview", "keyword_str", "tagline"]
stemmed_cols = []
for col in word_columns:
    stemmed_col = df_movies[col].apply(lambda s: ' '.join([stemmer.stem(w) for w in s.split()]) if type(s) is str else np.nan)

    bnb = BernoulliNB()
    bow = vectorizer.fit_transform(stemmed_col.dropna())
    y = df_movies['bechdel_score'].loc[stemmed_col.dropna().index]
    print(cross_validate(estimator=bnb, X=bow, y=y, scoring='f1_macro')['test_score'])
    # print(stemmed_col.head())


[0.20966642 0.20343705 0.2114386  0.17224279 0.19495868]
[0.25561468 0.25294464 0.25817429 0.23188406 0.21433009]
[0.19696272 0.20101679 0.20822134 0.19114082 0.19763775]


In [178]:
# Convert budget and popularity column from string to float
df_movies["budget"] = df_movies["budget"].astype(float)
df_movies["popularity"] = df_movies["popularity"].astype(float)

In [189]:
# Pick columns that we need to train our model
cont_cols = ["budget", "popularity", "revenue", "runtime", "vote_average", "vote_count", "rating"]
print(len(cont_cols), len(onehot_cols))
df_train = df_movies[cont_cols+onehot_cols+['bechdel_score']]
df_train.head()

7 147


Unnamed: 0,budget,popularity,revenue,runtime,vote_average,vote_count,rating,genres_Mystery,genres_Comedy,genres_Drama,...,production_countries_Bulgaria,production_countries_Peru,production_countries_Bahamas,production_countries_Canada,production_countries_Finland,production_countries_India,production_countries_United States of America,production_countries_Hong Kong,production_countries_Chile,bechdel_score
0,30000000.0,21.946943,373554033.0,81.0,7.7,5415.0,3.59893,0,1,0,...,0,0,0,0,0,0,1,0,0,1
1,65000000.0,17.015539,262797249.0,104.0,6.9,2413.0,3.760163,0,0,0,...,0,0,0,0,0,0,1,0,0,3
2,60000000.0,17.924927,187436818.0,170.0,7.7,1886.0,3.905544,0,0,1,...,0,0,0,0,0,0,1,0,0,2
3,58000000.0,14.686036,352194034.0,130.0,6.6,1194.0,2.740334,0,0,0,...,0,0,0,0,0,0,1,0,0,3
4,98000000.0,7.284477,10017322.0,119.0,5.7,137.0,3.710181,0,0,0,...,0,0,0,0,0,0,1,0,0,1


## Split dataset

In [197]:
data = df_train.values

# Shuffle data, you may change seed
np.random.seed(1)
np.random.shuffle(data)

# split features and label
x_cont = data[:, :len(cont_cols)]
x_cat = data[:, len(cont_cols):-1]
y = data[:, -1]

# Change data type
x_cont = x_cont.astype(np.float32)
y = y.astype(np.int32) # We use float in regression and int in classification
x_cont.shape, x_cat.shape, np.unique(x_cat), y.shape, np.unique(y)

((2167, 7), (2167, 147), array([0., 1.]), (2167,), array([0, 1, 2, 3]))

In [200]:
# split train dataset and test dataset, you may change the ratio
train_ratio = 0.9
threshold = int(train_ratio * len(y))
x_cont_train = x_cont[:threshold, :]
x_cont_test = x_cont[threshold:, :]

x_cat_train = x_cat[:threshold, :]
x_cat_test = x_cat[threshold:, :]

y_train = y[:threshold]
y_test = y[threshold:]

# Feature scaling with Z-score
train_mean, train_std = np.mean(x_cont_train, axis=0), np.std(x_cont_train, axis=0)
x_cont_train = (x_cont_train - train_mean) / train_std
x_cont_test  = (x_cont_test  - train_mean) / train_std

In [226]:
x_all_train = np.concatenate([x_cont_train, x_cat_train], axis=1)
x_all_test = np.concatenate([x_cont_test, x_cat_test], axis=1)

Then you can go training model with (x_train, y_train), and test model with (x_test, y_test)

## Train models (demo)

In [227]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import root_mean_squared_error, f1_score

print("Continuous featues")

# Support vector machine
model = SVC()
model.fit(x_cont_train, y_train)
y_pred = model.predict(x_cont_test)

loss = root_mean_squared_error(y_test, y_pred)
acc = np.mean(y_pred[:] == y_test[:])
f1 = f1_score(y_test, y_pred, average='macro')
print(f"SVM: Loss- {loss:.2f}, Accuracy- {acc:.2f}, f1- {f1:.2f}")

# Random forest
model = RandomForestClassifier()
model.fit(x_cont_train, y_train)
y_pred = model.predict(x_cont_test)

loss = root_mean_squared_error(y_test, y_pred)
acc = np.mean(y_pred[:] == y_test[:])
f1 = f1_score(y_test, y_pred, average='macro')
print(f"RF: Loss- {loss:.2f}, Accuracy- {acc:.2f}, f1- {f1:.2f}")

# Random forest
model = DecisionTreeClassifier(max_depth=2)
model.fit(x_cont_train, y_train)
y_pred = model.predict(x_cont_test)

loss = root_mean_squared_error(y_test, y_pred)
acc = np.mean(y_pred[:] == y_test[:])
f1 = f1_score(y_test, y_pred, average='macro')
print(f"DT: Loss- {loss:.2f}, Accuracy- {acc:.2f}, f1- {f1:.2f}")

# Gaussian naive bayes
model = GaussianNB()
model.fit(x_cont_train, y_train)
y_pred = model.predict(x_cont_test)

loss = root_mean_squared_error(y_test, y_pred)
acc = np.mean(y_pred[:] == y_test[:])
f1 = f1_score(y_test, y_pred, average='macro')
print(f"GNB: Loss- {loss:.2f}, Accuracy- {acc:.2f}, f1- {f1:.2f}")

print("Categorical featues")

# Support vector machine
model = SVC()
model.fit(x_cat_train, y_train)
y_pred = model.predict(x_cat_test)

loss = root_mean_squared_error(y_test, y_pred)
acc = np.mean(y_pred[:] == y_test[:])
f1 = f1_score(y_test, y_pred, average='macro')
print(f"SVM: Loss- {loss:.2f}, Accuracy- {acc:.2f}, f1- {f1:.2f}")

# Random forest
model = RandomForestClassifier()
model.fit(x_cat_train, y_train)
y_pred = model.predict(x_cat_test)

loss = root_mean_squared_error(y_test, y_pred)
acc = np.mean(y_pred[:] == y_test[:])
f1 = f1_score(y_test, y_pred, average='macro')
print(f"RF: Loss- {loss:.2f}, Accuracy- {acc:.2f}, f1- {f1:.2f}")

# Random forest
model = DecisionTreeClassifier(max_depth=2)
model.fit(x_cat_train, y_train)
y_pred = model.predict(x_cat_test)

loss = root_mean_squared_error(y_test, y_pred)
acc = np.mean(y_pred[:] == y_test[:])
f1 = f1_score(y_test, y_pred, average='macro')
print(f"DT: Loss- {loss:.2f}, Accuracy- {acc:.2f}, f1- {f1:.2f}")

# Bernoulli naive bayes
model = BernoulliNB()
model.fit(x_cat_train, y_train)
y_pred = model.predict(x_cat_test)

loss = root_mean_squared_error(y_test, y_pred)
acc = np.mean(y_pred[:] == y_test[:])
f1 = f1_score(y_test, y_pred, average='macro')
print(f"BNB: Loss- {loss:.2f}, Accuracy- {acc:.2f}, f1- {f1:.2f}")

print("All features")

# Support vector machine
model = SVC()
model.fit(x_all_train, y_train)
y_pred = model.predict(x_all_test)

loss = root_mean_squared_error(y_test, y_pred)
acc = np.mean(y_pred[:] == y_test[:])
f1 = f1_score(y_test, y_pred, average='macro')
print(f"SVM: Loss- {loss:.2f}, Accuracy- {acc:.2f}, f1- {f1:.2f}")

# Random forest
model = RandomForestClassifier()
model.fit(x_all_train, y_train)
y_pred = model.predict(x_all_test)

loss = root_mean_squared_error(y_test, y_pred)
acc = np.mean(y_pred[:] == y_test[:])
f1 = f1_score(y_test, y_pred, average='macro')
print(f"RF: Loss- {loss:.2f}, Accuracy- {acc:.2f}, f1- {f1:.2f}")

# Random forest
model = DecisionTreeClassifier(max_depth=2)
model.fit(x_all_train, y_train)
y_pred = model.predict(x_all_test)

loss = root_mean_squared_error(y_test, y_pred)
acc = np.mean(y_pred[:] == y_test[:])
f1 = f1_score(y_test, y_pred, average='macro')
print(f"DT: Loss- {loss:.2f}, Accuracy- {acc:.2f}, f1- {f1:.2f}")

# Bernoulli naive bayes
model = BernoulliNB()
model.fit(x_all_train, y_train)
y_pred = model.predict(x_all_test)

loss = root_mean_squared_error(y_test, y_pred)
acc = np.mean(y_pred[:] == y_test[:])
f1 = f1_score(y_test, y_pred, average='macro')
print(f"BNB: Loss- {loss:.2f}, Accuracy- {acc:.2f}, f1- {f1:.2f}")

Continuous featues
SVM: Loss- 1.35, Accuracy- 0.55, f1- 0.23
RF: Loss- 1.33, Accuracy- 0.52, f1- 0.27
DT: Loss- 1.37, Accuracy- 0.54, f1- 0.20
GNB: Loss- 1.39, Accuracy- 0.53, f1- 0.27
Categorical featues
SVM: Loss- 1.30, Accuracy- 0.55, f1- 0.29
RF: Loss- 1.34, Accuracy- 0.50, f1- 0.32
DT: Loss- 1.33, Accuracy- 0.53, f1- 0.27
BNB: Loss- 1.31, Accuracy- 0.53, f1- 0.31
All features
SVM: Loss- 1.32, Accuracy- 0.55, f1- 0.27
RF: Loss- 1.31, Accuracy- 0.53, f1- 0.30
DT: Loss- 1.37, Accuracy- 0.53, f1- 0.25
BNB: Loss- 1.29, Accuracy- 0.54, f1- 0.34


In [231]:
y_naive = np.full_like(y_test, fill_value=3)
loss = root_mean_squared_error(y_test, y_naive)
acc = np.mean(y_naive[:] == y_test[:])
f1 = f1_score(y_test, y_naive, average='macro')
print(f"Naive: Loss- {loss:.2f}, Accuracy- {acc:.2f}, f1- {f1:.2f}")

Naive: Loss- 1.38, Accuracy- 0.53, f1- 0.17
