# Question 2

## MediaDiversity Scores

How do the number of characters from different races and the distribution of dialogue lines across genders and races contribute to the overall diversity score provided by the Mediaversity Index?

In [2]:
import pandas as pd
import requests

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix, classification_report


In [3]:
# Loading cleaned data that contains cast information, media diversity score and other features/movie information
movies = pd.read_csv('Dataset/allMediaDiversityMovies.csv' , index_col=False)
movies.sample(5)

Unnamed: 0,known_chars,known_females,known_males,known_nbs,scenes_two_non_males,scenes_no_males,scenes_bechdel,file_name,tmdb_id,score
85,8,3,5,0,1,1,1,the-hitmans-bodyguard-2017,522931,3.0
46,24,9,12,0,22,14,6,Three-Billboards-Outside-Ebbing-Missouri-Scree...,359940,4.08
91,5,4,1,0,10,6,3,wonder-woman-2017,464052,2.75
0,11,3,6,1,30,14,9,Everything-Everywhere-All-At-Once,545611,5.67
16,26,11,12,1,20,5,1,The-Hate-U-Give,470044,4.58


In [4]:
def genderJobs(tmdb):
    url = "https://api.themoviedb.org/3/movie/"+str(tmdb)+"?append_to_response=credits&language=en-US" 
    headers = {
    "accept": "application/json",
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJmMGRlYWRmZDQ4NzE2MWVhMTk4YmM0NDAyYWM1ZmY0YyIsInN1YiI6IjY1ZTM3OTFhMjc4ZDhhMDE0N2JkNzRlZiIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.S3LvxOgSnplzerf7SlRC0h_e8QHtKNsBwRyqM7CaPfg"
    }
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        data = response.json()['credits']['crew']
        nonMaleDirector = False
        nonMaleProducer = False
        for member in data:
            if (member['job'] == 'Director' and member['department'] == 'Directing' and member['gender'] != 2):
                nonMaleDirector = True
            if (((member['job'] == 'Executive Producer') or (member['job'] == 'Producer')) and member['department'] == 'Production' and member['gender'] != 2):
                nonMaleProducer = True
        return nonMaleDirector, nonMaleProducer

In [5]:
movies['nonMaleDirector'], movies['nonMaleProducer'] = zip(*movies.apply(lambda x: genderJobs(x.tmdb_id), axis=1))

In [6]:
# Getting the features and target variable
X = movies[['known_chars', 'known_females', 'known_males', 'known_nbs',
       'scenes_two_non_males', 'scenes_no_males', 'scenes_bechdel', 'nonMaleDirector',
        'nonMaleProducer']]
y = movies.score

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
# Standardizing the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
# Defining parameters for the models
paramsRidge = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg']
}
grid = GridSearchCV(Ridge(max_iter = 10000, random_state=10), paramsRidge)
grid.fit(X_train, y_train)
ridge = grid.best_estimator_
ridge.fit(X_train, y_train)

paramsRFReg = {
 'max_depth': [40, 60, 80, 100, 120],
 'n_estimators': [200, 400, 600],
 'min_samples_split': [5, 10, 20]
}
grid = GridSearchCV(RandomForestRegressor(random_state = 10), paramsRFReg)
grid.fit(X_train, y_train)
rf = grid.best_estimator_
rf.fit(X_train, y_train)

paramsKNN = {
    'n_neighbors': range(10, 31), 
    'weights': ['uniform', 'distance']
}
grid = GridSearchCV(KNeighborsRegressor(), paramsKNN)
grid.fit(X_train, y_train)
knn = grid.best_estimator_
knn.fit(X_train, y_train)

paramsSGD = {
    'alpha': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l2', 'l1'],
    'learning_rate': ['optimal', 'adaptive']
}
grid = GridSearchCV(SGDRegressor(max_iter=10000000, random_state = 10, early_stopping=True), paramsSGD)
grid.fit(X_train, y_train)
sgd = grid.best_estimator_
sgd.fit(X_train, y_train)

models = [ridge, rf, knn, sgd]
model_names = ['Ridge', 'Random Forest', 'KNN', 'SGD']

# Evaluating the models
print('{:15s} {:7s} {:7s}'.format('Model Name', 'MSE', 'R^2'))
for i, model in enumerate(models):
    print('{:15s} {:7.3f} {:7.3f}'.format(model_names[i], mean_squared_error(y_test, model.predict(X_test)), model.score(X_test, y_test)))


Model Name      MSE     R^2    
Ridge             1.044  -0.065
Random Forest     0.882   0.100
KNN               0.904   0.078
SGD               0.982  -0.001


> As we can see, the results using the features we have are NOT good - the model performs poorly, with not much of a fit.
>
> Let's try with Polynomial, to see how it looks:

In [12]:
from sklearn.preprocessing import PolynomialFeatures

In [21]:
poly = PolynomialFeatures(2, include_bias=True)

X_std = scaler.transform(X)
X_std = poly.fit_transform(X_std)

X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.2)

In [23]:
# Defining parameters for the models
paramsRidge = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg']
}
grid = GridSearchCV(Ridge(max_iter = 10000, random_state=10), paramsRidge)
grid.fit(X_train, y_train)
print('RIDGE:', grid.best_params_)
ridge = grid.best_estimator_
ridge.fit(X_train, y_train)

paramsRFReg = {
 'max_depth': [40, 60, 80, 100, 120],
 'n_estimators': [200, 400, 600],
 'min_samples_split': [5, 10, 20]
}
grid = GridSearchCV(RandomForestRegressor(random_state = 10), paramsRFReg)
grid.fit(X_train, y_train)
print('RF:', grid.best_params_)
rf = grid.best_estimator_
rf.fit(X_train, y_train)

paramsKNN = {
    'n_neighbors': range(10, 31), 
    'weights': ['uniform', 'distance']
}
grid = GridSearchCV(KNeighborsRegressor(), paramsKNN)
grid.fit(X_train, y_train)
print('KNN:', grid.best_params_)
knn = grid.best_estimator_
knn.fit(X_train, y_train)

paramsSGD = {
    'alpha': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l2', 'l1'],
    'learning_rate': ['optimal', 'adaptive']
}
grid = GridSearchCV(SGDRegressor(max_iter=100000, random_state = 10, early_stopping=True), paramsSGD)
grid.fit(X_train, y_train)
print('SGD:', grid.best_params_)
sgd = grid.best_estimator_
sgd.fit(X_train, y_train)

models = [ridge, rf, knn, sgd]
model_names = ['Ridge', 'Random Forest', 'KNN', 'SGD']

# Evaluating the models
print('{:15s} {:7s} {:7s}'.format('Model Name', 'MSE', 'R^2'))
for i, model in enumerate(models):
    print('{:15s} {:7.3f} {:7.3f}'.format(model_names[i], mean_squared_error(y_test, model.predict(X_test)), model.score(X_test, y_test)))


RIDGE: {'alpha': 100, 'solver': 'lsqr'}
RF: {'max_depth': 40, 'min_samples_split': 10, 'n_estimators': 400}
KNN: {'n_neighbors': 10, 'weights': 'distance'}
SGD: {'alpha': 10, 'learning_rate': 'adaptive', 'penalty': 'l1'}
Model Name      MSE     R^2    
Ridge             0.987  -0.123
Random Forest     1.065  -0.212
KNN               0.911  -0.037
SGD               1.015  -0.155


> Polynomial with a power of 2 clearly doesn't fit the data at all! Let's try 3 just to be safe.

In [24]:
poly = PolynomialFeatures(3, include_bias=True)

X_std = scaler.transform(X)
X_std = poly.fit_transform(X_std)

X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.2)

# Defining parameters for the models
paramsRidge = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg']
}
grid = GridSearchCV(Ridge(max_iter = 10000, random_state=10), paramsRidge)
grid.fit(X_train, y_train)
print('RIDGE:', grid.best_params_)
ridge = grid.best_estimator_
ridge.fit(X_train, y_train)

paramsRFReg = {
 'max_depth': [40, 60, 80, 100, 120],
 'n_estimators': [200, 400, 600],
 'min_samples_split': [5, 10, 20]
}
grid = GridSearchCV(RandomForestRegressor(random_state = 10), paramsRFReg)
grid.fit(X_train, y_train)
print('RF:', grid.best_params_)
rf = grid.best_estimator_
rf.fit(X_train, y_train)

paramsKNN = {
    'n_neighbors': range(10, 31), 
    'weights': ['uniform', 'distance']
}
grid = GridSearchCV(KNeighborsRegressor(), paramsKNN)
grid.fit(X_train, y_train)
print('KNN:', grid.best_params_)
knn = grid.best_estimator_
knn.fit(X_train, y_train)

paramsSGD = {
    'alpha': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l2', 'l1'],
    'learning_rate': ['optimal', 'adaptive']
}
grid = GridSearchCV(SGDRegressor(max_iter=100000, random_state = 10, early_stopping=True), paramsSGD)
grid.fit(X_train, y_train)
print('SGD:', grid.best_params_)
sgd = grid.best_estimator_
sgd.fit(X_train, y_train)

models = [ridge, rf, knn, sgd]
model_names = ['Ridge', 'Random Forest', 'KNN', 'SGD']

# Evaluating the models
print('{:15s} {:7s} {:7s}'.format('Model Name', 'MSE', 'R^2'))
for i, model in enumerate(models):
    print('{:15s} {:7.3f} {:7.3f}'.format(model_names[i], mean_squared_error(y_test, model.predict(X_test)), model.score(X_test, y_test)))


RIDGE: {'alpha': 1000, 'solver': 'svd'}
RF: {'max_depth': 40, 'min_samples_split': 10, 'n_estimators': 600}
KNN: {'n_neighbors': 10, 'weights': 'uniform'}
SGD: {'alpha': 10, 'learning_rate': 'adaptive', 'penalty': 'l2'}
Model Name      MSE     R^2    
Ridge             1.021  -0.229
Random Forest     1.174  -0.414
KNN               1.026  -0.236
SGD             2560262386860008407040.000 -3083183729207409639424.000


> Much worse - it looks like Polynomial doesn't help us.
