## Decision Tree Based Methods
This notebook contains the code for testing decision-tree-based methods on the Hollywood lead actors dataset.

In [1]:
import json

import joblib
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import RobustScaler
import os

In [2]:
## Config Variables
# Input data location and the directory that receives this notebook's results.
data_dir = './data'
save_res_dir = "./results/dec_tree"
# exist_ok=True creates the directory (and parents) if missing and is a no-op
# when it is already there, avoiding a separate check-then-create step.
os.makedirs(save_res_dir, exist_ok=True)

# Number of cross-validation folds passed to the grid search.
CV = 5
# Scorer names evaluated during the grid search and reported per model.
metrics = ['precision', 'recall', 'f1', 'accuracy']
# Seed shared by the stochastic estimators in this notebook.
random_state = 42

In [3]:

def _to_builtin(value):
    """Convert a NumPy scalar to the matching built-in type so json can encode it."""
    if isinstance(value, np.floating):
        return float(value)
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.generic):
        # Any other NumPy scalar (e.g. str_ or bool_) -> native Python object.
        return value.item()
    return value


def save_best_estimator(clf, name):
    """Save the refit best estimator and the first-ranked CV row of every metric.

    For each entry of the module-level ``metrics`` list, the row of
    ``clf.cv_results_`` whose ``rank_test_<metric>`` equals 1 is collected
    (every ``cv_results_`` key, taken at that row's index).

    Args:
        clf: A fitted search object exposing ``cv_results_`` and
            ``best_estimator_`` (e.g. a multi-metric ``GridSearchCV``).
        name: Base name used for both output files.

    Side effects:
        Writes ``<name>.pkl`` (joblib dump of ``clf.best_estimator_``) to the
        current working directory and ``<name>_stats.json`` to
        ``save_res_dir``.
    """
    best_model_stats = {}

    for metric in metrics:
        ranks = clf.cv_results_[f"rank_test_{metric}"]
        # Index of the first parameter combination ranked 1 for this metric.
        best_index = ranks.tolist().index(1)

        best_model_stats[metric] = {
            key: _to_builtin(val[best_index])
            for key, val in clf.cv_results_.items()
        }

    joblib.dump(clf.best_estimator_, f'{name}.pkl', compress=1)

    # save_res_dir already points at the dec_tree results folder; appending
    # another "dec_tree" component targeted a directory that is never created.
    os.makedirs(save_res_dir, exist_ok=True)
    stats_path = os.path.join(save_res_dir, f"{name}_stats.json")
    with open(stats_path, 'w') as jf:
        json.dump(best_model_stats, jf, indent=4)

## Reading and Cleaning Data
We log-scale some of the variables (using `log(x + 1)`) and remove `Total words`, `Gross` and `Year` from the dataset. For the reasoning behind these choices, please refer to the report PDF and its data analysis section.

In [None]:
# Same value as in the config cell; repeated so this cell can run on its own.
data_dir = "./data"
df = pd.read_csv(os.path.join(data_dir, "train.csv"))

# Columns that receive the log(x + 1) transform below.
logscaled = ['Number words female', 'Number of words lead',
             'Difference in words lead and co-lead', 'Number of male actors',
             'Number of female actors', 'Number words male']

for col in logscaled:
    # The +1 keeps zero-valued entries finite under the logarithm.
    df[col] = np.log(df[col] + 1)

df = df.drop(columns=["Gross", "Total words", "Year"])

# DataFrame.drop defaults to axis=0 (row labels), so the target has to be
# removed explicitly as a column to obtain the feature matrix.
x = df.drop(columns="Lead")
# Encode the target as 0/1. Labels other than "Male"/"Female" become NaN.
y = df['Lead'].map({"Male": 0, "Female": 1})


## Decision Tree Classifier

In [None]:
# Search grid; the "model__" prefix routes each key to the pipeline step
# named 'model'.
parameters = {'model__criterion': ["gini", "entropy", "log_loss"], 'model__max_depth': range(3, 15),
              'model__min_samples_leaf': range(1, 10),
              'model__min_samples_split': range(2, 10)}
# GridSearchCV takes no random_state argument, so the seed is set on the
# tree estimator itself.
model = Pipeline([('scaler', RobustScaler()),
                  ('model', DecisionTreeClassifier(random_state=random_state))])

# Every scorer in `metrics` is evaluated; the final estimator is refit using
# the parameters that rank best on accuracy.
clf = GridSearchCV(model, parameters, n_jobs=4, cv=CV, refit="accuracy",
                   scoring=metrics)
clf.fit(X=x.values, y=y)
save_best_estimator(clf, "decisionTree")
print("decision tree best result", clf.best_score_, clf.best_params_)
