In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import joblib

In [2]:
# Load the training data straight from the raw CSV hosted on GitHub
url = 'https://raw.githubusercontent.com/Ari-vu/SML/main/Given_data/train.csv'
df = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Randomly split the rows into a training set (70%) and a hold-out test set (30%).
perc = 0.7  # fraction of the rows used for training
random_state = 10  # seed reused for the split, the models and the hyperparameter search
np.random.seed(random_state)
# Draw row *positions* (0 .. n_rows-1) without replacement for the training set.
trainIndex = np.random.choice(df.shape[0], size=int(perc*df.shape[0]), replace=False)
train = df.iloc[trainIndex]
# Select the complement by position as well. Matching the drawn positions against
# df.index labels would only be correct while df has a default 0..n-1 RangeIndex.
is_test_row = np.ones(df.shape[0], dtype=bool)
is_test_row[trainIndex] = False
test = df.iloc[is_test_row]

In [4]:
# Separate the predictors (X) from the target column 'Lead' (y) for both partitions
X_train, y_train = train.drop(columns=['Lead']), train['Lead']
X_test, y_test = test.drop(columns=['Lead']), test['Lead']

In [5]:
# Baseline: a random forest with default hyperparameters, seeded for reproducibility
base_model = RandomForestClassifier(random_state=random_state)
base_model.fit(X_train, y_train)

# Predict each partition once and reuse the result for the accuracy and the table
base_train_pred = base_model.predict(X_train)
base_test_pred = base_model.predict(X_test)
print('Accuracy on training data: %.4f' % (base_train_pred == y_train).mean())
print('Accuracy on test data: %.4f' % (base_test_pred == y_test).mean())
# Confusion table: rows are the predicted labels, columns the true 'Lead' labels
pd.crosstab(base_test_pred, y_test)

Accuracy on training data: 1.0000
Accuracy on test data: 0.8462


Lead,Female,Male
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,32,10
Male,38,232


In [6]:
# Show the baseline forest's hyperparameters (only random_state was set explicitly)
base_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 10,
 'verbose': 0,
 'warm_start': False}

In [7]:
# Candidate values for each hyperparameter sampled by the randomized search
n_estimators = [100, 300, 500, 800, 1200]    # number of trees in the forest
max_depth = [5, 8, 15, 25, 30]               # maximum depth of each tree
min_samples_split = [2, 5, 10, 15, 100]      # minimum samples needed to split a node
min_samples_leaf = [1, 2, 5, 10]             # minimum samples required at a leaf

# Map each RandomForestClassifier parameter name to its list of candidates
random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [8]:
# Randomized hyperparameter search: 100 sampled settings, each scored with 3-fold CV
CV = RandomizedSearchCV(
    base_model,
    random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=random_state,
    n_jobs=-1,
)
CV.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(random_state=10),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': [5, 8, 15, 25, 30],
                                        'min_samples_leaf': [1, 2, 5, 10],
                                        'min_samples_split': [2, 5, 10, 15,
                                                              100],
                                        'n_estimators': [100, 300, 500, 800,
                                                         1200]},
                   random_state=10, verbose=2)

In [9]:
# Hyperparameter combination selected by the randomized search
CV.best_params_

{'max_depth': 25,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 1200}

In [10]:
# Define and fit the optimized model with the hyperparameters chosen by the search.
# They are read from CV.best_params_ instead of being copied by hand, so this cell
# stays in sync with the search if the grid, the data or the seed changes.
optimized_model = RandomForestClassifier(**CV.best_params_, random_state=random_state)
optimized_model.fit(X_train, y_train)
# Display the full configuration of the fitted model
optimized_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 25,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 10,
 'verbose': 0,
 'warm_start': False}

In [11]:
# Score the tuned forest on both partitions; each partition is predicted only once
opt_train_pred = optimized_model.predict(X_train)
opt_test_pred = optimized_model.predict(X_test)
print('Accuracy on training data: %.4f' % (opt_train_pred == y_train).mean())
print('Accuracy on test data: %.4f' % (opt_test_pred == y_test).mean())
# Confusion table: rows are the predicted labels, columns the true 'Lead' labels
pd.crosstab(opt_test_pred, y_test)

Accuracy on training data: 1.0000
Accuracy on test data: 0.8558


Lead,Female,Male
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,34,9
Male,36,233


In [12]:
# Rank the features by their importance in the optimized forest
importances = list(optimized_model.feature_importances_)
# Pair every feature name with its importance rounded to two decimals
feature_importances = [(feature, round(importance, 2)) for feature, importance
                       in zip(X_train.columns, importances)]
# Sort on the rounded value, highest first; the sort is stable, so features that tie
# after rounding keep their original column order
feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)
# A plain loop is used for printing: a list comprehension would build a throw-away
# list of None values just for its side effects
for pair in feature_importances:
    print('Feature: {:40} Importance: {}'.format(*pair))

Feature: Number words female                      Importance: 0.13
Feature: Age Lead                                 Importance: 0.12
Feature: Difference in words lead and co-lead     Importance: 0.1
Feature: Number of female actors                  Importance: 0.1
Feature: Age Co-Lead                              Importance: 0.09
Feature: Number of male actors                    Importance: 0.07
Feature: Total words                              Importance: 0.06
Feature: Number of words lead                     Importance: 0.06
Feature: Number words male                        Importance: 0.06
Feature: Mean Age Male                            Importance: 0.06
Feature: Mean Age Female                          Importance: 0.06
Feature: Year                                     Importance: 0.05
Feature: Gross                                    Importance: 0.05


In [13]:
# Build reduced feature sets without the target and the two lowest-ranked
# features ('Gross' and 'Year')
dropped_columns = ['Lead', 'Gross', 'Year']
X_train2, X_test2 = (part.drop(columns=dropped_columns) for part in (train, test))

In [14]:
# Train a second forest on the reduced feature set and measure its accuracy.
# The hyperparameters (including random_state) are copied from optimized_model rather
# than typed in again, so both models are guaranteed to share the same configuration
# and the comparison only reflects the removed features.
optimized_model2 = RandomForestClassifier(**optimized_model.get_params())
optimized_model2.fit(X_train2, y_train)
# Predict each partition once and reuse the result for the accuracy and the table
opt2_train_pred = optimized_model2.predict(X_train2)
opt2_test_pred = optimized_model2.predict(X_test2)
print('Accuracy on training data: %.4f' % np.mean(opt2_train_pred == y_train))
print('Accuracy on test data: %.4f' % np.mean(opt2_test_pred == y_test))
# Confusion table: rows are the predicted labels, columns the true 'Lead' labels
pd.crosstab(opt2_test_pred, y_test)

Accuracy on training data: 1.0000
Accuracy on test data: 0.8462


Lead,Female,Male
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,37,15
Male,33,227


Removing the two least important features does not improve the test accuracy (0.8462 versus 0.8558 with all features), so the best model remains the first optimized model (`optimized_model`).

In [15]:
# Persist the selected model to disk with joblib so it can be reloaded without retraining
joblib.dump(value=optimized_model, filename='random_forests.pkl')

['random_forests.pkl']