In [1]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
np.random.seed(2012)

# Configure visual settings:
%matplotlib inline 
plt.rcParams['figure.figsize'] = (10.0, 8.0) 
plt.style.use(['bmh'])

# Load the dataframe

data = pd.read_pickle('assets/training_data.p')

# Import lists of variable names
with open('var_names.p', 'rb') as f:
    target_variables, predictor_variables, categorical_variables, numerical_variables, text_variables, ordinal_variables = pickle.load(f)

In [2]:
# Drop 'rate' column
data.drop('rate', axis=1, inplace=True)
target_variables.remove('rate')
data.drop('index', axis=1, inplace=True)

assert data.isnull().sum().sum() == 0

data.reset_index(drop=True, inplace=True)

In [3]:
X = data.drop(target_variables, axis=1, inplace=False).select_dtypes(include=[np.number])

In [4]:
data['deviation'] = data['cc_hours'] - data['est_hours']

In [5]:
y = [1 if x > 0  else 0 for x in data['deviation']]

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X,y)

In [8]:
# Begin modeling!
from sklearn.ensemble import RandomForestClassifier

In [9]:
forest = RandomForestClassifier(max_depth=300)
forest.fit(X_train, y_train)
forest.score(X_val, y_val)

0.65112335735481142

In [10]:
from sklearn.model_selection import GridSearchCV

In [11]:
# param_grid = {'n_estimators':[500, 750]}
# param_grid = {'class_weight' :['balanced',None]}

# forest = RandomForestClassifier(max_features=None, max_depth=300, n_estimators=500, class_weight='balanced')

# grid_forest = GridSearchCV(forest, param_grid)

# grid_forest.fit(X_train, y_train)

In [12]:
forest = RandomForestClassifier(max_features=None, max_depth=300, n_estimators=500, class_weight='balanced')

forest.fit(X_train, y_train, )
forest.score(X_val, y_val)

0.68673166596015256

In [13]:
from sklearn.metrics import confusion_matrix, roc_auc_score
CM = confusion_matrix(y_val, forest.predict(X_val))

TN = CM[0][0]
TP = CM[1][1]
FN = CM[1][0]
FP = CM[0][1]

In [14]:
CM

array([[1401,  147],
       [ 592,  219]], dtype=int64)

In [15]:
TP / (TP + FN)

0.27003699136868065

In [16]:
np.mean(y)

0.33898664405342377

In [17]:
roc_auc_score(y_val, forest.predict_proba(X_val)[:,1])

0.69582962941721871

In [18]:
import pickle
with open('models/over_under_model.p', 'wb') as f:
    pickle.dump(forest, f)

In [21]:
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), forest.feature_importances_), X_train.columns), 
             reverse=True))

Features sorted by their score:
[(0.11609999999999999, 'driving_distance'), (0.0591, 'loc2.sqFt'), (0.048899999999999999, 'loc1.sqFt'), (0.0465, 'furniture_list_length'), (0.033799999999999997, 'furniture_polarity'), (0.032399999999999998, 'furniture_subjectivity'), (0.027, 'mention_polarity'), (0.025100000000000001, 'boxes'), (0.024199999999999999, 'num_noun_phrases'), (0.022200000000000001, 'mention_subjectivity'), (0.0183, 'loc1.lengthOfWalkOptID'), (0.0178, 'loc2.lengthOfWalkOptID'), (0.016500000000000001, 'table'), (0.0121, 'chair'), (0.0111, 'help_packing'), (0.010200000000000001, 'small'), (0.0099000000000000008, 'loc2.stairs'), (0.0095999999999999992, 'loc1.stairs'), (0.0088999999999999999, 'reference_polarity'), (0.0088000000000000005, 'mattress'), (0.0086, 'reference_subjectivity'), (0.0085000000000000006, 'bed'), (0.0080000000000000002, 'tv'), (0.0074999999999999997, 'loc2.parkingType_Street'), (0.0074999999999999997, 'dresser'), (0.0074000000000000003, 'large'), (0.00740000