In [1]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
np.random.seed(2012)

# Configure visual settings:
%matplotlib inline 
plt.rcParams['figure.figsize'] = (10.0, 8.0) 
plt.style.use(['bmh'])

# Load the dataframe

data = pd.read_pickle('assets/training_data.p')

# Import lists of variable names
with open('var_names.p', 'rb') as f:
    target_variables, predictor_variables, categorical_variables, numerical_variables, text_variables, ordinal_variables = pickle.load(f)

In [2]:
# Drop 'rate' column
data.drop('rate', axis=1, inplace=True)
target_variables.remove('rate')
data.drop('index', axis=1, inplace=True)

In [3]:
assert data.isnull().sum().sum() == 0

In [4]:
data.reset_index(drop=True, inplace=True)

In [5]:
X = data.drop(target_variables, axis=1, inplace=False).select_dtypes(include=[np.number])

In [6]:
X.iloc[0,10:]

loc1.elevatorType_Medium           0.00
loc1.elevatorType_Small            0.00
loc1.parkingType_Driveway          0.00
loc1.parkingType_Loading Dock      1.00
loc1.parkingType_Parking Lot       0.00
loc1.parkingType_Street            0.00
loc1.type_Apartment                1.00
loc1.type_House                    0.00
loc1.type_Storage                  0.00
loc2.elevatorType_Large            0.00
loc2.elevatorType_Medium           0.00
loc2.elevatorType_Small            0.00
loc2.parkingType_Driveway          1.00
loc2.parkingType_Loading Dock      0.00
loc2.parkingType_Parking Lot       0.00
loc2.parkingType_Street            0.00
loc2.type_Apartment                0.00
loc2.type_House                    1.00
loc2.type_Storage                  0.00
furniture_polarity                 0.00
furniture_subjectivity             0.00
reference_polarity                 0.00
reference_subjectivity             0.00
mention_polarity                  -0.20
mention_subjectivity               0.55


In [7]:
y = [1 if x == 'Big' else 0 for x in data['truck_type']]

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X,y)

In [10]:
# Begin modeling!
from sklearn.ensemble import RandomForestClassifier

In [11]:
forest = RandomForestClassifier(max_depth=300)
forest.fit(X_train, y_train)
forest.score(X_val, y_val)

0.77236116998728277

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
# param_grid = {'n_estimators':[750, 1000],'max_depth':[500]}
# param_grid = {'class_weight' :['balanced',None]} # No class weighting is best.


# forest = RandomForestClassifier(max_depth=500, n_estimators = 1000)

# grid_forest = GridSearchCV(forest, param_grid)

# grid_forest.fit(X_train, y_train)

# grid_forest.best_score_ # Not too bad given how many big truck jobs can do small truck jobs.

In [14]:
forest = RandomForestClassifier(class_weight=None, max_depth=500, n_estimators=1000)

forest.fit(X_train, y_train)
forest.score(X_val, y_val)

0.80796947859262402

In [15]:
from sklearn.linear_model import LogisticRegressionCV

In [16]:
logreg = LogisticRegressionCV()
logreg.fit(X_train, y_train)
logreg.score(X_val, y_val)

0.79694785926239931

In [17]:
from sklearn.neighbors import KNeighborsClassifier

In [18]:
nn = KNeighborsClassifier()

grid_nn = GridSearchCV(nn, param_grid={'n_neighbors':[3,5,10,15]})



grid_nn.fit(X_train, y_train)
grid_nn.best_score_

0.7882685512367491

In [19]:
# Random Forest does the best!

In [20]:
from sklearn.metrics import confusion_matrix, roc_auc_score

In [21]:
roc_auc_score(y_val, forest.predict_proba(X_val)[:,1])

0.8531718765041143

In [22]:
CM = confusion_matrix(y_val, forest.predict(X_val))

TN = CM[0][0]
TP = CM[1][1]
FN = CM[1][0]
FP = CM[0][1]

In [23]:
# True negative rate
TN / (TN + FP) # 90.7% of the time, my model predicts a small truck job to be an actual small truck job

0.90751086281812543

In [27]:
import pickle
with open('models/small_or_big_model.p', 'wb') as f:
    pickle.dump(forest, f)

In [28]:
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), forest.feature_importances_), X_train.columns), 
             reverse=True))

Features sorted by their score:
[(0.098299999999999998, 'loc1.sqFt'), (0.067400000000000002, 'loc2.sqFt'), (0.063899999999999998, 'boxes'), (0.040599999999999997, 'furniture_list_length'), (0.0293, 'num_noun_phrases'), (0.026200000000000001, 'driving_distance'), (0.024, 'chair'), (0.022599999999999999, 'loc2.type_House'), (0.021399999999999999, 'table'), (0.020199999999999999, 'furniture_polarity'), (0.0189, 'furniture_subjectivity'), (0.018499999999999999, 'loc2.type_Apartment'), (0.015900000000000001, 'loc1.type_House'), (0.015299999999999999, 'mention_polarity'), (0.0149, 'dining'), (0.014500000000000001, 'help_packing'), (0.0137, 'mention_subjectivity'), (0.012999999999999999, 'crib'), (0.012800000000000001, 'loc2.lengthOfWalkOptID'), (0.012800000000000001, 'bed'), (0.012500000000000001, 'loc1.lengthOfWalkOptID'), (0.0101, 'large'), (0.0097000000000000003, 'loc2.stairs'), (0.0092999999999999992, 'room'), (0.0088000000000000005, 'cabinet'), (0.0085000000000000006, 'mattress'), (0.00