In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import tree
from sklearn.tree import export_text

In [3]:
ps6 = pd.read_csv(r'biden.csv')

In [6]:
ps6.describe()

Unnamed: 0,biden,female,age,educ,dem,rep
count,1807.0,1807.0,1807.0,1807.0,1807.0,1807.0
mean,62.163807,0.55285,47.535141,13.360266,0.431655,0.205313
std,23.462034,0.497337,16.887444,2.440257,0.495444,0.404042
min,0.0,0.0,18.0,0.0,0.0,0.0
25%,50.0,0.0,34.0,12.0,0.0,0.0
50%,60.0,1.0,47.0,13.0,0.0,0.0
75%,85.0,1.0,59.5,16.0,1.0,0.0
max,100.0,1.0,93.0,17.0,1.0,1.0


In [7]:
# Problem 1
X = ps6.drop(ps6.columns[0],axis=1)
y = ps6['biden']

In [13]:
# Split the full dataset into training and test data (7:3)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25)

In [23]:
# Fit a decision tree on the train data
dtr = tree.DecisionTreeRegressor(max_depth=3, min_samples_leaf=5)
dtr = dtr.fit(X_train, y_train)

In [18]:
# Predict the data using test dataset
y_pred = dtr.predict(X_test)

In [25]:
# print out text representation of the tree
X_var_names = ['female', 'age', 'educ', 'dem', 'rep']
r = export_text(dtr, feature_names = X_var_names)
print(r)

|--- dem <= 0.50
|   |--- rep <= 0.50
|   |   |--- female <= 0.50
|   |   |   |--- value: [56.49]
|   |   |--- female >  0.50
|   |   |   |--- value: [61.27]
|   |--- rep >  0.50
|   |   |--- female <= 0.50
|   |   |   |--- value: [38.33]
|   |   |--- female >  0.50
|   |   |   |--- value: [44.89]
|--- dem >  0.50
|   |--- age <= 54.50
|   |   |--- educ <= 15.50
|   |   |   |--- value: [71.11]
|   |   |--- educ >  15.50
|   |   |   |--- value: [76.59]
|   |--- age >  54.50
|   |   |--- female <= 0.50
|   |   |   |--- value: [75.19]
|   |   |--- female >  0.50
|   |   |   |--- value: [80.29]



In [33]:
# MSE
MSE = mean_squared_error(y_test, dtr.predict(X_test))
print('The MSE of a tree of depth ', dtr.get_depth(), ' and ',
      dtr.get_n_leaves(), ' leaves = ', MSE)

The MSE of a tree of depth  3  and  8  leaves =  396.1937146321307


In [35]:
# Problem 2
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
param_dist = {"max_depth": [3, 10], 
              "min_samples_split": sp_randint(2, 20),
              "min_samples_leaf": sp_randint(2, 20)}

In [47]:
grid = RandomizedSearchCV(dtr, param_dist, cv = 5, scoring = 'neg_mean_squared_error', 
                          n_iter = 100, n_jobs = -1, random_state = 25)

In [48]:
grid.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=DecisionTreeRegressor(max_depth=3,
                                                   min_samples_leaf=5),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': [3, 10],
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x118e6fc10>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x118e6f1c0>},
                   random_state=25, scoring='neg_mean_squared_error')

In [50]:
best_params = grid.best_params_
best_scores = grid.best_score_
print(best_params)
print(best_scores)

{'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 9}
-404.70994082761746


In [51]:
# Problem 3
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()

In [52]:
param_dist = {"n_estimators": [10, 200], 
              "max_depth": [3, 10],
              "min_samples_split": sp_randint(2, 20), 
              "min_samples_leaf": sp_randint(2, 20),
              "max_features": sp_randint(1, 5)}


In [53]:
grid2 = RandomizedSearchCV(rfr, param_dist, cv = 5, scoring = 'neg_mean_squared_error', 
                          n_iter = 100, n_jobs = -1, random_state = 25)

grid2.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'max_depth': [3, 10],
                                        'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x122d65220>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x122d78370>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x122d78760>,
                                        'n_estimators': [10, 200]},
                   random_state=25, scoring='neg_mean_squared_error')

In [57]:
best_params = grid2.best_params_
best_score = grid2.best_score_
print(best_params)
print(best_score)

{'max_depth': 10, 'max_features': 2, 'min_samples_leaf': 19, 'min_samples_split': 11, 'n_estimators': 200}
-397.29377597944983
