# 4. Preliminary Modelling


In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import sklearn



## Create Train-Test Splits

In [2]:
feat_df = pd.read_pickle('../data/features_df.pkl')
meta_df = pd.read_csv('../data/speechdetails.csv')

In [3]:
#Removes all the text that was in before
feat_df = feat_df.select_dtypes(exclude=['object'])

In [4]:
X = feat_df.values
y = meta_df['IC'].values

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                                    test_size=0.2,
                                                    random_state=11)

## XGBOOST

In [7]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [8]:
xg_reg.fit(X_train, y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=10, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [None]:
preds = xg_reg.predict(X_test)

In [None]:
for i in range(len(X_test)):
    print('Prediction',preds[i],'\t','True',y_test[i])

Prediction 1.4418921 	 True 2.333
Prediction 1.2277672 	 True 1.75
Prediction 1.4418921 	 True 2.1
Prediction 1.4089674 	 True 1.6
Prediction 1.399816 	 True 2.5
Prediction 1.3489909 	 True 1.9
Prediction 1.3177259 	 True 1.625
Prediction 1.3786222 	 True 1.5
Prediction 1.4089674 	 True 1.4
Prediction 1.3456593 	 True 1.2
Prediction 1.4372418 	 True 1.875
Prediction 1.1730297 	 True 1.8
Prediction 1.365099 	 True 2.2
Prediction 1.4418921 	 True 1.875
Prediction 1.3140495 	 True 1.5
Prediction 1.4418921 	 True 1.6
Prediction 1.4418921 	 True 2.25
Prediction 1.4418921 	 True 2.0
Prediction 1.4418921 	 True 2.0
Prediction 1.4089674 	 True 1.0
Prediction 1.2490172 	 True 1.3
Prediction 1.4089674 	 True 1.75
Prediction 1.3861767 	 True 1.625
Prediction 1.2954475 	 True 1.5
Prediction 1.4290214 	 True 1.75
Prediction 1.365099 	 True 1.7
Prediction 1.4418921 	 True 2.1
Prediction 1.2277672 	 True 2.0
Prediction 1.3999717 	 True 1.8
Prediction 1.3462541 	 True 2.1


In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.527372


### Try with K-fold Cross Validation

In [None]:
data_dmatrix = xgb.DMatrix(data=X,label=y)

In [None]:
params = {"objective":"reg:linear",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)

In [None]:
cv_results

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,1.222118,0.039675,1.220663,0.087276
1,1.124399,0.036571,1.124137,0.089193
2,1.037226,0.033902,1.037717,0.093476
3,0.9597,0.031666,0.96071,0.097018
4,0.89093,0.029815,0.892631,0.1
5,0.829778,0.028143,0.832865,0.100951
6,0.775398,0.027006,0.780085,0.102865
7,0.727083,0.025794,0.733415,0.104681
8,0.685052,0.025115,0.692368,0.10585
9,0.647245,0.02485,0.655065,0.105256


In [None]:
print((cv_results["test-rmse-mean"]).tail(1))

49    0.41687
Name: test-rmse-mean, dtype: float64


### Visualize Boosting Trees and Feature Importance
We can visualize individual trees from the fully boosted model that XGBoost creates using the entire dataset.

In [None]:
import matplotlib.pyplot as plt

In [None]:
model = xgb.XGBRegressor()
model.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [None]:
print(model.feature_importances_)

[0.01383704 0.04233427 0.04702509 0.04291662 0.         0.
 0.03668954 0.06519893 0.         0.05106923 0.03429625 0.03446979
 0.02683082 0.04484382 0.07273355 0.06962946 0.         0.
 0.         0.         0.         0.         0.03727857 0.09618485
 0.07310277 0.07819398 0.04760987 0.08575562]


## Neural Net

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense


Using TensorFlow backend.


In [None]:
len(X[0])

28

In [None]:
model = Sequential()
model.add(Dense(16, input_dim=len(X[0]), activation='relu'))
model.add(Dense(4, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(loss='mse', optimizer='adam')
model.fit(X, y, epochs=10, verbose=0)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
