In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns


from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsRegressor

# Data Import


In [2]:
# Read the competition files; the `id` column becomes the row index of each frame.
train = pd.read_csv('./data/train.csv', index_col='id')
test = pd.read_csv('./data/test.csv', index_col='id')

# Pull the response out of `train` (in place) and keep it as a one-column frame,
# so that `train` holds only predictor columns from here on.
target = train.pop('target').to_frame()
features = train.columns


# Standardise the predictors: the scaler is fitted on the training rows only and
# then applied unchanged to the test rows. Results are wrapped back into frames
# that keep the original ids and column names.
ss = StandardScaler()
train_ss = pd.DataFrame(ss.fit_transform(train), index=train.index, columns=features)
test_ss = pd.DataFrame(ss.transform(test), index=test.index, columns=features)



# Degree-2 polynomial expansion (squares and pairwise interactions, no bias
# column) followed by standardisation. Both transformers are fitted on the
# training rows only and then applied to the test rows.
poly = PolynomialFeatures(degree=2, include_bias=False)
ss = StandardScaler()

train_poly = ss.fit_transform(poly.fit_transform(train))
test_poly = ss.transform(poly.transform(test))

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; prefer the new method and fall back to the old one so
# the cell runs on both old and current releases.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(features)
else:
    poly_feature_names = poly.get_feature_names(features)

# Interaction terms are named like "a b"; swap the space for an underscore.
poly_columns = pd.Index(poly_feature_names).str.replace(' ', '_')

train_poly = pd.DataFrame(train_poly, index=train.index, columns=poly_columns)
test_poly = pd.DataFrame(test_poly, index=test.index, columns=poly_columns)


# Smaller datasets for time-efficient experimenting: draw a seeded 25% sample of
# the scaled training rows, then keep the same ids from the polynomial frame and
# the target. All three are sorted by id so their rows line up.
train_ss_sub = train_ss.sample(frac=0.25, axis=0, random_state=42).sort_index()
train_poly_sub = train_poly[train_poly.index.isin(train_ss_sub.index)].sort_index()
target_sub = target[target.index.isin(train_ss_sub.index)].sort_index()

In [21]:
# One seeded 60/40 split applied to the scaled features, the polynomial features
# and the target together. The shuffle depends only on the row count and the
# seed, so splitting the three aligned frames in a single call yields the same
# partition as splitting each feature frame separately with the same seed.
(X_train, X_test,
 X_train_poly, X_test_poly,
 y_train, y_test) = train_test_split(train_ss_sub,
                                     train_poly_sub,
                                     target_sub,
                                     test_size=0.4,
                                     random_state=42)


# Create Cluster-Based Model

In [6]:
# Inertia and silhouette score for k = 2..15 on the training split. The sweep is
# cached on disk and only recomputed when the cache file is missing.
kmeans_sil_scores_file = './summaries/sample_cluster_scores.csv'
if not os.path.isfile(kmeans_sil_scores_file):
    sil_scores = []
    for k in range(2, 16):
        # Seeded like every other KMeans fit in this notebook so the cached
        # numbers can be reproduced.
        km_cl = KMeans(n_clusters=k, random_state=42)
        km_cl.fit(X_train)
        sil_scores.append({
            'k': k,
            'inertia': km_cl.inertia_,
            'silhouette': silhouette_score(X_train, km_cl.labels_),
        })
    sil_scores_df = pd.DataFrame(sil_scores, columns=['k', 'inertia', 'silhouette'])
    # Make sure the output folder exists before writing the cache.
    os.makedirs(os.path.dirname(kmeans_sil_scores_file), exist_ok=True)
    sil_scores_df.to_csv(kmeans_sil_scores_file, index=False)
else:
    # Load `k` as an ordinary column so the frame has the same layout whether it
    # was just computed or read back from the cache.
    sil_scores_df = pd.read_csv(kmeans_sil_scores_file)

round(sil_scores_df.head(15), 4)

Unnamed: 0,k,inertia,silhouette
0,2,439223.8537,0.2683
1,3,384355.4894,0.1997
2,4,355524.1129,0.1872
3,5,333715.6734,0.1502
4,6,319824.8825,0.1416
5,7,308185.9343,0.1346
6,8,298675.3045,0.1345
7,9,289811.1226,0.1177
8,10,281841.681,0.1152
9,11,274564.8544,0.114


### Cluster Average Test

In [8]:
# Baseline: predict every row with the mean target of its KMeans cluster.
km_clus = KMeans(n_clusters=9, random_state=42).fit(X_train)

# Label each training target with its cluster, then average per cluster.
y_clus = y_train.assign(cluster=km_clus.labels_)
cluster_means = (
    y_clus.groupby('cluster')['target']
    .mean()
    .rename('cl_mean')
    .reset_index()
)

# Training error of the per-cluster mean.
y_clus = y_clus.merge(cluster_means, how='left', on='cluster')
clus_train_rmse = np.sqrt(mean_squared_error(y_clus['target'], y_clus['cl_mean']))
print('Cluster Average Train RMSE: {:.4f}'.format(clus_train_rmse))

# Assign hold-out rows to the fitted clusters and look up each cluster's mean.
# The merge discards the id index, so it is restored afterwards.
y_hat_cluster = pd.DataFrame({'cluster': km_clus.predict(X_test)}, index=X_test.index)
y_hat_cluster = y_hat_cluster.merge(cluster_means, how='left', on='cluster')
y_hat_cluster.index = X_test.index

clus_test_rmse = np.sqrt(mean_squared_error(y_test, y_hat_cluster['cl_mean']))
print('Cluster Average Test RMSE: {:.4f}'.format(clus_test_rmse))

Cluster Average Train RMSE: 0.7281
Cluster Average Test RMSE: 0.7264


### Pure Cluster Average Submission

In [13]:
# Same per-cluster-mean baseline, now fitted on the full scaled training set
# (9 clusters) and used to produce a submission file.
km_clus = KMeans(n_clusters=9, random_state=42).fit(train_ss)

# Mean target per cluster over all training rows.
y_clus = target.assign(cluster=km_clus.labels_)
cluster_means = (
    y_clus.groupby('cluster')['target']
    .mean()
    .rename('cl_mean')
    .reset_index()
)

y_clus = y_clus.merge(cluster_means, how='left', on='cluster')
clus_train_rmse = np.sqrt(mean_squared_error(y_clus['target'], y_clus['cl_mean']))
print('Cluster Average Train RMSE: {:.4f}'.format(clus_train_rmse))

# Predict a cluster for every test row, look up that cluster's mean, and keep
# only the prediction column, indexed by the test ids.
y_sub_cluster = pd.DataFrame({'cluster': km_clus.predict(test_ss)}, index=test_ss.index)
y_sub_cluster = y_sub_cluster.merge(cluster_means, how='left', on='cluster')
y_sub_cluster = (
    y_sub_cluster
    .set_index(test_ss.index)
    .drop('cluster', axis=1)
    .rename(columns={'cl_mean': 'target'})
)
y_sub_cluster.to_csv('./submissions/cluster_sub.csv')


Cluster Average Train RMSE: 0.7298


### Expanded Cluster Average Submission

In [14]:
# Per-cluster-mean baseline with a finer partition (20 clusters), fitted on the
# full scaled training set and written out as a second submission.
km_clus = KMeans(n_clusters=20, random_state=42).fit(train_ss)

# Mean target per cluster over all training rows.
y_clus = target.assign(cluster=km_clus.labels_)
cluster_means = (
    y_clus.groupby('cluster')['target']
    .mean()
    .rename('cl_mean')
    .reset_index()
)

y_clus = y_clus.merge(cluster_means, how='left', on='cluster')
clus_train_rmse = np.sqrt(mean_squared_error(y_clus['target'], y_clus['cl_mean']))
print('Cluster Average Train RMSE: {:.4f}'.format(clus_train_rmse))

# Predict a cluster for every test row, look up that cluster's mean, and keep
# only the prediction column, indexed by the test ids.
y_sub_cluster = pd.DataFrame({'cluster': km_clus.predict(test_ss)}, index=test_ss.index)
y_sub_cluster = y_sub_cluster.merge(cluster_means, how='left', on='cluster')
y_sub_cluster = (
    y_sub_cluster
    .set_index(test_ss.index)
    .drop('cluster', axis=1)
    .rename(columns={'cl_mean': 'target'})
)
y_sub_cluster.to_csv('./submissions/cluster_sub2.csv')


Cluster Average Train RMSE: 0.7269


# Incorporate Cluster as OLS Feature

In [15]:
# Reference model: ordinary least squares on the polynomial features alone,
# scored on the hold-out split.
ols_poly = LinearRegression().fit(X_train_poly, y_train)

y_hat_ols_poly = ols_poly.predict(X_test_poly)
rmse_ols_poly = np.sqrt(mean_squared_error(y_hat_ols_poly, y_test))

print('OLS with Poly Terms Test RMSE: {:.4f}'.format(rmse_ols_poly))


OLS with Poly Terms Test RMSE: 0.7167


In [32]:
# OLS on polynomial features plus one-hot KMeans cluster membership (k = 5).
km_clus = KMeans(n_clusters=5, random_state=42).fit(X_train)

# One-hot encode the training cluster labels; columns are named clus_1..clus_k.
train_cluster_labels = km_clus.labels_.reshape(-1, 1)
enc = OneHotEncoder(handle_unknown='ignore').fit(train_cluster_labels)
clus_enc = enc.transform(train_cluster_labels)
clus_ec_df = pd.DataFrame.sparse.from_spmatrix(
    clus_enc,
    index=X_train.index,
    columns=['clus_' + str(c + 1) for c in range(clus_enc.shape[1])],
)

# Join the dummies onto the polynomial features by id; one dummy (clus_1) is
# dropped, presumably to serve as the reference level.
X_train_poly_clus = (
    X_train_poly
    .merge(clus_ec_df, left_index=True, right_index=True)
    .drop(columns=['clus_1'])
)

# Same encoding for the hold-out rows, using the clusters predicted by the
# already-fitted KMeans and the already-fitted encoder.
test_cluster_labels = km_clus.predict(X_test).reshape(-1, 1)
clus_ec_test = enc.transform(test_cluster_labels)
clus_ec_test = pd.DataFrame.sparse.from_spmatrix(
    clus_ec_test,
    index=X_test.index,
    columns=['clus_' + str(c + 1) for c in range(clus_ec_test.shape[1])],
)

X_test_poly_clus = (
    X_test_poly
    .merge(clus_ec_test, left_index=True, right_index=True)
    .drop(columns=['clus_1'])
)

ols_poly_clus = LinearRegression().fit(X_train_poly_clus, y_train)

y_hat_ols_poly_clus = ols_poly_clus.predict(X_test_poly_clus)
rmse_ols_poly_clus = np.sqrt(mean_squared_error(y_hat_ols_poly_clus, y_test))

print('OLS with Poly Terms and Clusters Test RMSE: {:.4f}'.format(rmse_ols_poly_clus))


OLS with Poly Terms and Clusters Test RMSE: 0.7168


In [31]:
# OLS on polynomial features plus one-hot KMeans cluster membership (k = 9).
km_clus = KMeans(n_clusters=9, random_state=42).fit(X_train)

# One-hot encode the training cluster labels; columns are named clus_1..clus_k.
train_cluster_labels = km_clus.labels_.reshape(-1, 1)
enc = OneHotEncoder(handle_unknown='ignore').fit(train_cluster_labels)
clus_enc = enc.transform(train_cluster_labels)
clus_ec_df = pd.DataFrame.sparse.from_spmatrix(
    clus_enc,
    index=X_train.index,
    columns=['clus_' + str(c + 1) for c in range(clus_enc.shape[1])],
)

# Join the dummies onto the polynomial features by id; one dummy (clus_1) is
# dropped, presumably to serve as the reference level.
X_train_poly_clus = (
    X_train_poly
    .merge(clus_ec_df, left_index=True, right_index=True)
    .drop(columns=['clus_1'])
)

# Same encoding for the hold-out rows, using the clusters predicted by the
# already-fitted KMeans and the already-fitted encoder.
test_cluster_labels = km_clus.predict(X_test).reshape(-1, 1)
clus_ec_test = enc.transform(test_cluster_labels)
clus_ec_test = pd.DataFrame.sparse.from_spmatrix(
    clus_ec_test,
    index=X_test.index,
    columns=['clus_' + str(c + 1) for c in range(clus_ec_test.shape[1])],
)

X_test_poly_clus = (
    X_test_poly
    .merge(clus_ec_test, left_index=True, right_index=True)
    .drop(columns=['clus_1'])
)

ols_poly_clus = LinearRegression().fit(X_train_poly_clus, y_train)

y_hat_ols_poly_clus = ols_poly_clus.predict(X_test_poly_clus)
rmse_ols_poly_clus = np.sqrt(mean_squared_error(y_hat_ols_poly_clus, y_test))

print('OLS with Poly Terms and Clusters Test RMSE: {:.4f}'.format(rmse_ols_poly_clus))



OLS with Poly Terms and Clusters Test RMSE: 0.7164


In [30]:
# OLS on polynomial features plus one-hot KMeans cluster membership (k = 20).
km_clus = KMeans(n_clusters=20, random_state=42).fit(X_train)

# One-hot encode the training cluster labels; columns are named clus_1..clus_k.
train_cluster_labels = km_clus.labels_.reshape(-1, 1)
enc = OneHotEncoder(handle_unknown='ignore').fit(train_cluster_labels)
clus_enc = enc.transform(train_cluster_labels)
clus_ec_df = pd.DataFrame.sparse.from_spmatrix(
    clus_enc,
    index=X_train.index,
    columns=['clus_' + str(c + 1) for c in range(clus_enc.shape[1])],
)

# Join the dummies onto the polynomial features by id; one dummy (clus_1) is
# dropped, presumably to serve as the reference level.
X_train_poly_clus = (
    X_train_poly
    .merge(clus_ec_df, left_index=True, right_index=True)
    .drop(columns=['clus_1'])
)

# Same encoding for the hold-out rows, using the clusters predicted by the
# already-fitted KMeans and the already-fitted encoder.
test_cluster_labels = km_clus.predict(X_test).reshape(-1, 1)
clus_ec_test = enc.transform(test_cluster_labels)
clus_ec_test = pd.DataFrame.sparse.from_spmatrix(
    clus_ec_test,
    index=X_test.index,
    columns=['clus_' + str(c + 1) for c in range(clus_ec_test.shape[1])],
)

X_test_poly_clus = (
    X_test_poly
    .merge(clus_ec_test, left_index=True, right_index=True)
    .drop(columns=['clus_1'])
)

ols_poly_clus = LinearRegression().fit(X_train_poly_clus, y_train)

y_hat_ols_poly_clus = ols_poly_clus.predict(X_test_poly_clus)
rmse_ols_poly_clus = np.sqrt(mean_squared_error(y_hat_ols_poly_clus, y_test))

print('OLS with Poly Terms and Clusters Test RMSE: {:.4f}'.format(rmse_ols_poly_clus))



OLS with Poly Terms and Clusters Test RMSE: 0.7163


### OLS with Clusters Submission

In [33]:
# Final OLS + cluster-dummy model (k = 9), fitted on the full training data and
# applied to the competition test set.
km_clus = KMeans(n_clusters=9, random_state=42).fit(train_ss)

# One-hot encode the training cluster labels; columns are named clus_1..clus_k.
train_cluster_labels = km_clus.labels_.reshape(-1, 1)
enc = OneHotEncoder(handle_unknown='ignore').fit(train_cluster_labels)
clus_enc = enc.transform(train_cluster_labels)
clus_ec_df = pd.DataFrame.sparse.from_spmatrix(
    clus_enc,
    index=train_ss.index,
    columns=['clus_' + str(c + 1) for c in range(clus_enc.shape[1])],
)

# Join the dummies onto the polynomial features by id; one dummy (clus_1) is
# dropped, presumably to serve as the reference level.
X_train_poly_clus = (
    train_poly
    .merge(clus_ec_df, left_index=True, right_index=True)
    .drop(columns=['clus_1'])
)

# Same encoding for the test rows, using the clusters predicted by the
# already-fitted KMeans and the already-fitted encoder.
test_cluster_labels = km_clus.predict(test_ss).reshape(-1, 1)
clus_ec_test = enc.transform(test_cluster_labels)
clus_ec_test = pd.DataFrame.sparse.from_spmatrix(
    clus_ec_test,
    index=test_ss.index,
    columns=['clus_' + str(c + 1) for c in range(clus_ec_test.shape[1])],
)

X_test_poly_clus = (
    test_poly
    .merge(clus_ec_test, left_index=True, right_index=True)
    .drop(columns=['clus_1'])
)

ols_poly_clus = LinearRegression().fit(X_train_poly_clus, target)

y_hat_ols_poly_clus = ols_poly_clus.predict(X_test_poly_clus)


In [35]:
# Wrap the OLS + cluster predictions in a one-column frame keyed by the test ids
# and write the submission file.
y_sub_ols_cl = pd.DataFrame(y_hat_ols_poly_clus, index=test_ss.index, columns=['target'])
y_sub_ols_cl.to_csv('./submissions/ols_cl9_sub.csv')


# KNN Regressor

In [41]:
# 10-fold grid search over the neighbour count (11, 61, ..., 361) for a KNN
# regressor on the scaled training split, scored by negative MSE.
knn_params = {'n_neighbors': range(11, 362, 50)}

knn_gridsearcher = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=knn_params,
    scoring='neg_mean_squared_error',
    cv=10,
    verbose=1,
)
knn_gridsearcher.fit(X_train, y_train)


Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed: 10.8min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': range(11, 362, 50)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=1)

In [44]:
# Score the refitted best KNN model on the hold-out split: print the selected
# parameters and show the RMSE as the cell's output.
yhat_knn = knn_gridsearcher.predict(X_test)
print(knn_gridsearcher.best_params_)

knn_test_mse = mean_squared_error(y_test, yhat_knn)
np.sqrt(knn_test_mse)

{'n_neighbors': 111}


0.7158564465376754

### KNN Submission

In [47]:
# Refit KNN with 111 neighbours (the grid-search winner) on the full scaled
# training set, predict the test set, and write the submission keyed by test id.
knn = KNeighborsRegressor(n_neighbors=111).fit(train_ss, target)

yhat_sub_knn = pd.DataFrame(knn.predict(test_ss), index=test_ss.index, columns=['target'])
yhat_sub_knn.to_csv('./submissions/knn111_sub.csv')
