In [1]:
get_ipython().run_line_magic('matplotlib', 'inline')
import os
import importlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import problem
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, FunctionTransformer
from sklearn.decomposition import PCA
import geopy.distance
from sklearn.metrics import mean_squared_error
from model_selection_python import *
import statsmodels.api as sm
import seaborn as sns
from sklearn.model_selection import GridSearchCV

pd.set_option('display.max_columns', None)

In [None]:
def get_problem_data(get_dummies=True, data_dir='../data'):
    """Load the train/test CSV splits and prepare the feature frames.

    Reads ``X_train.csv``, ``X_test.csv``, ``y_train.csv`` and ``y_test.csv``
    from ``data_dir``, drops the ``DateOfDeparture``, ``state_arrival`` and
    ``state_departure`` columns from both feature frames and, optionally,
    one-hot encodes the remaining categorical columns.

    Args:
        get_dummies (bool, optional): When True, one-hot encode the
            categorical feature columns (first level of each column dropped).
            The test frame is always given exactly the columns of the train
            frame. Defaults to True.
        data_dir (str, optional): Directory holding the four CSV files.
            Defaults to '../data'.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` as DataFrames.
    """
    dropped_columns = ['DateOfDeparture', 'state_arrival', 'state_departure']

    X_train = pd.read_csv(os.path.join(data_dir, 'X_train.csv'))
    X_test = pd.read_csv(os.path.join(data_dir, 'X_test.csv'))
    y_train = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))
    y_test = pd.read_csv(os.path.join(data_dir, 'y_test.csv'))

    X_train = X_train.drop(columns=dropped_columns)
    X_test = X_test.drop(columns=dropped_columns)

    if get_dummies:
        raw_columns = set(X_train.columns)
        X_train = pd.get_dummies(X_train, drop_first=True)

        # Encoding the two splits independently with drop_first=True yields
        # different columns (and a different dropped baseline) as soon as the
        # splits do not contain the same categories. Encode the test split
        # in full and project it onto the train columns instead: levels that
        # train never saw are discarded, levels missing from test become 0.
        X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

        # Columns created by the reindex are integer zeros; give every dummy
        # column the dtype it has in the train frame.
        dummy_columns = [col for col in X_train.columns if col not in raw_columns]
        X_test = X_test.astype({col: X_train[col].dtype for col in dummy_columns})

    return X_train, y_train, X_test, y_test

In [None]:
# Load the raw (not one-hot encoded) splits and report their sizes.
# NOTE(review): with get_dummies=False any categorical columns stay as-is;
# the KNN / random-forest / forward-selection cells further down feed X_train
# to numeric-only code -- confirm whether get_dummies=True was intended.
X_train, y_train, X_test, y_test = get_problem_data(get_dummies=False)
print("X_train = ", X_train.shape)
print("y_train = ", y_train.shape)
print("X_test = ", X_test.shape)
print("y_test = ", y_test.shape)

# DataFrame.info() writes its report to stdout and returns None, so wrapping
# it in print() only appended a stray "None" after each report.
X_train.info()
y_train.info()

In [None]:
# Features and target side by side in a fresh frame (X_train itself is left
# untouched); the target is stored under the 'Passengers' column.
Xy_train = X_train.assign(Passengers=y_train)

In [None]:
# Distribution of the training target. The figure is labelled so it can be
# read on its own, and the trailing ';' keeps the raw (counts, bins, patches)
# return value out of the cell output.
fig, ax = plt.subplots()
ax.hist(y_train, bins=20)
ax.set(title='Distribution of the training target', xlabel='Passengers', ylabel='Count');

In [None]:
# Absolute pairwise correlations between the features and the target.
# The frame is restricted to numeric/bool columns first: DataFrame.corr()
# raises on string columns in pandas >= 2.0 (older releases dropped them
# silently), and the frame may still hold such columns when the data was
# loaded with get_dummies=False.
plt.figure(figsize=(30, 30))
sns.heatmap(
    Xy_train.select_dtypes(include=['number', 'bool']).corr().abs(),
    cmap='BrBG', annot=True, vmin=-1, vmax=1,
);

In [None]:
# Correlation of every numeric feature with the target, strongest first.
# Non-numeric columns are excluded explicitly because DataFrame.corr() raises
# on string columns in pandas >= 2.0.
plt.figure(figsize=(8, 15))
target_corr = (
    Xy_train.select_dtypes(include=['number', 'bool'])
    .corr()[['Passengers']]
    .sort_values(by='Passengers', ascending=False)
)
heatmap = sns.heatmap(target_corr, vmin=-1, vmax=1, annot=True, cmap='BrBG')
# The plotted column is 'Passengers'; the previous title ("Sales Price") did
# not describe this data.
heatmap.set_title('Features Correlating with Passengers', fontdict={'fontsize':18}, pad=16);

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Exhaustive 3-fold search over neighbourhood size (1..199), neighbour
# weighting and distance metric for a k-nearest-neighbours regressor.
grid_params = dict(
    n_neighbors=range(1, 200),
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

gs = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=grid_params,
    cv=3,
    n_jobs=-1,  # use every available core
    verbose=1,
)

gs_results = gs.fit(X_train, y_train)

In [None]:
# Best cross-validated score, the refitted estimator and its parameters.
for result_attr in ('best_score_', 'best_estimator_', 'best_params_'):
    print(getattr(gs_results, result_attr))

In [None]:
# X_train, y_train = problem.get_train_data('..')

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 3-fold grid search over a scaled random-forest regressor.
grid_params = {
    'randomforestregressor__n_estimators': [50, 100],
    # None means "consider all features", which is what 'auto' meant for a
    # RandomForestRegressor; 'auto' itself is rejected by recent scikit-learn.
    'randomforestregressor__max_features': [None, 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [4, 6, 8],
    # The criterion is left at its default (mean squared error): the old
    # spelling 'mse' is rejected by recent scikit-learn and the new one
    # ('squared_error') by old releases, while the default works everywhere.
}

# Fixed seed so the search gives the same result on every run.
pipe = make_pipeline(StandardScaler(with_mean=False), RandomForestRegressor(random_state=42))

gs = GridSearchCV(
    pipe,
    grid_params,
    verbose=1,
    cv=3,
    n_jobs=-1
)

# y_train is a single-column DataFrame here (it is stored as one column in
# the correlation frame above); the forest expects a 1-D target and warns on
# a column vector, so flatten it.
gs_results = gs.fit(X_train, np.ravel(y_train))

In [None]:
# Best cross-validated score, the refitted pipeline and its parameters.
for result_attr in ('best_score_', 'best_estimator_', 'best_params_'):
    print(getattr(gs_results, result_attr))

## Feature Importance

In [None]:
# from sklearn.inspection import permutation_importance

# feature_importances = permutation_importance(
#     pipe, X_train, y_train, n_repeats=10
# )
# sorted_idx = feature_importances.importances_mean.argsort()

In [None]:
# fig, ax = plt.subplots(figsize=(15, 15))
# ax.boxplot(feature_importances.importances[sorted_idx].T,
#            vert=False, labels=X_train.columns[sorted_idx])
# ax.set_title("Permutation Importances (train set)")
# fig.tight_layout()
# plt.show()

In [None]:
# feature_importances = permutation_importance(
#     pipe, X_test, y_test, n_repeats=10
# )
# sorted_idx = feature_importances.importances_mean.argsort()

In [None]:
# fig, ax = plt.subplots(figsize=(15, 15))
# ax.boxplot(feature_importances.importances[sorted_idx].T,
#            vert=False, labels=X_test.columns[sorted_idx])
# ax.set_title("Permutation Importances (test set)")
# fig.tight_layout()
# plt.show()

## Predictor Selection

In [None]:
# Build the design matrix (features plus a constant column) under its own
# name. Rebinding X_train to it left the train frame with one column more than
# X_test, which breaks the later cell that fits on X_train and predicts on
# X_test.
X_train_const = sm.add_constant(X_train)

# info() prints its own report and returns None, so it is not wrapped in print().
X_train_const.info()
print("X as array:", np.asarray(X_train_const))
print("y as array:", np.asarray(y_train))
model_both = forwardSelection(X_train_const.astype(float), y_train.astype(float))

In [None]:
def plot_pca(data, index, n_components=2):
    """Fit a PCA on ``data`` and plot the samples in component space.

    A 2-D scatter is drawn when ``n_components`` is 2 and a labelled 3-D
    scatter when it is 3; for any other value only the title is drawn. A
    histogram of the per-component explained-variance percentages is shown
    afterwards in a second figure.

    Args:
        data (np.array): Original scaled data as numpy array
                         (n samples, d features)
        index (list): list of strings to label samples
        n_components (int, optional): Number of Principal Components to keep.
                                      Defaults to 2.
    """
    model = PCA(n_components=n_components)
    projected = model.fit_transform(data)
    # Share of variance explained by each component, in percent.
    per_var = np.round(model.explained_variance_ratio_ * 100, decimals=1)

    # Axis labels / column names such as "PC1 : 42.3".
    labels = [
        "PC{} : {}".format(rank, str(share))
        for rank, share in enumerate(per_var, start=1)
    ]

    pca_df = pd.DataFrame(projected, index=index, columns=labels)

    if n_components == 2:
        x_label, y_label = labels[0], labels[1]
        plt.scatter(pca_df[x_label], pca_df[y_label], alpha=0.1)
        plt.xlabel(x_label)
        plt.ylabel(y_label)

    elif n_components == 3:
        x_label, y_label, z_label = labels[0], labels[1], labels[2]
        ax = plt.figure().add_subplot(111, projection='3d')
        ax.scatter(pca_df[x_label], pca_df[y_label], pca_df[z_label])
        ax.set_xlabel(x_label, labelpad=20)
        ax.set_ylabel(y_label, labelpad=20)
        ax.set_zlabel(z_label, labelpad=20)
        # Write each sample's label next to its point.
        for sample in pca_df.index:
            ax.text(
                pca_df.loc[sample, x_label],
                pca_df.loc[sample, y_label],
                pca_df.loc[sample, z_label],
                '%s' % sample, size=20, color='k', rotation=50,
            )

    plt.title("Data projected on space given by the {} principal components.".format(n_components))
    plt.show()

    plt.hist(per_var)
    plt.show()
        

# Standardise the training features, keeping both the raw array and a
# DataFrame view that carries the original index and column names.
scaler = StandardScaler()
scaled_X_train_np = scaler.fit_transform(X_train)
scaled_X_train_df = pd.DataFrame(
    data=scaled_X_train_np,
    columns=X_train.columns,
    index=X_train.index,
)

# Keep 25 components and report the percentage of variance each one
# explains, followed by their total.
pca = PCA(n_components=25)
pca_data = pca.fit_transform(scaled_X_train_np)
per_var = np.round(100 * pca.explained_variance_ratio_, decimals=1)
print(per_var)
print(sum(per_var))

## Model

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression

# One-hot encode every column (categories unseen at fit time are ignored),
# scale without centring, then fit an ordinary least-squares model.
linear_steps = [
    OneHotEncoder(handle_unknown='ignore'),
    StandardScaler(with_mean=False),
    LinearRegression(),
]
pipe = make_pipeline(*linear_steps)

In [None]:
# Fit on the training split, then report pipe.score and the mean squared
# error on both splits.
# NOTE(review): X_train and X_test must carry the same columns here -- confirm
# that no earlier cell added a column to X_train only.
y_train_pred = pipe.fit(X_train, y_train).predict(X_train)
mean_error_train = mean_squared_error(y_train, y_train_pred)
print("Score on train set = ", pipe.score(X_train, y_train))
print("Mean square error = ", mean_error_train)

y_test_pred = pipe.predict(X_test)
mean_error_test = mean_squared_error(y_test, y_test_pred)
print("Score on test set = ", pipe.score(X_test, y_test))
print("Mean square error = ", mean_error_test)