In [None]:
# final exam for IBM ML 4

import numpy as np
import pickle, os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow.keras

from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, OneHotEncoder, LabelEncoder
from sklearn.model_selection import KFold, cross_val_predict, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import NMF

# Show up to 35 columns when a DataFrame is rendered.
pd.options.display.max_columns = 35

In [None]:
path = '../input/house-prices-advanced-regression-techniques/train.csv'

# Read the CSV, tag every row with sample='train' and make sure the index is a
# clean 0..n-1 range.
hspr = (
    pd.read_csv(path)
    .assign(sample='train')
    .reset_index(drop=True)
)

print(hspr.head())
print(hspr.shape)

In [None]:
# Keep an untouched copy of the loaded frame before it is narrowed down.
hspr0 = hspr.copy()

cols_tokeep = ['SalePrice', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'ExterCond',
               'BsmtFinSF1', 'TotalBsmtSF', 'HeatingQC', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'FullBath', 'HalfBath',
               'KitchenQual', 'GarageArea', 'GarageCars', 'TotRmsAbvGrd', 'BedroomAbvGr',
               'ExterQual', 'LotFrontage', 'sample']

# Select the columns and drop incomplete rows in one chained expression, so
# `hspr` is a brand-new frame rather than a slice that is then mutated in place
# (the in-place version triggers pandas' SettingWithCopyWarning).
# SalePrice is excluded from the NaN check on purpose: only rows with a missing
# predictor are removed.
predictor_cols = [col for col in cols_tokeep if col != 'SalePrice']
hspr = hspr[cols_tokeep].dropna(subset=predictor_cols)

hspr.info()
# Rows with a missing predictor were dropped above, so no predictor column
# contains missing values from here on.

In [None]:
# Summary statistics, one row per column.
hspr.describe().T

In [None]:
# Ordinal quality grades, worst ('Po') to best ('Ex').
quality_scale = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
ord_cols = ['ExterCond', 'HeatingQC', 'KitchenQual', 'ExterQual']

# Map each grade to its rank and cast explicitly to integers. The list-based
# `replace` relied on pandas silently downcasting object columns to int, which
# is deprecated. An unexpected grade becomes NaN here and makes the cast fail
# loudly instead of leaving a stray string for the scaler to choke on later.
hspr[ord_cols] = hspr[ord_cols].apply(lambda col: col.map(quality_scale)).astype('int64')

# It makes sense to replace YearBuilt with Age.
# NOTE(review): 2010 is presumably the last sale year in the data -- confirm.
REFERENCE_YEAR = 2010
hspr['Age'] = REFERENCE_YEAR - hspr['YearBuilt']
hspr = hspr.drop(columns=['YearBuilt'])

# Transform LotArea with log(1 + x).
hspr['LotArea'] = np.log1p(hspr['LotArea'])

In [None]:
# correlation analysis

# `hspr` still carries the string-valued 'sample' column; DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0, so restrict the frame to
# numeric columns first (this also works on older pandas versions).
corr_mat = hspr.select_dtypes(include='number').corr()
print(corr_mat)

In [None]:
# preprocessing

# Split the training rows into predictors (X) and target (y).
train_mask = hspr['sample'] == 'train'
X = hspr.loc[train_mask].drop(columns=['sample', 'SalePrice'])
y = hspr.loc[train_mask, 'SalePrice'].copy()

# Rescale every predictor to [0, 1]. The scaler returns a bare array, so the
# original row index is passed back explicitly: otherwise X_df would get a
# fresh 0..n-1 index and no longer line up with `y` by label.
s = MinMaxScaler()
X_df = pd.DataFrame(s.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# Preview the scaled predictors instead of dumping the whole frame.
X_df.head()

In [None]:
# PCA #

data = X_df.copy()

pca_list = []
feature_weight_list = []

# Fit one PCA model per component count (1 through 9) and record, for each,
# the total explained variance and the relative weight of every feature.
for n in range(1, 10):

    PCAmod = PCA(n_components=n).fit(data)

    # Model together with the share of variance it explains
    explained = PCAmod.explained_variance_ratio_.sum()
    pca_list.append(pd.Series({'n': n, 'model': PCAmod, 'var': explained}))

    # Feature importance: absolute loadings summed over components,
    # normalised so the weights add up to one
    abs_feature_values = np.abs(PCAmod.components_).sum(axis=0)
    weights = pd.DataFrame({'n': n,
                            'features': data.columns,
                            'values': abs_feature_values / abs_feature_values.sum()})
    feature_weight_list.append(weights)

pca_df = pd.concat(pca_list, axis=1).T.set_index('n')
pca_df

In [None]:
# Stack the per-model weight tables, then reshape to one row per component
# count and one column per feature.
feature_weights_long = pd.concat(feature_weight_list)
features_df = feature_weights_long.pivot(index='n', columns='features', values='values')

features_df

In [None]:
sns.set_context('talk')

# Bar chart of total explained variance per number of PCA components.
ax = pca_df['var'].plot.bar()
ax.set_xlabel('Number of dimensions')
ax.set_ylabel('Percent explained variance')
ax.set_title('Explained Variance vs Dimensions');

In [None]:
### Kernel PCA ###
# Custom scorer -- use the negative MSE of the inverse-transform reconstruction
def scorer(pcamodel, X, y=None):
    """Score a fitted decomposition model by its reconstruction error on X.

    Intended for use as the ``scoring`` callable of ``GridSearchCV``, which
    calls it with the estimator already fitted on the training fold and with
    the held-out fold as ``X``. The model is therefore NOT refitted here:
    refitting on ``X`` would discard the training fit and measure in-sample
    error on the validation fold, defeating the cross-validation.

    Parameters
    ----------
    pcamodel : fitted estimator exposing ``transform`` and ``inverse_transform``
    X : array-like or DataFrame of shape (n_samples, n_features)
    y : ignored; present only to satisfy the scorer signature

    Returns
    -------
    float
        Negative mean squared error between ``X`` and its reconstruction
        (scorers follow "larger is better", hence the sign flip).
    """
    # Project onto the components and map back to the input space.
    reconstruction = pcamodel.inverse_transform(pcamodel.transform(X))

    # Plain ndarray view of the input (works for DataFrames and arrays alike,
    # replacing the former bare `except:` fallback).
    X_val = np.asarray(X)

    # Mean squared error over every cell of the matrix
    mse = np.mean((np.asarray(reconstruction) - X_val) ** 2)

    # Larger values are better for scorers, so take the negative value
    return -1.0 * float(mse)

In [None]:
# The grid search parameters
param_grid = {'gamma': [0.1, 0.5, 1.0, 2, 5],
              'n_components': [5, 6, 7, 8, 9, 10, 11, 12]}

# The grid search: RBF kernel PCA with an inverse transform, scored by the
# custom reconstruction-error scorer, using all available cores.
kernelPCA = GridSearchCV(KernelPCA(kernel='rbf', fit_inverse_transform=True),
                         param_grid=param_grid,
                         scoring=scorer,
                         n_jobs=-1)

kernelPCA = kernelPCA.fit(data)

# Best cross-validated score (negative reconstruction error).
temp = kernelPCA.best_score_

# Keep the winning estimator as the last expression so the notebook actually
# displays it; in the middle of the cell its value was silently discarded.
kernelPCA.best_estimator_

In [None]:
# Sanity check: (rows, columns) of the scaled feature matrix used by the models.
data.shape

In [None]:
### Autoencoder ###

from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

ENCODING_DIM = 10
HIDDEN_DIM = 5
# Width of the input/output layers, taken from the data instead of being
# hard-coded, so the model keeps working if the feature list changes.
N_FEATURES = data.shape[1]
# NOTE(review): HIDDEN_DIM (5) is smaller than ENCODING_DIM (10), so the
# narrowest layer is the hidden one, not the encoding -- confirm this is intended.

# Encoder model: features -> hidden layer -> encoding
inputs = Input(shape=(N_FEATURES,))
encoder_hidden = Dense(HIDDEN_DIM, activation="sigmoid")(inputs)
encoded = Dense(ENCODING_DIM, activation="relu")(encoder_hidden)
encoder_model = Model(inputs, encoded, name='encoder')

# Decoder model: encoding -> hidden layer -> reconstructed features.
# The sigmoid output matches the [0, 1] range of the MinMax-scaled inputs.
encoded_inputs = Input(shape=(ENCODING_DIM,), name='encoding')
decoder_hidden = Dense(HIDDEN_DIM, activation="relu")(encoded_inputs)
reconstruction = Dense(N_FEATURES, activation="sigmoid")(decoder_hidden)
decoder_model = Model(encoded_inputs, reconstruction, name='decoder')

# Defining the full model as the combination of the two
outputs = decoder_model(encoder_model(inputs))
full_model = Model(inputs, outputs, name='full_ae')

In [None]:
# `full_model` is already assembled in the autoencoder cell; rebuilding it
# here from the same `inputs`/`outputs` was redundant (and dropped its
# 'full_ae' name), so this cell only compiles it.
full_model.compile(optimizer='rmsprop', loss='binary_crossentropy')

In [None]:
# Train the autoencoder to reproduce its own input: the features serve as
# both the input and the target.
history = full_model.fit(x=data, y=data, batch_size=50, epochs=10)

# Per-epoch values recorded during training
history.history


In [None]:
# Run the features through the encoder half only and check the result's shape.
encoder_prd = encoder_model.predict(x=data)
encoder_prd.shape

In [None]:
# Decode the encodings back to feature space. `decoded_prd` was never defined
# anywhere in the notebook, so this cell raised NameError on a fresh kernel.
decoded_prd = decoder_model.predict(encoder_prd)

# Mean squared reconstruction error over every cell of the feature matrix
# (same value as sum of squared errors / (n_rows * n_columns)).
np.mean(np.square(decoded_prd - np.asarray(data)))