Before you turn this problem in, make sure everything runs as expected. First, **restart the kernel** (in the menubar, select Kernel$\rightarrow$Restart) and then **run all cells** (in the menubar, select Cell$\rightarrow$Run All).

Make sure you fill in any place that says `YOUR CODE HERE` or "YOUR ANSWER HERE", as well as your name and collaborators below:

In [None]:
# Submission metadata: fill these in before turning the notebook in.
Group = "" #G1
NAME = ""
COLLABORATORS = ""

---

### Lab1: cross-validation

Your task: implement a cross-validation framework to select the best set of features.

This notebook provides: 

- Tools and packages
- Download data from [kaggle](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data) 
- Data processing 
- Your implementation of cross-validation
- Show the averaged model selection scores: AIC, AICs (AIC with a small-sample correction), BIC, Adjusted R2 (using Training sets)
- Show the averaged RMSE scores from training and validation.

You can run this notebook on Colab:  <a target="_blank" href="https://colab.research.google.com/github/GabbySuwichaya/Statistical-Learning-EE575/blob/master/Lab1/main.ipynb"> <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

### Tools and Packages

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt  

from sklearn.linear_model import LinearRegression 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler  

def train_test_split(df, train_percent=.8, seed=None):
    """Randomly split the rows of ``df`` into a training and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. A helper column named ``"index"`` (as produced by
        ``DataFrame.reset_index()``) is removed from both outputs when present.
    train_percent : float, default 0.8
        Fraction of the rows assigned to the training frame. The resulting
        row count is truncated towards zero.
    seed : int or None, default None
        Value passed to ``np.random.seed`` before shuffling. With ``None``
        NumPy picks the seed itself, so the split is not reproducible.

    Returns
    -------
    train, test : pandas.DataFrame
        Disjoint row subsets of ``df``, each re-indexed from 0.
    """
    np.random.seed(seed)
    m = len(df.index)

    # Shuffle row *positions* rather than index labels: they are consumed by
    # ``iloc`` below, so the split stays valid whatever the index of ``df`` is.
    perm = np.random.permutation(m)

    # Slice bounds must be plain ints (np.floor would hand back a float and
    # make the slicing below raise TypeError).
    train_end = int(train_percent * m)

    train = df.iloc[perm[:train_end]]
    test = df.iloc[perm[train_end:]]

    # errors="ignore": the helper column only exists when the caller reset
    # the index before splitting.
    train = train.drop(columns=["index"], errors="ignore").reset_index(drop=True)
    test = test.drop(columns=["index"], errors="ignore").reset_index(drop=True)

    return train, test


def likelihood_score(y_training, y_expectation=None):
    """Gaussian log-likelihood of the targets, evaluated at the fitted variance.

    The variance is estimated as the mean squared deviation of ``y_training``
    from ``y_expectation`` and plugged into
    ``-m/2 * log(2*pi*sigma^2) - m/2`` with ``m = len(y_training)``.

    Parameters
    ----------
    y_training : array-like
        Observed target values.
    y_expectation : array-like, scalar or None, default None
        Expected value of each target (e.g. model predictions). ``None``
        uses the mean of ``y_training``.

    Returns
    -------
    float
        The log-likelihood score (``-inf`` when the deviations are all zero).
    """
    y_training = np.asarray(y_training, dtype=float)

    if y_expectation is None:
        y_expectation = np.mean(y_training)
    else:
        y_expectation = np.asarray(y_expectation, dtype=float)
        # Same number of elements but a different layout, e.g. (m,) predictions
        # against (m, 1) targets: without aligning them the subtraction below
        # would broadcast to an (m, m) matrix and silently distort the variance.
        if (y_expectation.shape != y_training.shape
                and y_expectation.size == y_training.size):
            y_expectation = y_expectation.reshape(y_training.shape)

    # Despite the name this is the variance estimate (sigma squared).
    sigma_hat = np.mean((y_training - y_expectation) ** 2)
    m = len(y_training)
    score = -0.5 * m * np.log(2 * np.pi * sigma_hat) - 0.5 * m
    return score

def model_selection_scores(dataset):
    """Compute model-selection scores for one set of predictions.

    Parameters
    ----------
    dataset : dict
        Must contain ``"X"`` (2-D feature matrix; only its column count is
        used), ``"y"`` (observed targets) and ``"y_predict"`` (predictions
        for the same rows).

    Returns
    -------
    dict
        Keys ``"AIC"``, ``"AICs"``, ``"BIC"``, ``"AdjustedR2"`` and
        ``"RMSE"``. The parameter count used in the penalties is the number
        of columns of ``X`` only.

    Raises
    ------
    ZeroDivisionError
        If the number of samples equals the number of features plus one
        (the AICs correction and adjusted R2 are undefined there).
    """
    x_ = dataset["X"]
    y_ = np.asarray(dataset["y"], dtype=float)
    y_predict = np.asarray(dataset["y_predict"], dtype=float)

    # Align e.g. (m,) predictions with (m, 1) targets; otherwise the residuals
    # below would broadcast to an (m, m) matrix and every score would be wrong.
    if y_predict.shape != y_.shape and y_predict.size == y_.size:
        y_predict = y_predict.reshape(y_.shape)

    feat_dim = x_.shape[1]
    num_samples = len(y_)
    ll_score = likelihood_score(y_, y_expectation=y_predict)

    AIC_score = -2*ll_score + 2*feat_dim
    # AIC plus the small-sample correction term.
    AICs_score = AIC_score + 2*feat_dim*(feat_dim + 1)/(num_samples - feat_dim - 1)
    BIC_score = -2*ll_score + feat_dim*np.log(num_samples)

    # Squared residuals are shared by RSS and RMSE.
    squared_residuals = (y_ - y_predict)**2
    RSS_score = np.sum(squared_residuals)
    TSS_score = np.sum((y_ - np.mean(y_))**2)
    AdjustR2_score = 1 - (RSS_score/(num_samples - feat_dim - 1))/(TSS_score/(num_samples - 1))

    RMSE_score = np.sqrt(np.mean(squared_residuals))

    return {"AIC": AIC_score, "AICs": AICs_score, "BIC": BIC_score,
            "AdjustedR2": AdjustR2_score, "RMSE": RMSE_score}

### Data loading and preprocessing

In [None]:
# Folder holding the files downloaded from the Kaggle competition page.
foldername = "house-prices-advanced-regression-techniques"

# Load the training table and show which columns it provides.
df = pd.read_csv(f"{foldername}/train.csv")
df.columns

In [None]:
# Columns kept from the raw table: the candidate predictors, in the order
# they are added to the nested feature sets, followed by the target.
all_column = [
    "MoSold",
    "YrSold",
    "MSSubClass",
    "LandSlope",
    "ScreenPorch",
    "MiscVal",
    "Condition1",
    "OverallCond",
    "KitchenAbvGr",
    "PoolArea",
    "Utilities",
    "SalePrice",
]

Encode string labels as numerical labels and drop NaN values

In [None]:
# Fixed seed for the train/test shuffle so that "Restart & Run All"
# reproduces the same split (and therefore the same scores).
SPLIT_SEED = 42

# Keep only the columns under study and drop incomplete rows *before*
# encoding: a label-encoded column is all integers, so a later dropna()
# would have nothing left to remove.
df = df[all_column].dropna().copy()

# Encode only the string-valued columns. Numeric columns, including the
# SalePrice target, keep their original values instead of being replaced
# by rank-like category codes.
string_columns = df.select_dtypes(exclude="number").columns
for column in string_columns:
    df[column] = LabelEncoder().fit_transform(df[column])

# reset_index() stores the previous row labels in an "index" column, which
# train_test_split removes from its outputs.
df = df.reset_index()

train, test = train_test_split(df, train_percent=.8, seed=SPLIT_SEED)

### Lab1. Cross-validation to find the most suitable set of features

Use cross-validation to find the most suitable set of features. 

In [None]:
# Number of folds handed to KFold in the cross-validation loop.
num_k_folds = 5

In [None]:
# Column to predict.
target_column = ["SalePrice"]

# Candidate predictors in the order they join the nested feature sets:
# feature set k consists of the first k + 1 entries.
_candidate_features = (
    "MoSold", "YrSold", "MSSubClass", "LandSlope",
    "ScreenPorch", "MiscVal", "Condition1", "OverallCond",
    "KitchenAbvGr", "PoolArea", "Utilities",
)

feat_column_1 = list(_candidate_features[:2])
feat_column_2 = list(_candidate_features[:3])
feat_column_3 = list(_candidate_features[:4])
feat_column_4 = list(_candidate_features[:5])
feat_column_5 = list(_candidate_features[:6])
feat_column_6 = list(_candidate_features[:7])
feat_column_7 = list(_candidate_features[:8])
feat_column_8 = list(_candidate_features[:9])
feat_column_9 = list(_candidate_features[:10])
feat_column_10 = list(_candidate_features[:11])

# Feature-set key (1..10) -> list of predictor columns.
feature_set_dict = dict(enumerate(
    [feat_column_1, feat_column_2, feat_column_3, feat_column_4, feat_column_5,
     feat_column_6, feat_column_7, feat_column_8, feat_column_9, feat_column_10],
    start=1,
))

In [None]:
def single_fold_operation(train_X, train_y, train_subet_index, valid_subset_index): 
    '''
    Train and evaluate a model on one cross-validation fold (to be implemented).

    Parameters
    ----------
    train_X : feature matrix of the whole training set; the cross-validation
        loop passes the standardised feature columns of the current feature set.
    train_y : target values for the same rows as train_X.
    train_subet_index : row indices of this fold's training subset, as
        yielded by KFold.split in the cross-validation loop.
    valid_subset_index : row indices of this fold's validation subset, from
        the same KFold.split call.

    Returns
    -------
    scores : dict with the keys "AIC", "AICs", "BIC", "AdjustedR2",
        "RMSE_train" and "RMSE_valid"; the cross-validation loop reads
        exactly these keys.

    Implementation sketch
    ---------------------
    After training on the fold's training subset and running inference on
    both subsets you have y_train_pred and y_valid_pred. The scores can then
    be assembled like this:

    train_subset = {"X":X_train_subset, "y":y_train_subset, "y_predict":y_train_pred}
    valid_subset = {"X":X_valid_subset, "y":y_valid_subset, "y_predict":y_valid_pred} 

    training_score = model_selection_scores(train_subset)
    valid_score   = model_selection_scores(valid_subset)
    
    scores = {  "AIC": training_score["AIC"], 
                "AICs": training_score["AICs"], 
                "BIC": training_score["BIC"], 
                "AdjustedR2": training_score["AdjustedR2"],
                "RMSE_train": training_score["RMSE"], 
                "RMSE_valid":valid_score["RMSE"] }

    return scores
    '''

    # YOUR CODE HERE
    raise NotImplementedError()

    # Unreachable until the placeholder `raise` above is replaced by an
    # implementation that defines `scores`.
    return scores 

In [None]:
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import KFold 
  
# One result dict per metric: feature-set key -> score averaged over the folds.
AIC_feature_set = {}
AICs_feature_set = {}
BIC_feature_set = {}
AdjustedR2_feature_set = {}
RMSE_train_feature_set = {}
RMSE_valid_feature_set = {}

# Maps each key of the per-fold `scores` dict to the result dict it feeds.
averaged_by_metric = {
    "AIC": AIC_feature_set,
    "AICs": AICs_feature_set,
    "BIC": BIC_feature_set,
    "AdjustedR2": AdjustedR2_feature_set,
    "RMSE_train": RMSE_train_feature_set,
    "RMSE_valid": RMSE_valid_feature_set,
}

# The splitter does not depend on the feature set, so one instance serves all.
kf = KFold(n_splits=num_k_folds)

for key, feat_set in feature_set_dict.items():

    # Standardise the selected feature columns of the training frame.
    X_scaler = StandardScaler()
    train_X = X_scaler.fit_transform(train[feat_set].values)
    train_y = train[target_column].values

    # Score every fold for this feature set.
    fold_scores = [
        single_fold_operation(train_X, train_y, train_subet_index, valid_subset_index)
        for train_subet_index, valid_subset_index in kf.split(train_X)
    ]

    # Average each metric over the folds and file it under the feature-set key.
    for metric_name, averaged in averaged_by_metric.items():
        averaged[key] = np.mean([scores[metric_name] for scores in fold_scores])

Converting from dict to list for plotting

In [None]:
def _sorted_pairs_keys_values(score_by_key):
    """Return the (key, score) pairs sorted by key, plus the keys and the scores as separate tuples."""
    pairs = sorted(score_by_key.items())
    keys, values = zip(*pairs)
    return pairs, keys, values


# Every dict shares the same feature-set keys, so `feature_set` is simply
# rebound to an equal tuple on each line.
AIC_feature_set_lists, feature_set, AIC_scores_list = _sorted_pairs_keys_values(AIC_feature_set)
AICs_feature_set_lists, feature_set, AICs_scores_list = _sorted_pairs_keys_values(AICs_feature_set)
BIC_feature_set_lists, feature_set, BIC_scores_list = _sorted_pairs_keys_values(BIC_feature_set)
AdjustedR2_feature_set_lists, feature_set, AdjustedR2_scores_list = _sorted_pairs_keys_values(AdjustedR2_feature_set)
RMSE_train_feature_set_lists, feature_set, RMSE_train_lists = _sorted_pairs_keys_values(RMSE_train_feature_set)
RMSE_valid_feature_set_lists, feature_set, RMSE_valid_lists = _sorted_pairs_keys_values(RMSE_valid_feature_set)

In [None]:
# Training vs. validation RMSE for every feature set, drawn on one axes.
fig, ax = plt.subplots()
for rmse_values, curve_label, curve_color, curve_style in (
    (RMSE_train_lists, "Training", "blue", "-"),
    (RMSE_valid_lists, "Valid", "red", "--"),
):
    ax.plot(feature_set, rmse_values, label=curve_label, color=curve_color,
            linestyle=curve_style, marker="o")
ax.set_xlabel("Feature settings")
ax.set_ylabel("RMSE")
ax.legend()
ax.grid()
plt.show()

Plot AIC, AICs, BIC, Adjusted R2

In [None]:
# Information criteria on the left axis; explicit Axes handles are used so
# that every label and legend lands on the axes it belongs to.
fig, score_ax = plt.subplots()
score_ax.plot(feature_set, AIC_scores_list, label="AIC", color="red", linestyle="-", marker="o")
score_ax.plot(feature_set, AICs_scores_list, label="AICs", color="orange", linestyle="--", marker="o")
score_ax.plot(feature_set, BIC_scores_list, label="BIC", color="blue", linestyle="-.", marker="o")
score_ax.set_ylabel("Scores")
score_ax.set_xlabel("Feature settings")
score_ax.legend(loc=(0.8, 0.4))
score_ax.grid()

# Adjusted R2 lives on a different scale, so it gets its own y axis on the
# right. The axis is labelled with the quantity it shows (it is not an
# accuracy).
r2_ax = score_ax.twinx()
r2_ax.plot(feature_set, AdjustedR2_scores_list, label="AdjustR2", color="cyan", linestyle="-", marker="o")
r2_ax.set_ylabel("Adjusted R2")
r2_ax.legend(loc=(0.75, 0.30))
r2_ax.xaxis.grid(True)
plt.show()