In [None]:
"""
This is an example to perform simple linear regression algorithm on the dataset (weight and height),
where x = weight and y = height.
"""
import pandas as pd
import numpy as np
import datetime
import random
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Reproducibility: a single seed feeds both Python's and NumPy's global
# random generators (and is reused as random_state for the train/test split).
seed = 309
random.seed(seed)
np.random.seed(seed)

# Fraction of the rows held out as the test set
train_test_split_test_size = 0.3

# Training settings
# NOTE(review): alpha and max_iters are not referenced by the code visible in
# this cell -- confirm whether another cell still relies on them.
alpha = 0.1  # step size
max_iters = 50  # max iterations

# Regressors to benchmark, keyed by a short label. Insertion order is the
# order in which the models are trained and reported.
algorithms = dict(
    LR=LinearRegression(),
    KNNR=KNeighborsRegressor(),
    R=Ridge(),
    DTR=DecisionTreeRegressor(),
    RFR=RandomForestRegressor(),
    GBR=GradientBoostingRegressor(),
    SGDR=SGDRegressor(max_iter=1000),
    SVR=SVR(degree=3, C=100, epsilon=0.01),
    LSVR=LinearSVR(),
    MLPR=MLPRegressor(max_iter=1000),
)


def load_data(path="../data/diamonds.csv"):
    """
    Load the dataset from a CSV file and discard its first column.

    :param path: location of the CSV file; defaults to the diamonds dataset
                 path that was previously hard-coded, so existing callers
                 are unaffected
    :return: df    a pandas DataFrame holding every column except the first
    """
    df = pd.read_csv(path)
    # The first column is dropped unconditionally.
    # NOTE(review): presumably it is the row index written out when the CSV
    # was saved -- confirm against the data file.
    df = df.drop(columns=df.columns[0])
    return df


def data_preprocess(data):
    """
    Data preprocess:
        1. Split the entire dataset into train and test
        2. Split outputs ("price") and inputs
        3. One-hot encode the categorical inputs ("cut", "color", "clarity")
        4. Standardize train and test using the train statistics
        5. Add intercept dummy for computation convenience
    :param data: the given dataset (format: pandas DataFrame); must contain
                 the columns "price", "cut", "color" and "clarity"
    :return: train_data       train data contains only inputs
             train_labels     train data contains only labels
             test_data        test data contains only inputs
             test_labels      test data contains only labels
             train_data_full       train data (full) contains both inputs and labels
             test_data_full       test data (full) contains both inputs and labels
    """
    categorical_columns = ["cut", "color", "clarity"]

    # Split the data into train and test
    train_split, test_split = train_test_split(
        data, test_size=train_test_split_test_size, random_state=seed)

    # Keep untouched copies (inputs + labels) to hand back to the caller
    train_data_full = train_split.copy()
    test_data_full = test_split.copy()

    # Separate labels from inputs
    train_labels = train_data_full["price"]
    test_labels = test_data_full["price"]
    train_data = train_split.drop(columns=["price"])
    test_data = test_split.drop(columns=["price"])

    # Create dummy (one-hot) columns for the categorical values
    train_data = pd.get_dummies(train_data, columns=categorical_columns)
    test_data = pd.get_dummies(test_data, columns=categorical_columns)

    # The two splits are encoded independently, so a category level that is
    # present in only one of them would leave train and test with different
    # columns. Force the test set onto the training layout: levels missing
    # from test become all-zero columns, levels unseen in training are dropped.
    test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

    # Standardize the inputs with statistics computed on the train set only
    train_mean = train_data.mean()
    # A column that is constant in the train set has std 0; dividing by it
    # would yield NaN/inf, so scale such columns by 1 instead.
    train_std = train_data.std().replace(0, 1.0)
    train_data = (train_data - train_mean) / train_std
    test_data = (test_data - train_mean) / train_std

    # Tricks: add dummy intercept to both train and test (after scaling, so
    # the column stays exactly 1.0)
    train_data['intercept_dummy'] = pd.Series(1.0, index=train_data.index)
    test_data['intercept_dummy'] = pd.Series(1.0, index=test_data.index)
    return train_data, train_labels, test_data, test_labels, train_data_full, test_data_full


if __name__ == '__main__':
    # Script entry point: train every regressor in `algorithms` on the
    # preprocessed data and print its fit time and test-set metrics.

    # Step 1: Load Data
    data = load_data()

    # Step 2: Preprocess the data
    train_data, train_labels, test_data, test_labels, train_data_full, test_data_full = data_preprocess(data)

    # Convert to plain arrays once instead of on every loop iteration
    train_inputs, train_targets = train_data.values, train_labels.values
    test_inputs, test_targets = test_data.values, test_labels.values

    # Step 3: Learning Start
    for clf in algorithms.values():
        start_time = datetime.datetime.now()  # Track learning starting time
        clf.fit(train_inputs, train_targets)
        end_time = datetime.datetime.now()  # Track learning ending time
        execution_time = (end_time - start_time).total_seconds()  # Only fit() is timed
        prediction = clf.predict(test_inputs)

        # Step 4: Results presentation
        print(clf)
        print("Learn: execution time={t:.3f} seconds".format(t=execution_time))

        # Evaluate on the test set; MSE is computed once and reused for RMSE
        mse = mean_squared_error(test_targets, prediction)
        metrics = [
            ("R2", r2_score(test_targets, prediction)),  # R2 should be maximized
            ("MSE", mse),
            ("RMSE", np.sqrt(mse)),
            ("MAE", mean_absolute_error(test_targets, prediction)),
        ]
        for label, value in metrics:
            # Round to two decimals, printed as a float (e.g. "R2: 0.92")
            print("{0}:".format(label), float("{0:.2f}".format(value)))

        # Blank line between models. The original passed "\n" as an extra,
        # silently ignored argument to str.format, so it was never printed.
        print()


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
Learn: execution time=0.044 seconds
R2: 0.92
MSE: 1324168.94
RMSE: 1150.73
MAE: 743.13


<Figure size 640x480 with 1 Axes>

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')
Learn: execution time=0.112 seconds
R2: 0.95
MSE: 774686.78
RMSE: 880.16
MAE: 437.82


<Figure size 640x480 with 1 Axes>

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
Learn: execution time=0.035 seconds
R2: 0.92
MSE: 1324178.2
RMSE: 1150.73
MAE: 743.3


<Figure size 640x480 with 1 Axes>

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
Learn: execution time=0.225 seconds
R2: 0.97
MSE: 529566.17
RMSE: 727.71
MAE: 352.8


<Figure size 640x480 with 1 Axes>

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
Learn: execution time=1.461 seconds
R2: 0.98
MSE: 333880.54
RMSE: 577.82
MAE: 285.35


<Figure size 640x480 with 1 Axes>

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)
Learn: execution time=2.512 seconds
R2: 0.97
MSE: 555862.23
RMSE: 745.56
MAE: 406.4


<Figure size 640x480 with 1 Axes>

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=1000, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)
Learn: execution time=8.196 seconds
R2: 0.92
MSE: 1326874.68
RMSE: 1151.9
MAE: 747.29


<Figure size 640x480 with 1 Axes>