#### Lab 1: Regression

In [5]:
import pandas as pd
# Read both CSV splits from the working directory (training file first, then test).
train_set, test_set = (
    pd.read_csv(csv_path) for csv_path in ('train_set.csv', 'test_set.csv')
)

# Report the (rows, columns) shape of each split as a quick sanity check.
print("Shape of train_set:", train_set.shape, "; Shape of test_set:", test_set.shape)
# Summary statistics of the training columns; kept as the last expression so the
# notebook renders the table as the cell output.
train_set.describe()

Shape of train_set: (15726, 13) ; Shape of test_set: (3949, 13)


Unnamed: 0,median_income,house_age,avg_rooms,avg_bedrooms,block_population,avg_occupancy,latitude,longitude,house_value,diag_coord,bed_per_rooms,rooms_per_occup,pop_per_occup
count,15726.0,15726.0,15726.0,15726.0,15726.0,15726.0,15726.0,15726.0,15726.0,15726.0,15726.0,15726.0,15726.0
mean,3.678555,28.379944,5.370023,1.098158,1435.30262,3.123054,35.657075,-119.56603,1.923611,-83.908955,0.214882,1.943798,498.807135
std,1.570925,12.497755,2.398827,0.480437,1127.179163,11.864913,2.152346,2.003633,0.977266,0.82297,0.056292,1.161179,377.719409
min,0.4999,1.0,1.130435,0.333333,3.0,0.692308,32.54,-124.35,0.14999,-85.89,0.1,0.002547,2.0
25%,2.53505,18.0,4.416667,1.00564,793.0,2.446153,33.93,-121.76,1.167,-84.41,0.177622,1.497171,281.0
50%,3.4544,28.0,5.1875,1.04842,1177.0,2.838178,34.28,-118.52,1.7335,-84.16,0.204512,1.912297,409.5
75%,4.5794,37.0,5.972678,1.098845,1740.75,3.304575,37.73,-118.0,2.475,-83.46,0.241086,2.249429,603.0
max,15.0001,52.0,132.533333,34.066667,35682.0,1243.333333,41.95,-114.31,5.0,-78.29,1.0,55.222222,5358.0


In [6]:
# 'house_value' is the regression target; every other column is a feature.
# Each piece is extracted straight to a NumPy array, which is the form the
# training/evaluation helpers below receive.
train_x = train_set.drop(columns=['house_value']).values
train_y = train_set['house_value'].values
test_x = test_set.drop(columns=['house_value']).values
test_y = test_set['house_value'].values

In [7]:
# import sklearn's linear, lasso, ridge, SVM, k-neighbors, decision tree, GBDT regression
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

# import sklearn's mean squared error and r2 score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import time

def train_model(model, train_x, train_y):
    """Fit ``model`` on the training data and score it on that same data.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        train_x: Training feature matrix; ``train_x.shape[1]`` is taken as the
            number of predictors for the adjusted R2.
        train_y: Training targets, one per row of ``train_x``.

    Returns:
        tuple: ``(training_time, mse, r2, adj_r2)`` where ``training_time`` is
        the seconds spent inside ``model.fit`` only, and the three scores are
        computed from predictions on ``train_x``. ``adj_r2`` is ``nan`` when
        there are not enough samples for it to be defined
        (``len(train_y) - n_features - 1 <= 0``).
    """
    # perf_counter is monotonic and high-resolution, so it is reliable for the
    # millisecond-scale fits measured here (time.time() is a wall clock that
    # can jump and is coarse on some platforms).
    start_time = time.perf_counter()
    model.fit(train_x, train_y)
    training_time = time.perf_counter() - start_time

    # Score the fitted model on the data it was trained on.
    pred_y = model.predict(train_x)
    mse = mean_squared_error(train_y, pred_y)
    r2 = r2_score(train_y, pred_y)

    # Adjusted R2 penalises R2 by the number of predictors. Its denominator is
    # the residual degrees of freedom, which must be positive.
    n_samples = len(train_y)
    dof = n_samples - train_x.shape[1] - 1
    if dof > 0:
        adj_r2 = 1 - (1 - r2) * (n_samples - 1) / dof
    else:
        adj_r2 = float('nan')
    return training_time, mse, r2, adj_r2
    

def test_model(model, test_x, test_y):
    """Score an already-fitted ``model`` on held-out data.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        test_x: Test feature matrix; ``test_x.shape[1]`` is taken as the
            number of predictors for the adjusted R2.
        test_y: Test targets, one per row of ``test_x``.

    Returns:
        tuple: ``(mse, r2, adj_r2)`` computed from predictions on ``test_x``.
        ``adj_r2`` is ``nan`` when there are not enough samples for it to be
        defined (``len(test_y) - n_features - 1 <= 0``).
    """
    pred_y = model.predict(test_x)
    mse = mean_squared_error(test_y, pred_y)
    r2 = r2_score(test_y, pred_y)

    # Adjusted R2 penalises R2 by the number of predictors. Its denominator is
    # the residual degrees of freedom, which must be positive.
    n_samples = len(test_y)
    dof = n_samples - test_x.shape[1] - 1
    if dof > 0:
        adj_r2 = 1 - (1 - r2) * (n_samples - 1) / dof
    else:
        adj_r2 = float('nan')
    # The raw prediction/target arrays are no longer printed here: that was
    # debugging output repeated for every model evaluated.
    return mse, r2, adj_r2


# create a function to train and test the models
def train_test_model(model, train_x, train_y, test_x, test_y):
    """Train ``model`` and then evaluate it, returning all metrics in one tuple.

    The result is the tuple returned by ``train_model`` followed by the tuple
    returned by ``test_model``:
    ``(training_time, train_mse, train_r2, train_adj_r2,
    test_mse, test_r2, test_adj_r2)``.
    """
    # Operands are evaluated left to right, so the model is fitted before it
    # is scored on the test data.
    return train_model(model, train_x, train_y) + test_model(model, test_x, test_y)



In [8]:
# Every regressor to compare, keyed by the name reported in the results table.
# All seven are enabled so that a fresh run reproduces the seven-row table
# saved in this notebook's output.
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'SVR': SVR(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor()
}

# Column order of the results table; after 'model_name' it matches the order
# of the tuple returned by train_test_model.
result_columns = ['model_name', 'training_time', 'train_mse', 'train_r2', 'train_adj_r2', 'test_mse', 'test_r2', 'test_adj_r2']

# Collect one record per model and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
records = []
for name, model in models.items():
    result = train_test_model(model, train_x, train_y, test_x, test_y)
    records.append(dict(zip(result_columns, (name, *result))))

result_df = pd.DataFrame(records, columns=result_columns)

In [9]:
from IPython.display import display

# Render the comparison table as rich output in the notebook.
display(result_df)
# Persist the same table next to the notebook; the row index is not written.
result_df.to_csv(path_or_buf='result.csv', index=False)

Unnamed: 0,model_name,training_time,train_mse,train_r2,train_adj_r2,test_mse,test_r2,test_adj_r2
0,LinearRegression,0.013987,0.359732,0.623313,0.623025,0.349395,0.633585,0.632468
1,Lasso,0.039748,0.913391,0.043558,0.042827,0.912354,0.043204,0.040287
2,Ridge,0.005322,0.359818,0.623223,0.622935,0.348935,0.634068,0.632952
3,SVR,4.891536,0.879537,0.079008,0.078305,0.884729,0.072174,0.069345
4,KNeighborsRegressor,0.016133,0.634212,0.335895,0.335388,0.972041,-0.019391,-0.022499
5,DecisionTreeRegressor,0.146256,0.0,1.0,1.0,0.360893,0.621527,0.620373
6,GradientBoostingRegressor,3.250647,0.19816,0.7925,0.792342,0.210284,0.779473,0.778801
