In [96]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import pandas as pd
import scipy
import torch
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

In [132]:
# Part-one: linear regression
# Fits an ordinary least-squares model on the 50_Startups data and reports
# R^2 and MSE on the training split and on a held-out 20% test split.
from sklearn.linear_model import LinearRegression
# load data
dataset = pd.read_csv('50_Startups.csv')
# Last column is the regression target, all preceding columns are features.
# Slice the frame *before* taking `.values`: `dataset.values` on a frame that
# mixes text and numeric columns is an object array, which would make the
# numeric target object-typed as well.
target = dataset.iloc[:, -1].values
inputs = dataset.iloc[:, :-1].values
# column_name = dataset.columns

# prepare data: 
# one-hot encoding categorical data (feature column 3); the encoded columns
# are placed first and the untouched remaining columns are appended after them
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
inputs = ct.fit_transform(inputs)
# normalise input data
# inputs = normalize(inputs, axis=0)
# split data: 80% train / 20% test, shuffled with a fixed seed for repeatability
inputs_train, inputs_test, target_train, target_test = train_test_split(inputs, target, test_size = 0.2, random_state = 0)

# model
regressor = LinearRegression()
regressor.fit(inputs_train, target_train)
print("Coefficients: \n", regressor.coef_)
# sensitivity
# header =['California','Florida','New York','R&D Spend','Administration','Marketing Spend']
# fig = plt.figure(figsize=(8, 4))
# ax = fig.add_axes([0,0,1,1])
# ax.bar(header,regressor.coef_)
# plt.show()

# eval result
target_train_p = regressor.predict(inputs_train)
target_test_p = regressor.predict(inputs_test)
# Predictions for every sample, in the same row order as `inputs` / `target`.
# train_test_split shuffles the rows, so concatenating the train and test
# predictions would yield them in split order and they could not be paired
# row by row with `inputs` (as the response-surface plot below does).
target_p = regressor.predict(inputs)

Rsq_train = metrics.r2_score(target_train, target_train_p)
Rsq_test = metrics.r2_score(target_test, target_test_p)
print("R_sqrd_train = %.4f || R_sqrd_test = %.4f" % (Rsq_train, Rsq_test))

MSE_train = metrics.mean_squared_error(target_train, target_train_p)
MSE_test = metrics.mean_squared_error(target_test, target_test_p)
print("MSE_train = %.4f || MSE_test = %.4f" % (MSE_train, MSE_test))

# response surface
# fig = plt.figure(figsize=(9, 8))
# ax = plt.axes(projection ="3d")
# sctt = ax.scatter3D(inputs[:,3], inputs[:,4], target_p, s=30,
#             alpha = 0.8, c = target_p, cmap = 'hsv', marker ='o')
# plt.title("50 Startups")
# ax.set_xlabel('R&D Spend $', fontweight ='bold', fontsize=15, rotation=0)
# ax.set_ylabel('Administration $', fontweight ='bold', fontsize=15, rotation=0)
# ax.set_zlabel('Profit $', fontweight ='bold', fontsize=15, rotation=0)
# # fig.colorbar(sctt, ax = ax, shrink = 0.5, aspect = 5)
# plt.show()

Coefficients: 
 [ 8.66383692e+01 -8.72645791e+02  7.86007422e+02  7.73467193e-01
  3.28845975e-02  3.66100259e-02]
R_sqrd_train = 0.9502 || R_sqrd_test = 0.9347
MSE_train = 81571001.8008 || MSE_test = 83502864.0326


In [144]:
# Part-two: polynomial regression
# Expands the encoded features to all degree-2 polynomial terms, fits a linear
# model on them and reports R^2 and MSE on the train and test splits.
from sklearn.linear_model import LinearRegression  # imported here so the cell does not rely on the Part-one cell for it
from sklearn.preprocessing import PolynomialFeatures

# load data
dataset = pd.read_csv('50_Startups.csv')
# Last column is the regression target, all preceding columns are features.
# Slice the frame *before* taking `.values`: `dataset.values` on a frame that
# mixes text and numeric columns is an object array, which would make the
# numeric target object-typed as well.
target = dataset.iloc[:, -1].values
inputs = dataset.iloc[:, :-1].values

# prepare data
# one-hot encode the categorical feature (column 3), pass the rest through
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
inputs = ct.fit_transform(inputs)
# NOTE(review): include_bias=True adds a constant column while LinearRegression
# fits its own intercept by default, so that column looks redundant; consider
# include_bias=False (this would drop one entry from the printed coefficients).
poly = PolynomialFeatures(degree = 2, interaction_only=False, include_bias=True)
poly_inputs = poly.fit_transform(inputs)
# 80% train / 20% test, shuffled with a fixed seed for repeatability
poly_inputs_train, poly_inputs_test, target_train, target_test = train_test_split(poly_inputs, target, test_size = 0.2, random_state = 0)

# model
regressor = LinearRegression()
regressor.fit(poly_inputs_train, target_train)
print("Coefficients: \n", regressor.coef_)

# plot result
target_train_p = regressor.predict(poly_inputs_train)
target_test_p = regressor.predict(poly_inputs_test)
# Predictions for every sample, in the same row order as `poly_inputs` /
# `target`. train_test_split shuffles the rows, so concatenating the train and
# test predictions would yield them in split order, which cannot be paired
# row by row with the original data when plotting.
target_p = regressor.predict(poly_inputs)

Rsq_train = metrics.r2_score(target_train, target_train_p)
Rsq_test = metrics.r2_score(target_test, target_test_p)
print("R_sqrd_train = %.4f || R_sqrd_test = %.4f" % (Rsq_train, Rsq_test))

MSE_train = metrics.mean_squared_error(target_train, target_train_p)
MSE_test = metrics.mean_squared_error(target_test, target_test_p)
print("MSE_train = %.4f || MSE_test = %.4f" % (MSE_train, MSE_test))

Coefficients: 
 [ 2.24618753e-01 -6.98446296e+03  3.34563694e+03  3.63888076e+03
  4.24714487e-01  2.60492458e-01  1.71472340e-01 -6.98440746e+03
 -4.16535386e-07  4.45579872e-07  2.14756790e-01  1.11022944e-01
  7.71986904e-02  3.34553696e+03  1.36508334e-08  5.52887013e-02
  9.35970354e-02  3.70039347e-02  3.63887050e+03  7.74374710e-02
  6.32999534e-02  5.37813659e-02 -2.32329836e-06  2.68190279e-06
  8.24683450e-07 -5.27029967e-07 -1.88251783e-06  3.15980845e-09]
R_sqrd_train = 0.9655 || R_sqrd_test = 0.9074
MSE_train = 56457075.4972 || MSE_test = 118379856.8127
