In [4]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import pandas as pd
import scipy
import torch
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Part-one: linear regression
from sklearn.linear_model import LinearRegression
# load data
dataset = pd.read_csv('50_Startups.csv')
target = dataset.values[:,-1]
inputs = dataset.values[:,:-1]
# column_name = dataset.columns

# prepare data: 
# one-hot encoding categorical data
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
inputs = ct.fit_transform(inputs)
# normalise input data
# inputs = normalize(inputs, axis=0)
# split data
inputs_train, inputs_test, target_train, target_test = train_test_split(inputs, target, test_size = 0.2, random_state = 0)

# model
regressor = LinearRegression()
regressor.fit(inputs_train, target_train)
print("Coefficients: \n", regressor.coef_)
# sensitivity
# header =['California','Florida','New York','R&D Spend','Administration','Marketing Spend']
# fig = plt.figure(figsize=(8, 4))
# ax = fig.add_axes([0,0,1,1])
# ax.bar(header,regressor.coef_)
# plt.show()

# eval result
target_train_p = regressor.predict(inputs_train)
target_test_p = regressor.predict(inputs_test)
target_p = np.concatenate((target_train_p,target_test_p), axis=0)

Rsq_train = metrics.r2_score(target_train, target_train_p)
Rsq_test = metrics.r2_score(target_test, target_test_p)
print("R_sqrd_train = %.4f || R_sqrd_test = %.4f" % (Rsq_train, Rsq_test))

MSE_train = metrics.mean_squared_error(target_train, target_train_p)
MSE_test = metrics.mean_squared_error(target_test, target_test_p)
print("MSE_train = %.4f || MSE_test = %.4f" % (MSE_train, MSE_test))

# response surface
# fig = plt.figure(figsize=(9, 8))
# ax = plt.axes(projection ="3d")
# sctt = ax.scatter3D(inputs[:,3], inputs[:,4], target_p, s=30,
#             alpha = 0.8, c = target_p, cmap = 'hsv', marker ='o')
# plt.title("50 Startups")
# ax.set_xlabel('R&D Spend $', fontweight ='bold', fontsize=15, rotation=0)
# ax.set_ylabel('Administration $', fontweight ='bold', fontsize=15, rotation=0)
# ax.set_zlabel('Profit $', fontweight ='bold', fontsize=15, rotation=0)
# # fig.colorbar(sctt, ax = ax, shrink = 0.5, aspect = 5)
# plt.show()

Coefficients: 
 [ 8.66383692e+01 -8.72645791e+02  7.86007422e+02  7.73467193e-01
  3.28845975e-02  3.66100259e-02]
R_sqrd_train = 0.9502 || R_sqrd_test = 0.9347
MSE_train = 81571001.8008 || MSE_test = 83502864.0326


In [29]:
# Part-two: polynomial regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# load data
dataset = pd.read_csv('50_Startups.csv')
target = dataset.values[:,-1]
inputs = dataset.values[:,:-1]

# prepare data
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
inputs = ct.fit_transform(inputs)
inputs_train, inputs_test, target_train, target_test = train_test_split(inputs, target, test_size = 0.2, random_state = 0)
poly_reg = Pipeline([
    ("poly", PolynomialFeatures(degree=2)),
    ("std_scaler", StandardScaler()),
    ("lin_reg", LinearRegression())
])

poly_reg.fit(inputs_train, target_train)
y_predict = poly_reg.predict(inputs_test)
print("Coefficients: \n", poly_reg.named_steps['lin_reg'].coef_)

# plot result
target_train_p = poly_reg.predict(inputs_train)
target_test_p = poly_reg.predict(inputs_test)
target_p = np.concatenate((target_train_p,target_test_p), axis=0)

Rsq_train = metrics.r2_score(target_train, target_train_p)
Rsq_test = metrics.r2_score(target_test, target_test_p)
print("R_sqrd_train = %.4f || R_sqrd_test = %.4f" % (Rsq_train, Rsq_test))

MSE_train = metrics.mean_squared_error(target_train, target_train_p)
MSE_test = metrics.mean_squared_error(target_test, target_test_p)
print("MSE_train = %.4f || MSE_test = %.4f" % (MSE_train, MSE_test))

Coefficients: 
 [ 4.64624059e-11 -3.25627815e+03  1.59488277e+03  1.89663972e+03
  1.80061105e+04  8.52175338e+03  2.06744592e+04 -3.25627815e+03
  1.09139364e-11  3.63797881e-12  1.05082411e+04  2.64547802e+03
  9.21842211e+03  1.59488277e+03  3.63797881e-12  3.29450666e+03
  1.37007205e+03  4.46444963e+03  1.89663972e+03  5.49698114e+03
 -3.29193558e+02  7.67859260e+03 -1.70505449e+04  1.76758898e+04
  1.59174229e+04 -3.07438983e+03 -3.19986035e+04  1.66925447e+02]
R_sqrd_train = 0.9655 || R_sqrd_test = 0.9074
MSE_train = 56457075.4956 || MSE_test = 118378941.1371


In [20]:
# feature_names = regressor.get_feature_names_out()

# coefs = pd.DataFrame(
#     regressor.coef_,
#     columns=["Coefficients"],
#     index=feature_names,
# )

# coefs
print(list(zip(regressor.coef_, regressor.feature_names_in_)))

[0.22461875302759032, -6984.462961614919, 3345.6369359236005, 3638.880763853216, 0.424714486976668, 0.26049245809881577, 0.17147234018873467, -6984.407464688569, -4.165353857388254e-07, 4.4557987166626845e-07, 0.2147567902879512, 0.11102294408160233, 0.07719869041475944, 3345.536960334557, 1.3650833352585323e-08, 0.05528870134310309, 0.09359703537767239, 0.03700393469187236, 3638.870503243771, 0.07743747099631416, 0.06329995338775919, 0.05378136594726685, -2.323298358138162e-06, 2.6819027931669167e-06, 8.246834495523684e-07, -5.270299669165385e-07, -1.8825178337305204e-06, 3.159808450153218e-09]
