In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy import stats

# Load data from CSV
#file_path = r 'Advertising . csv'
df = pd . read_csv ( 'Advertising.csv' )
#print ( df . head () )

# Split data into training and testing sets
X_train , X_test , y_train , y_test = train_test_split ( df [['TV' , 'radio' , 'newspaper']] , df ['sales'] , test_size =0.2 , random_state =42)

# Create linear regression object
reg = LinearRegression ()
reg.fit ( X_train , y_train )

# Predict the response for test dataset
y_pred = reg.predict ( X_test )

# for training data set
#residual sum of squares
print ("For training data set: ")
RSS = np.sum((y_train - reg.predict(X_train))**2)
print('RSS: ', RSS)

#residual standard error
RSE = np.sqrt(RSS/(len(X_train)-2))
print('RSE: ', RSE)

#Mean Squared Error
MSE2 = mean_squared_error(y_train, reg.predict(X_train))
print('MSE2: ', MSE2)

#R Squared
R2 = reg.score(X_train, y_train)
print('R2: ', R2)
print()

#Standard Error for each feature
SE = np.sqrt(np.sum((y_train - reg.predict(X_train))**2)/(len(X_train)-2) / np.sum((X_train - np.mean(X_train))**2))

#t-statistic for each feature
t = reg.coef_/SE

#p-value for each feature
p = 2*(1 - stats.t.cdf(np.abs(t), len(X_train)-2))

result_df = pd.DataFrame({'Coefficients': reg.coef_,
                           'Standard Error': SE,
                           't-statistic': t,
                           'p-value': p})
print(result_df)

# for testing data set
print ("\nFor testing data set: ")
#residual sum of squares
RSS_test = np.sum((y_test - reg.predict(X_test))**2)
print('RSS_test: ', RSS_test)

#residual standard error
RSE_test = np.sqrt(RSS_test/(len(X_test)-2))
print('RSE_test: ', RSE_test)

#Mean Squared Error
MSE_test = mean_squared_error(y_test, reg.predict(X_test))
print('MSE_test: ', MSE_test)

#R Squared
R2_test = reg.score(X_test, y_test)
print('R2_test: ', R2_test)
print()


#Standard Error for each feature
SE_test = np.sqrt(np.sum((y_test - reg.predict(X_test))**2)/(len(X_test)-2) / np.sum((X_test - np.mean(X_test))**2))

#t-statistic for each feature
t_test = reg.coef_/SE_test

#p-value for each feature
p_test = 2*(1 - stats.t.cdf(np.abs(t_test), len(X_test)-2))

result_df_test = pd.DataFrame({'Coefficients': reg.coef_,
                            'Standard Error': SE_test,
                            't-statistic': t_test,
                            'p-value': p_test})

print(result_df_test)










For training data set: 
RSS:  432.8207076930262
RSE:  1.6551046999139907
MSE2:  2.705129423081414
R2:  0.8957008271017818

           Coefficients  Standard Error  t-statistic   p-value
TV             0.044730        0.001111    40.263322  0.000000
radio          0.189195        0.002778    68.116584  0.000000
newspaper      0.002761        0.003059     0.902667  0.368077

For testing data set: 
RSS_test:  126.96389415904413
RSE_test:  1.8278826848155574
MSE_test:  3.1740973539761033
R2_test:  0.899438024100912

           Coefficients  Standard Error  t-statistic   p-value
TV             0.044730        0.002517    17.768430  0.000000
radio          0.189195        0.006846    27.635915  0.000000
newspaper      0.002761        0.007031     0.392702  0.696734


In [19]:
set_1 = np.array([25000,25000,0])
set_2 = np.array([50000,0,0])
set_3 = np.array([0,50000,0])

print('Case 1: ', reg.predict(set_1.reshape(1,-1)))
print('Case 2: ', reg.predict(set_2.reshape(1,-1)))
print('Case 3: ', reg.predict(set_3.reshape(1,-1)))



Case 1:  [5851.09335992]
Case 2:  [2239.45494077]
Case 3:  [9462.73177906]


