In [4]:
# Linear Regression:
import pandas as pd
from scipy.stats import linregress 

# Read the auto-mpg dataset (path is relative to the notebook's working
# directory) and preview the first five rows as the cell output.
auto = pd.read_csv(filepath_or_buffer='auto-mpg.csv')
auto.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horse_power,weight,acceleration,model_year,car_name
0,18.0,8,307.0,130.0,3504,12.0,70,"\t""chevrolet chevelle malibu"""
1,15.0,8,350.0,165.0,3693,11.5,70,"\t""buick skylark 320"""
2,18.0,8,318.0,150.0,3436,11.0,70,"\t""plymouth satellite"""
3,16.0,8,304.0,150.0,3433,12.0,70,"\t""amc rebel sst"""
4,17.0,8,302.0,140.0,3449,10.5,70,"\t""ford torino"""


In [5]:
# Simple least-squares fit with SciPy: mpg (response) against acceleration (predictor).
# The result unpacks into slope, intercept, correlation coefficient, p-value
# and the standard error of the slope.
slope, intercept, r_value, p_value, std_err = linregress(
    x=auto['acceleration'],
    y=auto['mpg'],
)

# Show all five fit statistics as the cell output.
(slope, intercept, r_value, p_value, std_err)

(1.1912045293502276,
 4.969793004253905,
 0.42028891210165076,
 1.8230915350787203e-18,
 0.129236432831014)

In [8]:
#Linear Regression using statsmodels:

import statsmodels.api as sm

# OLS does not add an intercept on its own, so prepend a constant column
# to the single predictor before fitting.
X = sm.add_constant(auto['acceleration'])
Y = auto['mpg']

# Fit mpg ~ const + acceleration and keep the in-sample fitted values.
model = sm.OLS(endog=Y, exog=X).fit()
predictions = model.predict(X)

# Leave the summary as the last expression so the notebook renders it richly.
model.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.177
Model:,OLS,Adj. R-squared:,0.175
Method:,Least Squares,F-statistic:,84.96
Date:,"Wed, 25 Sep 2019",Prob (F-statistic):,1.82e-18
Time:,21:05:49,Log-Likelihood:,-1343.9
No. Observations:,398,AIC:,2692.0
Df Residuals:,396,BIC:,2700.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.9698,2.043,2.432,0.015,0.953,8.987
acceleration,1.1912,0.129,9.217,0.000,0.937,1.445

0,1,2,3
Omnibus:,17.459,Durbin-Watson:,0.677
Prob(Omnibus):,0.0,Jarque-Bera (JB):,18.214
Skew:,0.497,Prob(JB):,0.000111
Kurtosis:,2.67,Cond. No.,91.1


In [7]:
import numpy as np
mu, sigma = 0, 1  # mean and standard deviation of the noise regressors
N_NOISE_COLUMNS = 20  # number of pure-noise predictors appended to the design matrix
SEED = 42  # fixed seed so the noise columns (and the summary printed below) are reproducible

# Seed NumPy's global generator: without it every run of this cell produces
# different noise columns and therefore a different regression summary.
np.random.seed(SEED)

# Linear Regression using statsmodels, padded with irrelevant predictors:
# the extra columns are random draws generated independently of the data,
# so this cell shows how the fit statistics react to adding regressors
# that carry no information about mpg.

import statsmodels.api as sm

X = sm.add_constant(auto.acceleration)  # We must add the intercept using the add_constant function
Y = auto.mpg

for x in range(N_NOISE_COLUMNS):
    # One N(mu, sigma) draw per observation; the column is named after its index.
    s = np.random.normal(mu, sigma, X.shape[0])
    X[str(x)] = s


model = sm.OLS(Y, X).fit()
predictions = model.predict(X)

print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.227
Model:                            OLS   Adj. R-squared:                  0.184
Method:                 Least Squares   F-statistic:                     5.260
Date:                Wed, 25 Sep 2019   Prob (F-statistic):           3.96e-12
Time:                        21:03:54   Log-Likelihood:                -1331.3
No. Observations:                 398   AIC:                             2707.
Df Residuals:                     376   BIC:                             2794.
Df Model:                          21                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            4.5193      2.076      2.177   

In [5]:
# Multiple regression: explain mpg with three predictors plus an intercept.
predictors = ['cylinders', 'weight', 'acceleration']
X = sm.add_constant(auto[predictors])  # adding a constant
Y = auto['mpg']

# Fit the model and keep the in-sample fitted values.
model = sm.OLS(endog=Y, exog=X).fit()
predictions = model.predict(X)

# Print the plain-text regression report.
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.700
Model:                            OLS   Adj. R-squared:                  0.698
Method:                 Least Squares   F-statistic:                     306.7
Date:                Mon, 23 Sep 2019   Prob (F-statistic):          1.14e-102
Time:                        12:19:18   Log-Likelihood:                -1142.9
No. Observations:                 398   AIC:                             2294.
Df Residuals:                     394   BIC:                             2310.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           42.3811      1.960     21.627   

In [None]:
# When we first look at the regression results, we look at the F-statistic and Prob (F-statistic).
# Prob (F-statistic) is the p-value of the F-test: the lower it is, the stronger the evidence against
# the null hypothesis that all slope coefficients are zero. The F-statistic is the F-test result.
# An F-test is any statistical test in which the test statistic has an F-distribution under the null hypothesis. 
# It is most often used when comparing statistical models that have been fitted to a data set, in order to 
# identify the model that best fits the population from which the data were sampled. 

# We also need to look at the R-squared value; the closer it is to 1, the larger the share of the
# variance in the dependent variable that the model explains.
# Adjusted R-squared is helpful when we have a lot of variables. Plain R-squared never decreases when a
# regressor is added (even pure noise), so it is more reliable to look at Adjusted R-squared.