<a href="https://colab.research.google.com/github/LakshmiP1/DS/blob/main/multi_linear_toyota.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.regressionplots import influence_plot
import statsmodels.formula.api as smf
import numpy as np

In [None]:
# Load the Toyota Corolla data set from the CSV file into `cars`
# and preview the first rows
cars = pd.read_csv("Toyoto_Corrola.csv")
cars.head()

In [None]:
# Summary of the frame as loaded: columns, dtypes and non-null counts
cars.info()

In [None]:
# Remove the "Id" and "Model" columns; neither is used in the regression
# formula fitted further down
cars = cars.drop(columns=["Id", "Model"])

In [None]:
# Re-check the frame summary after the column drop above
cars.info()

### **Correlation Matrix**

In [None]:
# Pairwise correlation between the columns of `cars`
# NOTE(review): recent pandas versions raise here if any non-numeric column
# remains in the frame; confirm all columns are numeric or pass numeric_only=True
cars.corr()

In [None]:
# Summary statistics of the Cylinders column (the column is dropped in the next cell)
cars.Cylinders.describe()

In [None]:
# Remove the Cylinders column from the working frame
cars = cars.drop(columns='Cylinders')


In [None]:
# Frame summary after Cylinders has been removed
cars.info()

In [None]:
# Correlation matrix of the remaining columns
# NOTE(review): recent pandas versions raise here if any non-numeric column
# remains in the frame; confirm all columns are numeric or pass numeric_only=True
cars.corr()

# **Scatter plots between variables, along with histograms**

In [None]:
# Use seaborn's dark grid background, then draw the pairwise scatter plots
# (with per-variable distributions on the diagonal) for every column of `cars`
sns.set_style('darkgrid')
sns.pairplot(data=cars)



## **Preparing a model**

In [None]:
# Fit an ordinary least squares model of Price on the six predictors.
# `smf` is already imported in the first cell of the notebook.
model = smf.ols(
    formula='Price~Age_08_04+KM+HP+Doors+Gears+Weight',
    data=cars,
).fit()

In [None]:
# Estimated coefficients (intercept and one slope per predictor) of the fitted model
model.params

In [None]:
# t statistics followed by p-values for each coefficient of the fitted model
print(model.tvalues,'\n',model.pvalues)

In [None]:
# R-squared and adjusted R-squared of the fitted model, shown as a tuple
(model.rsquared,model.rsquared_adj)

In [None]:
# Variance inflation factor (VIF) of each predictor: regress that predictor on
# the other five and compute VIF = 1 / (1 - R^2). Each formula must have the
# predictor of interest on the left-hand side and must not repeat it on the right.
rsq_Age_08_04 = smf.ols('Age_08_04~KM+HP+Doors+Gears+Weight',data=cars).fit().rsquared
vif_Age_08_04 = 1/(1-rsq_Age_08_04)

rsq_KM = smf.ols('KM~Age_08_04+HP+Doors+Gears+Weight',data=cars).fit().rsquared
vif_KM = 1/(1-rsq_KM)

rsq_HP = smf.ols('HP~Age_08_04+KM+Doors+Gears+Weight',data=cars).fit().rsquared
vif_HP = 1/(1-rsq_HP)

rsq_Doors = smf.ols('Doors~Age_08_04+KM+Gears+Weight+HP',data=cars).fit().rsquared
vif_Doors = 1/(1-rsq_Doors)

rsq_Gears = smf.ols('Gears~Age_08_04+HP+KM+Doors+Weight',data=cars).fit().rsquared
vif_Gears = 1/(1-rsq_Gears)

rsq_Weight = smf.ols('Weight~Age_08_04+HP+KM+Gears+Doors',data=cars).fit().rsquared
vif_Weight = 1/(1-rsq_Weight)

# Collect the VIF values in a data frame, one row per predictor
d1 = {'Variables':['Age_08_04','KM','HP','Doors','Gears','Weight'],'VIF':[vif_Age_08_04,vif_KM,vif_HP,vif_Doors,vif_Gears,vif_Weight]}
Vif_frame = pd.DataFrame(d1)
Vif_frame

# **Residual Analysis**

# **Test for normality of residuals**

In [None]:
import statsmodels.api as sm

# Normal Q-Q plot of the model residuals. line='q' draws a reference line
# fitted through the quartiles (line='45' would draw the 45-degree diagonal).
qqplot = sm.qqplot(data=model.resid, line='q')
plt.title("Normal Q-Q plot of residuals")
plt.show()

In [None]:
# Positions of the observations whose residual is greater than 6000
list(np.where(model.resid>6000))

In [None]:
# Positions of the observations whose residual is less than -7000
list(np.where(model.resid<-7000))

# **Residual plot for homoscedasticity**

In [None]:
def get_standardized_values(vals):
    """Return `vals` centred on its mean and scaled by its standard deviation.

    The mean and standard deviation come from the object's own ``mean()`` and
    ``std()`` methods, so the result has the same container type as `vals`.
    """
    centre = vals.mean()
    spread = vals.std()
    return (vals - centre) / spread

In [None]:
# Standardized residuals against standardized fitted values
std_fitted = get_standardized_values(model.fittedvalues)
std_resid = get_standardized_values(model.resid)
plt.scatter(std_fitted, std_resid)

plt.title('Residual Plot')
plt.xlabel('Standardized Fitted values')
plt.ylabel('Standardized residual values')
plt.show()

# Residual vs Regressors

In [None]:
# Regression diagnostic plots of the fitted model against Age_08_04
fig = plt.figure(figsize=(15, 8))
fig = sm.graphics.plot_regress_exog(model, exog_idx="Age_08_04", fig=fig)
plt.show()

In [None]:
# Regression diagnostic plots of the fitted model against HP
fig = plt.figure(figsize=(15, 8))
fig = sm.graphics.plot_regress_exog(model, exog_idx="HP", fig=fig)
plt.show()

In [None]:
# Regression diagnostic plots of the fitted model against KM
fig = plt.figure(figsize=(15, 8))
fig = sm.graphics.plot_regress_exog(model, exog_idx="KM", fig=fig)
plt.show()

In [None]:
# Regression diagnostic plots of the fitted model against Doors
fig = plt.figure(figsize=(15, 8))
fig = sm.graphics.plot_regress_exog(model, exog_idx="Doors", fig=fig)
plt.show()

In [None]:
# Regression diagnostic plots of the fitted model against Weight
fig = plt.figure(figsize=(15, 8))
fig = sm.graphics.plot_regress_exog(model, exog_idx="Weight", fig=fig)
plt.show()

In [None]:
# Regression diagnostic plots of the fitted model against Gears
fig = plt.figure(figsize=(15, 8))
fig = sm.graphics.plot_regress_exog(model, exog_idx="Gears", fig=fig)
plt.show()

# **Model Deletion Diagnostics**

# **Detecting influencers or outliers**

# **Cook's Distance**

In [None]:
# Influence measures of the fitted model; `cooks_distance` is a pair and only
# its first element (the distances themselves) is kept in `c`
model_influence = model.get_influence()
c = model_influence.cooks_distance[0]

In [None]:
# Stem plot of Cook's distance (rounded to 3 decimals) against row position
fig = plt.subplots(figsize=(20, 7))
row_positions = np.arange(len(cars))
cooks_rounded = np.round(c, 3)
plt.stem(row_positions, cooks_rounded)
plt.xlabel('Row index')
plt.ylabel('Cooks Distance')
plt.show()

In [None]:
# Position and value of the observation with the largest Cook's distance
# (no threshold is applied here; only the single maximum is reported)
(np.argmax(c),np.max(c))

# High influence points

In [None]:
# Influence plot of the fitted model; `influence_plot` is already imported
# in the first cell of the notebook
influence_plot(model)
plt.show()

In [None]:
# Leverage cutoff 3*(k + 1)/n, where k is the number of regressors in the
# fitted model (intercept excluded) and n is the number of observations used
# in the fit. Both are read from the model itself: the column count of `cars`
# also includes Price and any column not in the formula, which would inflate k.
k = int(model.df_model)
n = int(model.nobs)
leverage_cutoff = 3*((k + 1)/n)
leverage_cutoff

# **From the above plot, it is evident that data points 221 and 960 are the influencers**

In [None]:
# Show the rows of `cars` whose index label is 221 or 960
cars[cars.index.isin([221,960])]

In [None]:
# Preview the first rows of `cars` to compare HP and the other variables
# with the two rows shown above
cars.head()

In [None]:
# Reload the raw CSV into a separate frame; unlike `cars`, this copy still
# contains the Id, Model and Cylinders columns dropped earlier
cars_new = pd.read_csv("Toyoto_Corrola.csv")

In [None]:
# Discard the two influential rows (positions 221 and 960) and renumber the
# remaining rows from 0. drop=True stops reset_index() from keeping the old
# index as an extra "index" column in the result.
car1=cars_new.drop(cars_new.index[[221,960]],axis=0).reset_index(drop=True)

In [None]:
# Display the frame with the two influential rows removed
car1

In [None]:
# Refit the same six-predictor model on car1, the data with the influential
# rows removed, so that its R-squared, AIC and BIC reflect the cleaned data
final_ml_P= smf.ols('Price~KM+HP+Weight+Age_08_04+Gears+Doors',data = car1).fit()

In [None]:
# R-squared, AIC and BIC of final_ml_P, shown as a tuple
(final_ml_P.rsquared,final_ml_P.aic,final_ml_P.bic)

Cook's Distance

In [None]:
# Influence measures of final_ml_P; keep only the Cook's distances (first
# element of the `cooks_distance` pair) in `c`, replacing the earlier values
model_influence_P = final_ml_P.get_influence()
c = model_influence_P.cooks_distance[0]

In [None]:
# Stem plot of the Cook's distances in `c` (rounded to 3 decimals). The x
# positions are derived from `c` itself so both arrays always have the same
# length, whichever frame the model was fitted on.
fig= plt.subplots(figsize=(20,7))
plt.stem(np.arange(len(c)),np.round(c,3))
plt.xlabel('Row index')
plt.ylabel('Cooks Distance')
plt.show()