# Objective of Automobile Mileage Prediction Project
Build a predictive model to predict the mileage of cars based on the given input variables.

In [None]:
#import libraries
#----------------------------------------------
#For data preparation and analysis
import pandas as pd
#For creating plots
import matplotlib.pyplot as plt
#For distribution plots and heatmap
import seaborn as sns
#For creating training and test samples
from sklearn.model_selection import train_test_split
#Feature selection (to select significant variables)
from sklearn.feature_selection import SelectKBest,f_regression
#For building linear regression model
from sklearn.linear_model import LinearRegression


In [None]:
#import automobile data
# NOTE(review): this is an absolute path on one user's machine; the notebook
# will not run elsewhere until it points at a local copy of the CSV.
df=pd.read_csv(r"C:\Users\jgmk2\OneDrive\Desktop\introtallent\PYTHON\Data Files used in Projects\Data Files used in Projects\automobile data.csv")
df

# UNDERSTAND DATA

In [None]:
df.head(3)

In [None]:
df.columns

# VARIABLE DESCRIPTION

* Target variable (y): MPG (miles per gallon)
                    
                    
  * Cylinders 
  * Displacement 
  * Horsepower  
  * Weight
  * Acceleration 
  * Model_year
  * Origin
  * Car_Name
      

In [None]:
df.tail()

In [None]:
df.shape
#Observations:369
#Variables:9


In [None]:
#Check the datatypes
df.dtypes

In [None]:
#Horsepower is a numeric variable, but in the df it is stored as categorical (object),
#so we need to change the data type of Horsepower.
#errors='coerce' turns every value that cannot be parsed as a number into NaN.

df['Horsepower']=pd.to_numeric(df['Horsepower'],errors='coerce')

In [None]:
#Check data types again
df.dtypes

In [None]:
#Descriptive statistics
df.describe()

In [None]:
#Check missing values
df.isnull().sum()

In [None]:
#There are 6 missing values in Horsepower


In [None]:
#Missing values imputing
# .median() must be *called*: passing the bound method itself (no parentheses)
# does not fill the gaps with the median value.
df['Horsepower']=df['Horsepower'].fillna(df['Horsepower'].median())
# Drop any rows that still contain a missing value in another column
df=df.dropna(axis=0)

In [None]:
#Check once again
df.isnull().sum()

In [None]:
#Checking outlier


In [None]:
plt.boxplot(df['MPG']) #NO OUTLIERS
plt.show()

In [None]:
plt.boxplot(df['Cylinders']) #NO OUTLIERS
plt.show()

In [None]:
plt.boxplot(df['Displacement']) #NO OUTLIERS
plt.show()

In [None]:
# NOTE(review): this cell repeats the Displacement boxplot from the previous cell.
plt.boxplot(df['Displacement']) #NO OUTLIERS
plt.show()

In [None]:
plt.boxplot(df['Weight']) #NO OUTLIERS
plt.show()

In [None]:
plt.boxplot(df['Acceleration'])
plt.show()


In [None]:
def remove_outliers(d,c,factor=1.5):
    """Return the rows of ``d`` whose value in column ``c`` is not an outlier.

    Outliers are detected with the inter-quartile-range (IQR) rule: a value
    is kept when it lies inside ``[Q1 - factor*IQR, Q3 + factor*IQR]``
    (both bounds inclusive).

    Parameters
    ----------
    d : pandas.DataFrame
        Data to filter. It is not modified; a filtered frame is returned.
    c : str
        Name of the numeric column to check for outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR to build the fences. The default
        reproduces the usual boxplot-whisker rule.

    Returns
    -------
    pandas.DataFrame
        The rows of ``d`` inside the fences, with their original index
        labels. Rows where ``d[c]`` is NaN are not kept, because NaN never
        satisfies the range comparison.
    """
    column = d[c]

    # Find Q1 and Q3
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)

    # Calculate the IQR
    iqr = q3 - q1

    # Find the lower and upper bounds
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Keep only the good data (i.e. eliminate outliers); between() is
    # inclusive on both ends, matching `>= lower` and `<= upper`.
    return d[column.between(lower_bound, upper_bound)]
df=remove_outliers(df,'Acceleration')
plt.boxplot(df['Acceleration'])
plt.show()

In [None]:
plt.boxplot(df['Model_year']) #NO OUTLIERS
plt.show()

In [None]:
plt.boxplot(df['Origin']) #NO OUTLIERS
plt.show()

In [None]:
df['Horsepower']

In [None]:
df['Horsepower']=pd.to_numeric(df['Horsepower'],errors='coerce')
plt.boxplot(df['Horsepower'])
plt.show()

# EDA(Exploratory Data Analysis)
* Distribution
* Data Mix
* Correlation

In [None]:
df.columns

In [None]:
df.shape

In [None]:
#'MPG','Displacement','Horsepower','Weight','Acceleration'

In [None]:
sns.distplot(df['MPG'])

In [None]:
sns.distplot(df['Displacement'])

In [None]:
sns.distplot(df['Horsepower'])

In [None]:
sns.distplot(df['Weight'])

In [None]:
sns.distplot(df['Acceleration'])

In [None]:
#Check Datamix


In [None]:
df.groupby('Cylinders')['Cylinders'].count().plot(kind='bar')

In [None]:
df.groupby('Model_year')['Model_year'].count().plot(kind='bar')

In [None]:
df.groupby('Origin')['Origin'].count().plot(kind='bar')

In [None]:
df.groupby('Car_Name')['Car_Name'].count().plot(kind='bar')

# PEARSON CORRELATION

In [None]:
#create a set of numeric columns
df_numeric=df.select_dtypes(include=['int64','float64'])
df_numeric.head()

In [None]:
#df_numeric has categorical variables that we need to drop (Cylinders, Model_year, Origin)

In [None]:
df_numeric=df_numeric.drop(['Cylinders', 'Model_year', 'Origin'], axis=1)
df_numeric.head()

In [None]:
#Create a heatmap
sns.heatmap(df_numeric.corr(),cmap='YlGnBu',annot=True)

In [None]:
#Using Pearson correlation test we found that key drivers(input variables) at 

In [None]:
# --------------------- End of EDA --------------------

In [None]:
#Check if there is any problem in the categorical variables,
#such as spelling differences or case-sensitive values, e.g. 'male' vs 'Male'

In [None]:
df.columns


In [None]:
df['Cylinders'].unique()

In [None]:
df['Model_year'].unique()

In [None]:
# Inspect Origin here; Model_year was already checked in the previous cell.
df['Origin'].unique()
#origin 1.US 2.GERMANY 3.JAPAN

In [None]:
df['Car_Name'].unique()

# Dummy conversion (One-Hot encoding)

In [None]:
#Remove model year as it doesnot signify anything in terms of impact on target
df=df.drop('Model_year',axis=1)

In [None]:
df.dtypes

In [None]:
#Cylindrs and origin are categorical variables stored as numeric.
#Hence we need to change the datatype of these variables to object
df['Cylinders']=df['Cylinders'].astype('object')
df['Origin']=df['Origin'].astype('object')
df.dtypes


In [None]:
#Create a new df to store categorical variables to dummy conversion
df_categorical=df.select_dtypes(include='object')
df_categorical.head()

In [None]:
#dummy conversion
# drop_first=True removes the first level of each variable, so k categories
# produce k-1 dummy columns.
# NOTE(review): df_categorical holds every object-dtype column of df, which
# presumably still includes Car_Name; one-hot encoding it would add one column
# per distinct car name - confirm this is intended.
df_dummy=pd.get_dummies(df_categorical,drop_first=True)
df_dummy.head()

In [None]:
#combine data from df_numeric and df_dummy
df_final=pd.concat([df_numeric,df_dummy], axis=1)
df_final.head()

In [None]:
#Create x and y
x=df_final.drop('MPG',axis=1)
y=df_final['MPG']

In [None]:
#from sklearn.feature_selection import RFE
#from sklearn.Linear_model import LinearRegression

In [None]:
#Training and test samples
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=.3,random_state=999)

In [None]:
#Check the sample size of all four splits (features and target, train and test)
print(xtrain.shape,xtest.shape,ytrain.shape,ytest.shape)


# FEATURE SELECTION

Select significant variables.

# VIF (Variance Inflation Factor): a score that tells us how strongly a variable is correlated with the other input variables (multicollinearity). Along with the p-value, we also check VIF to find the significant variables.

# $\mathrm{VIF} = \frac{1}{1 - R^2}$

# P VALUE: measures the strength of evidence against the null hypothesis. A variable with a p-value < 0.05 is considered a significant variable.

# While finding the significant variables manually, we should check the p-values.

In [None]:
#select significant variables
#Create key_features object to select the top k features
#key_features =selectkBest(score_func=f_regression,k='all')
key_features=SelectKBest(score_func=f_regression,k=5)# to select 5 most significant variables (5 is enough)




In [None]:
key_features

In [None]:
#fit the key_features to the training and transform it
xtrain_select=key_features.fit_transform(xtrain,ytrain)

In [None]:
xtrain.isnull().sum()

In [None]:
#Get the indices of the selected features
selected_indices=key_features.get_support()
selected_indices

In [None]:
selected_indices=key_features.get_support(indices=True)
selected_indices

In [None]:
#Get the name of the selected features
selected_features=xtrain.columns[selected_indices]


In [None]:
#Print the significant variable
selected_features

In [None]:
# Create key_features object to select the top k features
key_features = SelectKBest(score_func=f_regression, k=5) # to select 5 significant features

# Fit the key_feature to the training data and transform it
xtrain_selected= key_features.fit_transform(xtrain,ytrain)

# Get the indices of the selected features
selected_indices = key_features.get_support(indices=True)

# Get the names of selected features
selected_features = xtrain.columns[selected_indices]

In [None]:
selected_features

In [None]:
xtrain_selected=xtrain.iloc[:, selected_indices]
xtrain_selected

In [None]:
#LINEAR REGRESSION MODEL

In [None]:
# Instantiate the linear regression function
linreg=LinearRegression()

# Fit the model using training sample
linreg.fit(xtrain_selected,ytrain)

# Print the accuracy of training model
linreg.score(xtrain_selected,ytrain)

In [None]:
# Evaluate the model on the test set using the selected features
xtest_selected = xtest.iloc[:, selected_indices]

linreg.score(xtest_selected,ytest)

In [None]:
# Print predicted Mileage
# predicted_mpg has not been assigned anywhere above this cell, so compute it
# from the selected test features before displaying it.
predicted_mpg=linreg.predict(xtest_selected)
predicted_mpg

In [None]:
# Print B0
linreg.intercept_

In [None]:
# Print B
linreg.coef_

In [None]:
# Remove multicollinear variable
# NOTE(review): no "scaled_displacement" column is created anywhere in the
# visible notebook, so dropping it raises KeyError. df_numeric holds the raw
# 'Displacement' column, which is presumably the one meant - confirm.
df_numeric=df_numeric.drop("Displacement",axis=1)
df_numeric.head()

In [None]:
df_categorical.head()

In [None]:
df_dummy.head()

In [None]:
df_final=pd.concat([df_numeric,df_dummy],axis=1)
df_final.head()

In [None]:
# Create x and y

x=df_final.drop('MPG',axis=1)
y=df_final["MPG"]

In [None]:
# Training and test sample
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.3,random_state=999)

In [None]:
# Check sample size
print(xtrain.shape,xtest.shape,ytrain.shape,ytest.shape)

# Create key_features object to select the top k features
key_features = SelectKBest(score_func=f_regression, k=5) # to select 5 significant features

# Fit the key_feature to the training data and transform it
xtrain_selected= key_features.fit_transform(xtrain,ytrain)

# Get the indices of the selected features
selected_indices = key_features.get_support(indices=True)

# Get the names of selected features
selected_features = xtrain.columns[selected_indices]

In [None]:
selected_features

In [None]:
xtrain_selected=xtrain.iloc[:, selected_indices]

In [None]:
#LINEAR REGRESSION MODEL

In [None]:
# Instantiate the linear regression function
linreg=LinearRegression()

# Fit the model using training sample
linreg.fit(xtrain_selected,ytrain)

# Print the accuracy of training model
linreg.score(xtrain_selected,ytrain)

In [None]:
# Evaluate the model on the test set using the selected features
xtest_selected = xtest.iloc[:, selected_indices]

linreg.score(xtest_selected,ytest)

In [None]:
# Predict mileage based on xtest
predicted_mpg=linreg.predict(xtest_selected)

# Check accuracy of test model
linreg.score(xtest_selected,ytest)

In [None]:
# Print predicted Mileage
predicted_mpg

In [None]:
# Print B0
linreg.intercept_

In [None]:
# Print B
linreg.coef_