## Linear Regression - Car Price Prediction

We are required to model the price of cars with the predictor variable that are part of the Car Sales transactions. It will be used by the company to predict the prices for new cars coming in for sale. 

In [1]:
import warnings
warnings.filterwarnings('ignore')

#importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

##  Exploratory Data Analysis and Pre-processing Steps

In [2]:
cars = pd.read_csv("CarPrice_Assignment.csv")
cars

In [3]:
cars.shape

In [4]:
cars.info()

In [5]:
cars['CarName'].value_counts()

In [6]:
"hello i am sivam".split(" ")[0]

### Data Pre-processing steps

In [7]:
# Splitting company name from CarName column

CompanyName = cars['CarName'].apply(lambda x : x.split(' ')[0])
cars.insert(3,"CompanyName",CompanyName)
cars.drop(['CarName'],axis=1,inplace=True)
cars.head()

In [8]:
cars.CompanyName.unique()

##### Fixing invalid values
- There seems to be some spelling error in the CompanyName column.

    - `maxda` = `mazda`
    - `Nissan` = `nissan`
    - `porsche` = `porcshce`
    - `toyota` = `toyouta`
    - `vokswagen` = `volkswagen` =  `vw`

In [9]:
cars.CompanyName = cars.CompanyName.str.lower()

def replace_name(a,b):
    cars.CompanyName.replace(a,b,inplace=True)

replace_name('alfa-romero','alfa-romeo')
replace_name('maxda','mazda')
replace_name('porcshce','porsche')
replace_name('toyouta','toyota')
replace_name('vokswagen','volkswagen')
replace_name('vw','volkswagen')

cars.CompanyName.unique()

In [10]:
cars.columns

In [11]:
cars.head()

## Feature Engineering

In [12]:
# Derive New field - Fuel economy

cars['fueleconomy'] = (0.55 * cars['citympg']) + (0.45 * cars['highwaympg'])

In [13]:
cars['price'] = cars['price'].astype('int')

In [14]:
temp = cars.copy()

In [15]:
table = temp.groupby(['CompanyName'])['price'].mean()
table.reset_index()

In [16]:
# Binning the Car Companies based on avg prices of each Company.

cars['price'] = cars['price'].astype('float')
temp = cars.copy()
table = temp.groupby(['CompanyName'])['price'].mean()
temp = temp.merge(table.reset_index(), how='left',on='CompanyName')
print(temp.head())
bins = [0,10000,20000,40000]  
cars_bin=['Budget','Medium','Highend']
cars['carsrange'] = pd.cut(temp['price_y'],bins,right=False,labels=cars_bin)
cars.head()

In [17]:
cars[["CompanyName","carsrange"]].head()

In [18]:
# Check the corr values of final list of variables
cor = cars.corr()
cor

## Find High Correlations between Features

In [19]:
# Find out the Fields with high correlation

correlated_features = set()
for i in range(len(cor.columns)):
    for j in range(i):
        if abs(cor.iloc[i, j]) > 0.8:
            colname1 = cor.columns[i]
            colname2 = cor.columns[j]
            print(abs(cor.iloc[i, j]), "--", i, '--', j, '--', colname1, '--', colname2)
            correlated_features.add(colname1)
            correlated_features.add(colname2)

In [20]:
print(cor.columns)
print('------')
print(correlated_features)

In [21]:
cor['highwaympg']['citympg']

In [22]:
cor.loc["highwaympg",'citympg']

In [23]:
# Print the Correlation values of the High Correlated fields

corh = cars[correlated_features].corr()
corh

In [24]:
# Produce a Heatmap

plt.figure(figsize=(14,14)) 
sns.heatmap(corh, annot=True, linewidths=.5, fmt=".2f", cmap="YlGnBu")

### Feature Engineering: Create Dummy Variables for Ordinal Variables

In [25]:
print(cars['fueltype'].unique())
print(cars['aspiration'].unique())
print(cars['carbody'].unique())
print(cars['drivewheel'].unique())
print(cars['enginetype'].unique())
print(cars['cylindernumber'].unique())
print(cars['carsrange'].unique())
print(cars['fuelsystem'].unique())
print(cars['CompanyName'].unique())
print(cars['doornumber'].unique())
print(cars['enginelocation'].unique())

In [26]:
pd.get_dummies(cars['carbody'],drop_first=True)

In [27]:
# Defining the map function

def dummies(x,df):
    temp = pd.get_dummies(df[x], drop_first = True)
    df = pd.concat([df, temp], axis = 1)
    df.drop([x], axis = 1, inplace = True)
    return df

# Applying the function to the cars_lr

cars_lr = cars
cars_lr = dummies('fueltype',cars_lr)
cars_lr = dummies('aspiration',cars_lr)
cars_lr = dummies('carbody',cars_lr)
cars_lr = dummies('drivewheel',cars_lr)
cars_lr = dummies('enginetype',cars_lr)
cars_lr = dummies('cylindernumber',cars_lr)
cars_lr = dummies('carsrange',cars_lr)
cars_lr = dummies('CompanyName',cars_lr)
cars_lr = dummies('doornumber',cars_lr)
cars_lr = dummies('enginelocation',cars_lr)
cars_lr = dummies('fuelsystem',cars_lr)

In [28]:
cars_lr.info()

In [29]:
cars_lr.shape

## Train-Test Split and Feature Scaling

In [30]:
from sklearn.model_selection import train_test_split



In [31]:
np.random.seed(0)
df_train, df_test = train_test_split(cars_lr, train_size = 0.7, test_size = 0.3, random_state = 100)

In [32]:
df_train.head()

In [33]:
df_test.head()

In [34]:
from sklearn.preprocessing import MinMaxScaler

In [35]:
scaler=MinMaxScaler()
num_vars=['wheelbase', 'carheight', 'stroke', 'curbweight', 'enginesize', 'boreratio', 'horsepower','fueleconomy','carlength','carwidth','price']
scaler.fit(df_train[num_vars])
scaler.transform(df_train[num_vars])

In [36]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
num_vars = ['wheelbase', 'carheight', 'stroke', 'curbweight', 'enginesize', 'boreratio', 'horsepower','fueleconomy','carlength','carwidth','price']
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])

In [37]:
df_train.head()

In [38]:
#Dividing data into X and y variables
y_train = df_train.pop('price')
X_train = df_train
#X_train=df_train.drop(["price"],axis=1)
#y_train=df_train['price']
X_train

In [39]:
y_train

## Model Building

In [40]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [41]:
def build_model(X,y):
    X = sm.add_constant(X) # Adding the constant sm linear equation=y=m1x1+m2x2+....no constant
    lm = sm.OLS(y,X).fit() # fitting the model
    print(lm.summary())    # model summary
    return lm
    
def checkVIF(X):
    vif = pd.DataFrame()
    vif['Features'] = X.columns
    vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif['VIF'] = round(vif['VIF'], 2)
    vif = vif.sort_values(by = "VIF", ascending = False)
    return(vif)
def vifvalue(x):
    col=x.columns
    print(col.shape[0])
    for i in range(0,col.shape[0]):
        vif=pd.DataFrame(columns=["feature","VIF"])
        x_vif=x.drop(col[i],axis=1)
        y_vif=x[col[i]]
        r2_value=sm.OLS(y_vif,x_vif).fit().rsquared
        vif_value=round(1/(1-r2_value),2)
        vif.loc[i]=[col[i],vif_value]
    return vif.sort_values(by="VIF",ascending=False)
        

In [42]:
checkVIF(X_train_new)

In [None]:
vifvalue(X_train_new)

### Create the first model and clean up features with Colinearity

In [None]:
model1 = build_model(X_train, y_train)

In [None]:
X_train.columns

In [None]:
X_train1 = X_train.drop(
['car_ID', 'wheelbase', 'carlength', 'compressionratio', 'horsepower', 'citympg', 'highwaympg', 'hardtop',
 'sedan','wagon','fwd','rwd','dohcv','l','ohc','ohcf','ohcv','six','Medium',
'audi','buick','chevrolet','dodge','honda','isuzu','jaguar','mazda','nissan','porsche','renault','saab',
 'subaru','toyota','volkswagen','volvo','two','2bbl','4bbl','idi','mfi','mpfi','spdi'], axis=1)

#### Re-create the model after dropping the columns with 'P>|t|' > 0.05

In [None]:
model2 = build_model(X_train1, y_train)

### Feature Elimination using RFE (Recursive Feature Elimination)
Recursive feature elimination (RFE) is a feature selection method that fits a model and removes the weakest feature (or features) until the specified number of features is reached. Features are ranked by the model’s coef_ or feature_importances_ attributes, and by recursively eliminating a small number of features per loop, RFE attempts to eliminate dependencies and collinearity that may exist in the model.

In [None]:
lm = LinearRegression()
lm.fit(X_train1,y_train)
rfe = RFE(estimator=LinearRegression(), n_features_to_select=10)
rfe = rfe.fit(X_train1, y_train)
dir(rfe)

In [None]:
rfe.support_

In [None]:
rfe.ranking_

In [None]:
X_train1.columns[rfe.support_]

In [None]:
X_train1.columns[rfe.support_]

In [None]:
rfe.support_

In [None]:
X_train1.columns

#### Building model using statsmodel, for the detailed statistics

In [None]:
X_train_rfe = X_train[X_train1.columns[rfe.support_]]
X_train_rfe.head()

#### Create Model using RFE returned Features

In [None]:
model4 = build_model(X_train_rfe,y_train)

p-vale of `twelve` seems to be higher than the significance value of 0.05, hence dropping it as it is insignificant in presence of other variables.

In [None]:
X_train_new = X_train_rfe.drop(["twelve"], axis = 1)

#### Recreate Model

In [None]:
model5 = build_model(X_train_new, y_train)

In [None]:
X_train_new = sm.add_constant(X_train_new) # Adding the constant
y_train_pred = model5.predict(X_train_new)

## Prediction using Test Data

In [None]:
df_test

In [None]:
# Scaling the test set

num_vars = ['carwidth', 'curbweight', 'enginesize', 'boreratio', 'price', 'rotor', 'three', 'Highend', 'bmw', 'rear']

df_test1 = pd.DataFrame(scaler.fit_transform(df_test[num_vars]), columns=num_vars) 

In [None]:
df_test1

In [None]:
#Dividing into X and y

y_test = df_test1.pop('price')
X_test = df_test1

In [None]:
y_test

In [None]:
# Now let's use our model to make predictions.

X_test_new = pd.DataFrame(sm.add_constant(X_test))

In [None]:
X_test_new

In [None]:
# Making predictions
y_pred = model5.predict(X_test_new)
y_pred

In [None]:
y_test

### Model Performance Comparisn between Train and Test (Generalisation)

In [None]:
from sklearn.metrics import r2_score
print("Test Prediction R-Sqrd: ", r2_score(y_test, y_pred))

In [None]:
print("Train Prediction R-Sqrd: ", r2_score(y_train, y_train_pred))

#### Inference :

- R-sqaured value for Training data is `92.5%` and for Test Data it is `86%`.
- This scores are fairly decent, however, you can investigate further to achieve better generalisation. 

### Our Final Model

In [None]:
print(model5.summary())

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
poly=PolynomialFeatures(degree=2)

In [None]:
X_poly=poly.fit_transform(X_train_new)

In [None]:
X_poly

In [None]:
lr=LinearRegression()

In [None]:
lr.fit(X_poly,y_train)

In [None]:
X_text_poly=poly.transform(X_test_new)

In [None]:
y_pred_final=lr.predict(X_text_poly)

In [None]:
r2_score(y_test,y_pred_final)

In [None]:
from sklearn.linear_model import Ridge

In [None]:
1.# draw a graph representing relationship between alpha parameter and performance of model for both ridge and lasso regression.(alpha-0-10)
2.#create model using ridge and lasso regression and compare there performance

In [None]:
"""for i in range (1,11):
    ridge=[]
    lasso[]
    ridge model
    lasso model(alpha=i)
    fit predict
    r2"""

In [None]:
"""ridge[performance for all values of alpha]
lasso[]"""
