## Import

In [50]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Fetch Data From Kaggle

In [51]:
!pip install -q kaggle

In [None]:
# Colab-only helper, intentionally left commented out: uncomment both lines to
# upload `kaggle.json` interactively so that the later copy into ~/.kaggle works.
# from google.colab import files

# Upload the Kaggle API token file (kaggle.json)
# files.upload()

In [None]:
!mkdir ~/.kaggle

In [28]:
!cp kaggle.json ~/.kaggle/

In [29]:
# Download the CarDekho vehicle dataset with the Kaggle CLI and extract the
# archive (`--unzip`) into the current working directory.
!kaggle datasets download -d nehalbirla/vehicle-dataset-from-cardekho --unzip

Downloading vehicle-dataset-from-cardekho.zip to /content
  0% 0.00/227k [00:00<?, ?B/s]
100% 227k/227k [00:00<00:00, 67.6MB/s]


## Explore Data

In [30]:
# Load the extracted CSV into the DataFrame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer='car data.csv')

In [31]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [32]:
# Print column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [33]:
# Count missing values per column.
df.isna().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [34]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.644115,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [35]:
# Pairwise correlation between the numeric columns.
# pandas >= 2.0 no longer drops non-numeric columns automatically, so a bare
# `df.corr()` raises on the string columns (Car_Name, Fuel_Type, ...).
# Selecting the numeric dtypes first gives the same table on old and new pandas.
df.select_dtypes(include='number').corr()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
Year,1.0,0.236141,-0.047584,-0.524342,-0.182104
Selling_Price,0.236141,1.0,0.878983,0.029187,-0.088344
Present_Price,-0.047584,0.878983,1.0,0.203647,0.008057
Kms_Driven,-0.524342,0.029187,0.203647,1.0,0.089216
Owner,-0.182104,-0.088344,0.008057,0.089216,1.0


In [36]:
# Show the column labels of the DataFrame.
df.columns

Index(['Car_Name', 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'],
      dtype='object')

In [37]:
# Print the number of rows per category for each categorical column.
for column in ('Fuel_Type', 'Seller_Type', 'Transmission'):
    print(df[column].value_counts())

Petrol    239
Diesel     60
CNG         2
Name: Fuel_Type, dtype: int64
Dealer        195
Individual    106
Name: Seller_Type, dtype: int64
Manual       261
Automatic     40
Name: Transmission, dtype: int64


In [38]:
# Convenience references to the target column and the three categorical columns.
selling_price = df['Selling_Price']
fuel_type, seller_type, transmission_type = (
    df[name] for name in ('Fuel_Type', 'Seller_Type', 'Transmission')
)

In [39]:
# Keep only the rows whose Fuel_Type is 'Petrol' and summarise their numeric
# columns. This must run while Fuel_Type still holds the original string labels.
petrol_data = (
    df
    .groupby(by='Fuel_Type')
    .get_group(name='Petrol')
)
petrol_data.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,239.0,239.0,239.0,239.0,239.0
mean,2013.539749,3.264184,5.583556,33528.937238,0.050209
std,3.042674,3.135537,5.290685,40308.984886,0.270368
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.6,0.94,13850.0,0.0
50%,2014.0,2.65,4.6,25870.0,0.0
75%,2016.0,5.2,7.98,44271.0,0.0
max,2017.0,19.75,23.73,500000.0,3.0


In [40]:
# Despite its generic name, `seller_data` holds only the rows sold by a
# 'Dealer'; summarise the numeric columns of that subset.
by_seller_type = df.groupby(by='Seller_Type')
seller_data = by_seller_type.get_group(name='Dealer')
seller_data.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,195.0,195.0,195.0,195.0,195.0
mean,2013.712821,6.721692,10.886308,39850.133333,0.020513
std,2.686275,5.136088,8.806563,24860.401003,0.142111
min,2003.0,1.05,2.69,2071.0,0.0
25%,2012.0,3.75,6.58,22148.5,0.0
50%,2014.0,5.25,8.5,39485.0,0.0
75%,2016.0,7.625,13.46,51785.5,0.0
max,2018.0,35.0,92.6,197176.0,1.0


## Manual Encoding

In [41]:
# Map the fuel-type labels to integer codes, modifying `df` in place.
# Note: the codes impose an ordering (Petrol < Diesel < CNG) on the feature.
df.replace(
    to_replace={
        'Fuel_Type': {
            'Petrol': 0,
            'Diesel': 1,
            'CNG': 2,
        },
    },
    inplace=True,
)

## One-Hot Encoding

In [42]:
# Replace Seller_Type and Transmission with indicator columns; `drop_first`
# drops the first level of each, so one indicator per two-level feature remains.
df = pd.get_dummies(
    data=df,
    columns=['Seller_Type', 'Transmission'],
    drop_first=True,
)

## Prepare Features and Train/Test Split

In [43]:
# Target: the selling price. Features: every other column except the car name.
y = df['Selling_Price']
X = df.drop(columns=['Car_Name', 'Selling_Price'])

In [44]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Linear Models

In [45]:
# Baseline: ordinary least-squares regression, evaluated with R^2 on the
# held-out test split.
# The score gets its own name: assigning it to `linear_model` would overwrite
# the imported `sklearn.linear_model` module and break every later
# `linear_model.<Estimator>` call unless the module is imported again.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
linear_regression_score = model.score(X_test, y_test)
print(linear_regression_score)

0.8773175030338084


In [46]:
# Ridge regression (alpha=0.5), evaluated with R^2 on the held-out test split.
# The import makes sure `linear_model` refers to the scikit-learn module here,
# even if the name was rebound to something else earlier in the session.
from sklearn import linear_model

model2 = linear_model.Ridge(alpha=0.5).fit(X_train, y_train)
ridge_model = model2.score(X_test, y_test)
print(ridge_model)

0.8774043199387811


In [47]:
# Lasso regression (alpha=0.1), evaluated with R^2 on the held-out test split.
# `fit` returns the estimator itself, so `model3` is the fitted model.
model3 = linear_model.Lasso(alpha=0.1).fit(X_train, y_train)
lasso_model = model3.score(X_test, y_test)
print(lasso_model)

0.8684366992376725


In [48]:
# LARS-based Lasso (alpha=0.1) without feature normalisation, evaluated with
# R^2 on the held-out test split.
# `normalize` was deprecated in scikit-learn 1.2 and removed in 1.4, where
# passing it raises TypeError and "no normalisation" is the only behaviour.
# Passing it only where it is still accepted keeps the same semantics on every
# version and lets the cell run on current releases.
try:
    model4 = linear_model.LassoLars(alpha=0.1, normalize=False)
except TypeError:
    model4 = linear_model.LassoLars(alpha=0.1)
model4.fit(X_train, y_train)

lars_lasso_model = model4.score(X_test, y_test)
print(lars_lasso_model)

0.8684367805462015


In [49]:
# Bayesian ridge regression with default hyper-parameters, evaluated with R^2
# on the held-out test split. `fit` returns the fitted estimator itself.
model5 = linear_model.BayesianRidge().fit(X_train, y_train)
bayesian_model = model5.score(X_test, y_test)
print(bayesian_model)

0.8774031022963173
