In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Data reading / loading

In [3]:
df = pd.read_csv("car_Prediction_data.csv")


In [4]:
df

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.60,6.87,42450,Diesel,Dealer,Manual,0
...,...,...,...,...,...,...,...,...,...
296,city,2016,9.50,11.60,33988,Diesel,Dealer,Manual,0
297,brio,2015,4.00,5.90,60000,Petrol,Dealer,Manual,0
298,city,2009,3.35,11.00,87934,Petrol,Dealer,Manual,0
299,city,2017,11.50,12.50,9000,Diesel,Dealer,Manual,0


# Data Exploration

In [5]:
print(f"Number of Rows: {df.shape[0]} \nNumber of Columns: {df.shape[1]}")

Number of Rows: 301 
Number of Columns: 9


In [6]:
df.count()

Car_Name         301
Year             301
Selling_Price    301
Present_Price    301
Kms_Driven       301
Fuel_Type        301
Seller_Type      301
Transmission     301
Owner            301
dtype: int64

In [7]:
df.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [8]:
df.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.644115,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [10]:
df.head(2)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0


In [11]:
df.tail(2)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
299,city,2017,11.5,12.5,9000,Diesel,Dealer,Manual,0
300,brio,2016,5.3,5.9,5464,Petrol,Dealer,Manual,0


In [12]:
for i in df.columns:
    print(i)

Car_Name
Year
Selling_Price
Present_Price
Kms_Driven
Fuel_Type
Seller_Type
Transmission
Owner


In [13]:
df['Car_Name'].unique()

array(['ritz', 'sx4', 'ciaz', 'wagon r', 'swift', 'vitara brezza',
       's cross', 'alto 800', 'ertiga', 'dzire', 'alto k10', 'ignis',
       '800', 'baleno', 'omni', 'fortuner', 'innova', 'corolla altis',
       'etios cross', 'etios g', 'etios liva', 'corolla', 'etios gd',
       'camry', 'land cruiser', 'Royal Enfield Thunder 500',
       'UM Renegade Mojave', 'KTM RC200', 'Bajaj Dominar 400',
       'Royal Enfield Classic 350', 'KTM RC390', 'Hyosung GT250R',
       'Royal Enfield Thunder 350', 'KTM 390 Duke ',
       'Mahindra Mojo XT300', 'Bajaj Pulsar RS200',
       'Royal Enfield Bullet 350', 'Royal Enfield Classic 500',
       'Bajaj Avenger 220', 'Bajaj Avenger 150', 'Honda CB Hornet 160R',
       'Yamaha FZ S V 2.0', 'Yamaha FZ 16', 'TVS Apache RTR 160',
       'Bajaj Pulsar 150', 'Honda CBR 150', 'Hero Extreme',
       'Bajaj Avenger 220 dtsi', 'Bajaj Avenger 150 street',
       'Yamaha FZ  v 2.0', 'Bajaj Pulsar  NS 200', 'Bajaj Pulsar 220 F',
       'TVS Apache RTR 180', 

In [14]:
df.nunique()

Car_Name          98
Year              16
Selling_Price    156
Present_Price    147
Kms_Driven       206
Fuel_Type          3
Seller_Type        2
Transmission       2
Owner              3
dtype: int64

# Data Pre-Processing

In [15]:
print("-- Number of Null Values in Data --")
print(df.isnull().sum())

-- Number of Null Values in Data --
Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64


In [19]:
# print("-- Number of Null Values in Data --")
print(df['Selling_Price'].isnull().sum())

0


In [20]:
def fillNaObjMode(col):
    for i in col:
        df[i] = df[i].fillna(df[i].mode()[0])

columns = ['Car_Name', 'Fuel_Type', 'Transmission', 'Owner']
fillNaObjMode(columns)

In [24]:
df.drop('Car_Name', axis=1, inplace=True)
df.drop('Selling_Price', axis=1, inplace=True)

In [25]:
df.head(2)

Unnamed: 0,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,5.59,27000,Petrol,Dealer,Manual,0
1,2013,9.54,43000,Diesel,Dealer,Manual,0


In [26]:
print(df.isnull().sum())

Year             0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64


In [52]:
# df['price'] = df['Selling_Price'].astype('int64')
def changetoint64(col):
    for i in col:
        df[i] = df[i].astype('int64')
        
columns =[
    'Car_Price', 'Selling_Price', 'Kms_Driven', 'No_of_owners', 'Age','Seller_Type', 'Fuel_Type', 'Transmission']


In [28]:
print("-- Insights of Data --")
df.info()

-- Insights of Data --
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Year           301 non-null    int64  
 1   Present_Price  301 non-null    float64
 2   Kms_Driven     301 non-null    int64  
 3   Fuel_Type      301 non-null    object 
 4   Seller_Type    301 non-null    object 
 5   Transmission   301 non-null    object 
 6   Owner          301 non-null    int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 16.6+ KB


In [30]:
df['Owner'].unique()

array([0, 1, 3])

In [31]:
df['Transmission'].unique()

array(['Manual', 'Automatic'], dtype=object)

In [32]:
df_encoded = df.copy()

In [35]:
def encodeCols(cols):
    for i in cols:
        data = pd.DataFrame({i:df[i].unique()})
        data_label_encoder = LabelEncoder()
        data_label_encoder.fit(np.ravel(data))
        df_encoded[i] = data_label_encoder.transform(df[i]) 

columns = ['Transmission','Owner']
encodeCols(columns)

In [36]:
# All the Attributes are Numerical 
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Year           301 non-null    int64  
 1   Present_Price  301 non-null    float64
 2   Kms_Driven     301 non-null    int64  
 3   Fuel_Type      301 non-null    object 
 4   Seller_Type    301 non-null    object 
 5   Transmission   301 non-null    int64  
 6   Owner          301 non-null    int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 16.6+ KB


In [37]:
df_encoded.to_csv(r'encoded-data.csv', index = False, header = True)

# Train-Test Splitting

In [38]:
traindata, testdata = train_test_split(df_encoded, test_size=0.2, shuffle=False)

In [39]:
traindata.head(2)

Unnamed: 0,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,5.59,27000,Petrol,Dealer,1,0
1,2013,9.54,43000,Diesel,Dealer,1,0


In [40]:
testdata.head(2)

Unnamed: 0,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
240,2012,9.4,32322,Diesel,Dealer,1,0
241,2015,7.13,35866,Petrol,Dealer,1,1


In [41]:
train_x = traindata.iloc[:, 1:]
train_x.head()

Unnamed: 0,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,5.59,27000,Petrol,Dealer,1,0
1,9.54,43000,Diesel,Dealer,1,0
2,9.85,6900,Petrol,Dealer,1,0
3,4.15,5200,Petrol,Dealer,1,0
4,6.87,42450,Diesel,Dealer,1,0


In [42]:
train_y = traindata.iloc[:, 0]
train_y.head()

0    2014
1    2013
2    2017
3    2011
4    2014
Name: Year, dtype: int64

In [43]:
test_x = testdata.iloc[:, 1:]
test_x.head()

Unnamed: 0,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
240,9.4,32322,Diesel,Dealer,1,0
241,7.13,35866,Petrol,Dealer,1,1
242,7.13,34000,Petrol,Dealer,1,0
243,7.6,7000,Petrol,Dealer,1,0
244,9.4,49000,Diesel,Dealer,1,0


In [44]:
test_y = testdata.iloc[:, 0]
test_y.head()

240    2012
241    2015
242    2014
243    2016
244    2013
Name: Year, dtype: int64

In [59]:
text_x = train_x.values

# Training Model Using Support Vector Classifier

In [61]:
print("-- Training using SVC on Training Data --")
print("-- Parameters & Values: ", end='')

model_svc = SVC(gamma='selling_price', random_state=0)

print(model_svc)

-- Training using SVC on Training Data --
-- Parameters & Values: SVC(gamma='selling_price', random_state=0)


In [None]:
# Saving Trained Model
pickle.dump(model_svc, open('model_svc.pkl', 'wb'))

In [None]:
model_prediction = model_svc.predict(test_x)

testdata_predict = testdata.copy(deep=True)
pd.options.mode.chained_assignment = None

testdata_predict['selling_price'] = model_prediction

In [77]:
 #Printing Testing Data
print("-- Testing Data with Prediction --")
#pd.set_option("display.max_rows", None, "display.max_columns", None)
#testdata_predict.head()

-- Testing Data with Prediction --


In [80]:
model_accuracy_score = accuracy_score(testdata_predict['selling_price'], test_x), testdata_predict['Prediction'])

print("-- Model Accuracy Score: ", end='')
print(round(model_accuracy_score,3))