## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Importing the dataset over here

In [2]:
data=pd.read_csv("Armenian Market Car Prices.csv")

In [3]:
data.head()

Unnamed: 0,Car Name,Year,Region,FuelType,Mileage,Price
0,Ford Transit,2002,Goris,Diesel,215000,12000
1,Ford Focus,2017,Ijevan,Gasoline,72000,11500
2,Opel Vectra,1998,Armavir,Gasoline,466000,4000
3,Mazda 6,2020,Nor Nork,Gasoline,100000,22000
4,Opel Vectra,1998,Armavir,Gasoline,318000,3500


In [4]:
data.describe()

Unnamed: 0,Year,Mileage,Price
count,7995.0,7995.0,7995.0
mean,2010.342089,146401.523327,15555.002877
std,9.113848,108371.306576,17731.598604
min,1943.0,10.0,200.0
25%,2004.0,60000.0,6000.0
50%,2011.0,130000.0,11500.0
75%,2018.0,210000.0,19000.0
max,2024.0,1000000.0,565000.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7995 entries, 0 to 7994
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Car Name  7995 non-null   object
 1   Year      7995 non-null   int64 
 2   Region    7995 non-null   object
 3   FuelType  7995 non-null   object
 4   Mileage   7995 non-null   int64 
 5   Price     7995 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 374.9+ KB


## Checking the existence of missing values over here

In [6]:
data.isnull().sum()

Car Name    0
Year        0
Region      0
FuelType    0
Mileage     0
Price       0
dtype: int64

## Filtering all the numerical features over here

In [7]:
numerical_features=[feature for feature in data.columns if data[feature].dtype!="O"]
for feature in numerical_features:
  print(feature)

Year
Mileage
Price


In [8]:
data[numerical_features]

Unnamed: 0,Year,Mileage,Price
0,2002,215000,12000
1,2017,72000,11500
2,1998,466000,4000
3,2020,100000,22000
4,1998,318000,3500
...,...,...,...
7990,2013,150000,35000
7991,2018,94000,5600
7992,2001,151000,9800
7993,2009,300000,4500


## Filtering all the categorical features over here

In [9]:
cat_features=[feature for feature in data.columns if data[feature].dtype=="O"]
for feature in cat_features:
  print(feature)

Car Name
Region
FuelType


In [10]:
data[cat_features]

Unnamed: 0,Car Name,Region,FuelType
0,Ford Transit,Goris,Diesel
1,Ford Focus,Ijevan,Gasoline
2,Opel Vectra,Armavir,Gasoline
3,Mazda 6,Nor Nork,Gasoline
4,Opel Vectra,Armavir,Gasoline
...,...,...,...
7990,VAZ (Lada) 2121 (4x4),Armavir,Gasoline
7991,Toyota 4Runner,Nor Nork,Gasoline
7992,Opel Astra hatchback,Ajapnyak,Gasoline
7993,Toyota Camry,Shengavit,Gasoline


## Encoding the categorical features into numerical outcomes over here

In [11]:
for feature in cat_features:
  feature_mapping={category:index for index,category in enumerate(data[feature].unique())}
  data[feature]=data[feature].map(feature_mapping)

In [12]:
data

Unnamed: 0,Car Name,Year,Region,FuelType,Mileage,Price
0,0,2002,0,0,215000,12000
1,1,2017,1,1,72000,11500
2,2,1998,2,1,466000,4000
3,3,2020,3,1,100000,22000
4,2,1998,2,1,318000,3500
...,...,...,...,...,...,...
7990,10,2013,2,1,150000,35000
7991,146,2018,3,1,94000,5600
7992,71,2001,11,1,151000,9800
7993,28,2009,12,1,300000,4500


In [21]:
data['Fuel Type']=data['FuelType']
data.drop('FuelType',axis=1,inplace=True)

## Creating the features and labels over here

In [22]:
X=data.iloc[:,:-1].values
y=data.iloc[:,-1].values

## Splitting the dataset into training set and testing set to avoid the problem of overfitting

In [23]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

## Training the model on the training dataset over here

In [28]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=10, random_state=0),
    'Gradient Boosting': GradientBoostingClassifier(random_state=0),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=0),
    'Logistic Regression': LogisticRegression(max_iter=200)
}

best_classifier = None
best_accuracy = 0

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{name}: Accuracy = {accuracy:.4f}')

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_classifier = clf

print(f'\nBest Classifier: {best_classifier.__class__.__name__} with Accuracy = {best_accuracy:.4f}')

Random Forest: Accuracy = 0.9550
Gradient Boosting: Accuracy = 0.9619
Support Vector Machine: Accuracy = 0.9218
K-Nearest Neighbors: Accuracy = 0.9168
Decision Tree: Accuracy = 0.9293
Logistic Regression: Accuracy = 0.9287

Best Classifier: GradientBoostingClassifier with Accuracy = 0.9619


In [29]:
classifier=GradientBoostingClassifier(random_state=0)
classifier.fit(X_train,y_train)

## Evaluating the performance of the model on the training dataset over here

In [30]:
y_pred=classifier.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[1 1]
 [1 1]
 [1 1]
 ...
 [1 1]
 [1 1]
 [1 1]]


## Checking the performance metrics over here

In [31]:
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.9618511569731082
              precision    recall  f1-score   support

           0       0.69      0.50      0.58        48
           1       0.97      0.99      0.98      1474
           2       0.94      0.55      0.69        31
           3       0.86      0.78      0.82        46

    accuracy                           0.96      1599
   macro avg       0.86      0.71      0.77      1599
weighted avg       0.96      0.96      0.96      1599

