In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score

In [2]:
dataset = pd.read_csv("car_price_prediction.csv")

In [3]:
dataset.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


In [4]:
dataset.shape

(19237, 18)

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                19237 non-null  int64  
 1   Price             19237 non-null  int64  
 2   Levy              19237 non-null  object 
 3   Manufacturer      19237 non-null  object 
 4   Model             19237 non-null  object 
 5   Prod. year        19237 non-null  int64  
 6   Category          19237 non-null  object 
 7   Leather interior  19237 non-null  object 
 8   Fuel type         19237 non-null  object 
 9   Engine volume     19237 non-null  object 
 10  Mileage           19237 non-null  object 
 11  Cylinders         19237 non-null  float64
 12  Gear box type     19237 non-null  object 
 13  Drive wheels      19237 non-null  object 
 14  Doors             19237 non-null  object 
 15  Wheel             19237 non-null  object 
 16  Color             19237 non-null  object

In [6]:
dataset.nunique()

ID                  18924
Price                2315
Levy                  559
Manufacturer           65
Model                1590
Prod. year             54
Category               11
Leather interior        2
Fuel type               7
Engine volume         107
Mileage              7687
Cylinders              13
Gear box type           4
Drive wheels            3
Doors                   3
Wheel                   2
Color                  16
Airbags                17
dtype: int64

In [7]:
object_columns = dataset.select_dtypes(include='object').columns
unique_values_per_column = {col: dataset[col].unique() for col in object_columns}
unique_values_per_column

{'Levy': array(['1399', '1018', '-', '862', '446', '891', '761', '751', '394',
        '1053', '1055', '1079', '810', '2386', '1850', '531', '586',
        '1249', '2455', '583', '1537', '1288', '915', '1750', '707',
        '1077', '1486', '1091', '650', '382', '1436', '1194', '503',
        '1017', '1104', '639', '629', '919', '781', '530', '640', '765',
        '777', '779', '934', '769', '645', '1185', '1324', '830', '1187',
        '1111', '760', '642', '1604', '1095', '966', '473', '1138', '1811',
        '988', '917', '1156', '687', '11714', '836', '1347', '2866',
        '1646', '259', '609', '697', '585', '475', '690', '308', '1823',
        '1361', '1273', '924', '584', '2078', '831', '1172', '893', '1872',
        '1885', '1266', '447', '2148', '1730', '730', '289', '502', '333',
        '1325', '247', '879', '1342', '1327', '1598', '1514', '1058',
        '738', '1935', '481', '1522', '1282', '456', '880', '900', '798',
        '1277', '442', '1051', '790', '1292', '1047', 

In [8]:
dataset['Levy'].value_counts()

Levy
-       5819
765      486
891      461
639      410
640      405
        ... 
3156       1
2908       1
1279       1
1719       1
1901       1
Name: count, Length: 559, dtype: int64

In [9]:
dataset['Manufacturer'].value_counts()

Manufacturer
HYUNDAI          3769
TOYOTA           3662
MERCEDES-BENZ    2076
FORD             1111
CHEVROLET        1069
                 ... 
TESLA               1
PONTIAC             1
SATURN              1
ASTON MARTIN        1
GREATWALL           1
Name: count, Length: 65, dtype: int64

In [10]:
dataset['Model'].value_counts()

Model
Prius                    1083
Sonata                   1079
Camry                     938
Elantra                   922
E 350                     542
                         ... 
Feroza                      1
C-MAX C-MAX                 1
X1 4X4                      1
Land Cruiser Prado RX       1
Prius C aqua                1
Name: count, Length: 1590, dtype: int64

In [11]:
dataset['Category'].value_counts()

Category
Sedan          8736
Jeep           5473
Hatchback      2847
Minivan         647
Coupe           532
Universal       364
Microbus        306
Goods wagon     233
Pickup           52
Cabriolet        36
Limousine        11
Name: count, dtype: int64

In [12]:
dataset['Leather interior'].value_counts()

Leather interior
Yes    13954
No      5283
Name: count, dtype: int64

In [13]:
dataset['Fuel type'].value_counts()

Fuel type
Petrol            10150
Diesel             4036
Hybrid             3578
LPG                 892
CNG                 494
Plug-in Hybrid       86
Hydrogen              1
Name: count, dtype: int64

In [14]:
dataset['Engine volume'].value_counts()

Engine volume
2            3916
2.5          2277
1.8          1760
1.6          1462
1.5          1321
             ... 
6.8             1
6.7             1
3.1             1
0.8 Turbo       1
1.1 Turbo       1
Name: count, Length: 107, dtype: int64

In [15]:
dataset['Mileage'].value_counts()

Mileage
0 km         721
200000 km    183
150000 km    161
160000 km    120
100000 km    119
            ... 
63083 km       1
28750 km       1
25077 km       1
77452 km       1
186923 km      1
Name: count, Length: 7687, dtype: int64

In [16]:
dataset['Gear box type'].value_counts()

Gear box type
Automatic    13514
Tiptronic     3102
Manual        1875
Variator       746
Name: count, dtype: int64

In [17]:
dataset['Drive wheels'].value_counts()

Drive wheels
Front    12874
4x4       4058
Rear      2305
Name: count, dtype: int64

In [18]:
dataset['Wheel'].value_counts()

Wheel
Left wheel          17753
Right-hand drive     1484
Name: count, dtype: int64

In [19]:
dataset['Color'].value_counts()

Color
Black            5033
White            4489
Silver           3792
Grey             2375
Blue             1396
Red               639
Green             322
Orange            253
Brown             187
Carnelian red     179
Golden            145
Beige             134
Sky blue          122
Yellow            106
Purple             39
Pink               26
Name: count, dtype: int64

In [20]:
dataset['Levy'] = dataset['Levy'].replace('-', 0)
dataset['Levy'] = pd.to_numeric(dataset['Levy'], errors='coerce')

In [21]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                19237 non-null  int64  
 1   Price             19237 non-null  int64  
 2   Levy              19237 non-null  int64  
 3   Manufacturer      19237 non-null  object 
 4   Model             19237 non-null  object 
 5   Prod. year        19237 non-null  int64  
 6   Category          19237 non-null  object 
 7   Leather interior  19237 non-null  object 
 8   Fuel type         19237 non-null  object 
 9   Engine volume     19237 non-null  object 
 10  Mileage           19237 non-null  object 
 11  Cylinders         19237 non-null  float64
 12  Gear box type     19237 non-null  object 
 13  Drive wheels      19237 non-null  object 
 14  Doors             19237 non-null  object 
 15  Wheel             19237 non-null  object 
 16  Color             19237 non-null  object

In [22]:
dataset['Mileage'] =dataset['Mileage'].str.replace('km',"").astype("Int64")

In [23]:
import datetime
datetime=datetime.datetime.now()
datetime

datetime.datetime(2024, 9, 29, 4, 17, 1, 26790)

In [24]:
dataset['Age']=datetime.year-dataset['Prod. year']

In [25]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                19237 non-null  int64  
 1   Price             19237 non-null  int64  
 2   Levy              19237 non-null  int64  
 3   Manufacturer      19237 non-null  object 
 4   Model             19237 non-null  object 
 5   Prod. year        19237 non-null  int64  
 6   Category          19237 non-null  object 
 7   Leather interior  19237 non-null  object 
 8   Fuel type         19237 non-null  object 
 9   Engine volume     19237 non-null  object 
 10  Mileage           19237 non-null  Int64  
 11  Cylinders         19237 non-null  float64
 12  Gear box type     19237 non-null  object 
 13  Drive wheels      19237 non-null  object 
 14  Doors             19237 non-null  object 
 15  Wheel             19237 non-null  object 
 16  Color             19237 non-null  object

In [26]:
dataset

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags,Age
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005,6.0,Automatic,4x4,04-May,Left wheel,Silver,12,14
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3,192000,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8,13
2,45774419,8467,0,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000,4.0,Variator,Front,04-May,Right-hand drive,Black,2,18
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966,4.0,Automatic,4x4,04-May,Left wheel,White,0,13
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901,4.0,Automatic,Front,04-May,Left wheel,Silver,4,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,45798355,8467,0,MERCEDES-BENZ,CLK 200,1999,Coupe,Yes,CNG,2.0 Turbo,300000,4.0,Manual,Rear,02-Mar,Left wheel,Silver,5,25
19233,45778856,15681,831,HYUNDAI,Sonata,2011,Sedan,Yes,Petrol,2.4,161600,4.0,Tiptronic,Front,04-May,Left wheel,Red,8,13
19234,45804997,26108,836,HYUNDAI,Tucson,2010,Jeep,Yes,Diesel,2,116365,4.0,Automatic,Front,04-May,Left wheel,Grey,4,14
19235,45793526,5331,1288,CHEVROLET,Captiva,2007,Jeep,Yes,Diesel,2,51258,4.0,Automatic,Front,04-May,Left wheel,Black,4,17


In [27]:
dataset['Turbo'] = dataset['Engine volume'].apply(lambda x: 'yes' if 'turbo' in x.lower() else 'no')

In [28]:
dataset['Engine volume'] = dataset['Engine volume'].str.replace(' Turbo', '').astype(float)

In [29]:
dataset

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags,Age,Turbo
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005,6.0,Automatic,4x4,04-May,Left wheel,Silver,12,14,no
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8,13,no
2,45774419,8467,0,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000,4.0,Variator,Front,04-May,Right-hand drive,Black,2,18,no
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966,4.0,Automatic,4x4,04-May,Left wheel,White,0,13,no
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901,4.0,Automatic,Front,04-May,Left wheel,Silver,4,10,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,45798355,8467,0,MERCEDES-BENZ,CLK 200,1999,Coupe,Yes,CNG,2.0,300000,4.0,Manual,Rear,02-Mar,Left wheel,Silver,5,25,yes
19233,45778856,15681,831,HYUNDAI,Sonata,2011,Sedan,Yes,Petrol,2.4,161600,4.0,Tiptronic,Front,04-May,Left wheel,Red,8,13,no
19234,45804997,26108,836,HYUNDAI,Tucson,2010,Jeep,Yes,Diesel,2.0,116365,4.0,Automatic,Front,04-May,Left wheel,Grey,4,14,no
19235,45793526,5331,1288,CHEVROLET,Captiva,2007,Jeep,Yes,Diesel,2.0,51258,4.0,Automatic,Front,04-May,Left wheel,Black,4,17,no


In [30]:
dataset = dataset.drop(['ID', 'Prod. year'], axis=1)

In [31]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             19237 non-null  int64  
 1   Levy              19237 non-null  int64  
 2   Manufacturer      19237 non-null  object 
 3   Model             19237 non-null  object 
 4   Category          19237 non-null  object 
 5   Leather interior  19237 non-null  object 
 6   Fuel type         19237 non-null  object 
 7   Engine volume     19237 non-null  float64
 8   Mileage           19237 non-null  Int64  
 9   Cylinders         19237 non-null  float64
 10  Gear box type     19237 non-null  object 
 11  Drive wheels      19237 non-null  object 
 12  Doors             19237 non-null  object 
 13  Wheel             19237 non-null  object 
 14  Color             19237 non-null  object 
 15  Airbags           19237 non-null  int64  
 16  Age               19237 non-null  int64 

In [32]:
dataset

Unnamed: 0,Price,Levy,Manufacturer,Model,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags,Age,Turbo
0,13328,1399,LEXUS,RX 450,Jeep,Yes,Hybrid,3.5,186005,6.0,Automatic,4x4,04-May,Left wheel,Silver,12,14,no
1,16621,1018,CHEVROLET,Equinox,Jeep,No,Petrol,3.0,192000,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8,13,no
2,8467,0,HONDA,FIT,Hatchback,No,Petrol,1.3,200000,4.0,Variator,Front,04-May,Right-hand drive,Black,2,18,no
3,3607,862,FORD,Escape,Jeep,Yes,Hybrid,2.5,168966,4.0,Automatic,4x4,04-May,Left wheel,White,0,13,no
4,11726,446,HONDA,FIT,Hatchback,Yes,Petrol,1.3,91901,4.0,Automatic,Front,04-May,Left wheel,Silver,4,10,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,8467,0,MERCEDES-BENZ,CLK 200,Coupe,Yes,CNG,2.0,300000,4.0,Manual,Rear,02-Mar,Left wheel,Silver,5,25,yes
19233,15681,831,HYUNDAI,Sonata,Sedan,Yes,Petrol,2.4,161600,4.0,Tiptronic,Front,04-May,Left wheel,Red,8,13,no
19234,26108,836,HYUNDAI,Tucson,Jeep,Yes,Diesel,2.0,116365,4.0,Automatic,Front,04-May,Left wheel,Grey,4,14,no
19235,5331,1288,CHEVROLET,Captiva,Jeep,Yes,Diesel,2.0,51258,4.0,Automatic,Front,04-May,Left wheel,Black,4,17,no


In [33]:
dataset.duplicated().sum()

3512

In [34]:
dataset = dataset.drop_duplicates()

In [35]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15725 entries, 0 to 19236
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             15725 non-null  int64  
 1   Levy              15725 non-null  int64  
 2   Manufacturer      15725 non-null  object 
 3   Model             15725 non-null  object 
 4   Category          15725 non-null  object 
 5   Leather interior  15725 non-null  object 
 6   Fuel type         15725 non-null  object 
 7   Engine volume     15725 non-null  float64
 8   Mileage           15725 non-null  Int64  
 9   Cylinders         15725 non-null  float64
 10  Gear box type     15725 non-null  object 
 11  Drive wheels      15725 non-null  object 
 12  Doors             15725 non-null  object 
 13  Wheel             15725 non-null  object 
 14  Color             15725 non-null  object 
 15  Airbags           15725 non-null  int64  
 16  Age               15725 non-null  int64  
 17

In [36]:
non_object_columns = dataset.select_dtypes(exclude='object')
non_object_columns

Unnamed: 0,Price,Levy,Engine volume,Mileage,Cylinders,Airbags,Age
0,13328,1399,3.5,186005,6.0,12,14
1,16621,1018,3.0,192000,6.0,8,13
2,8467,0,1.3,200000,4.0,2,18
3,3607,862,2.5,168966,4.0,0,13
4,11726,446,1.3,91901,4.0,4,10
...,...,...,...,...,...,...,...
19230,470,645,1.8,307325,4.0,12,13
19232,8467,0,2.0,300000,4.0,5,25
19233,15681,831,2.4,161600,4.0,8,13
19234,26108,836,2.0,116365,4.0,4,14


In [37]:
non_object_columns.columns

Index(['Price', 'Levy', 'Engine volume', 'Mileage', 'Cylinders', 'Airbags',
       'Age'],
      dtype='object')

In [38]:
# IQR hesaplama ve aykırı değerleri temizleme
for column in non_object_columns.columns:
    # IQR hesaplama
    Q1 = non_object_columns[column].quantile(0.25)
    Q3 = non_object_columns[column].quantile(0.75)
    IQR = Q3 - Q1
    
    print(f"{column} sütunu için IQR: {IQR}")

    # Aykırı değerleri belirleme
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Aykırı değerleri temizleme
    dataset = dataset[(dataset[column] >= lower_bound) & (dataset[column] <= upper_bound)]

Price sütunu için IQR: 15837.0
Levy sütunu için IQR: 862.0
Engine volume sütunu için IQR: 0.8
Mileage sütunu için IQR: 113425.0
Cylinders sütunu için IQR: 0.0
Airbags sütunu için IQR: 8.0
Age sütunu için IQR: 6.0


In [39]:
dataset.head()

Unnamed: 0,Price,Levy,Manufacturer,Model,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags,Age,Turbo
2,8467,0,HONDA,FIT,Hatchback,No,Petrol,1.3,200000,4.0,Variator,Front,04-May,Right-hand drive,Black,2,18,no
3,3607,862,FORD,Escape,Jeep,Yes,Hybrid,2.5,168966,4.0,Automatic,4x4,04-May,Left wheel,White,0,13,no
4,11726,446,HONDA,FIT,Hatchback,Yes,Petrol,1.3,91901,4.0,Automatic,Front,04-May,Left wheel,Silver,4,10,no
5,39493,891,HYUNDAI,Santa FE,Jeep,Yes,Diesel,2.0,160931,4.0,Automatic,Front,04-May,Left wheel,White,4,8,no
6,1803,761,TOYOTA,Prius,Hatchback,Yes,Hybrid,1.8,258909,4.0,Automatic,Front,04-May,Left wheel,White,12,14,no


In [40]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10701 entries, 2 to 19236
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             10701 non-null  int64  
 1   Levy              10701 non-null  int64  
 2   Manufacturer      10701 non-null  object 
 3   Model             10701 non-null  object 
 4   Category          10701 non-null  object 
 5   Leather interior  10701 non-null  object 
 6   Fuel type         10701 non-null  object 
 7   Engine volume     10701 non-null  float64
 8   Mileage           10701 non-null  Int64  
 9   Cylinders         10701 non-null  float64
 10  Gear box type     10701 non-null  object 
 11  Drive wheels      10701 non-null  object 
 12  Doors             10701 non-null  object 
 13  Wheel             10701 non-null  object 
 14  Color             10701 non-null  object 
 15  Airbags           10701 non-null  int64  
 16  Age               10701 non-null  int64  
 17

In [41]:
object_columns = dataset.select_dtypes(include='object')
object_columns

Unnamed: 0,Manufacturer,Model,Category,Leather interior,Fuel type,Gear box type,Drive wheels,Doors,Wheel,Color,Turbo
2,HONDA,FIT,Hatchback,No,Petrol,Variator,Front,04-May,Right-hand drive,Black,no
3,FORD,Escape,Jeep,Yes,Hybrid,Automatic,4x4,04-May,Left wheel,White,no
4,HONDA,FIT,Hatchback,Yes,Petrol,Automatic,Front,04-May,Left wheel,Silver,no
5,HYUNDAI,Santa FE,Jeep,Yes,Diesel,Automatic,Front,04-May,Left wheel,White,no
6,TOYOTA,Prius,Hatchback,Yes,Hybrid,Automatic,Front,04-May,Left wheel,White,no
...,...,...,...,...,...,...,...,...,...,...,...
19230,TOYOTA,Prius,Hatchback,Yes,Hybrid,Automatic,Front,04-May,Left wheel,Silver,no
19232,MERCEDES-BENZ,CLK 200,Coupe,Yes,CNG,Manual,Rear,02-Mar,Left wheel,Silver,yes
19233,HYUNDAI,Sonata,Sedan,Yes,Petrol,Tiptronic,Front,04-May,Left wheel,Red,no
19234,HYUNDAI,Tucson,Jeep,Yes,Diesel,Automatic,Front,04-May,Left wheel,Grey,no


In [42]:
dataset['Wheel'].value_counts()    

Wheel
Left wheel          9606
Right-hand drive    1095
Name: count, dtype: int64

In [43]:
# Category - Leather interior - Fuel type - Gear box type - Drive wheels - Doors - Wheel - Turbo 

In [44]:
categorical_columns = [
    'Category', 'Leather interior', 'Fuel type', 'Gear box type',
    'Drive wheels', 'Doors', 'Wheel', 'Turbo'
]
label_columns = ['Manufacturer', 'Model', 'Color']

In [45]:
df_encoded = pd.get_dummies(dataset, columns=categorical_columns)
# Sadece One-Hot Encoding ile oluşturulan sütunları seçme
one_hot_columns = df_encoded.columns.difference(dataset.columns)

# Bu sütunlardaki boolean (True/False) değerleri sayısal (0/1) değerlere çevirme
df_encoded[one_hot_columns] = df_encoded[one_hot_columns].astype(int)

In [46]:
df_encoded

Unnamed: 0,Price,Levy,Manufacturer,Model,Engine volume,Mileage,Cylinders,Color,Airbags,Age,...,Drive wheels_4x4,Drive wheels_Front,Drive wheels_Rear,Doors_02-Mar,Doors_04-May,Doors_>5,Wheel_Left wheel,Wheel_Right-hand drive,Turbo_no,Turbo_yes
2,8467,0,HONDA,FIT,1.3,200000,4.0,Black,2,18,...,0,1,0,0,1,0,0,1,1,0
3,3607,862,FORD,Escape,2.5,168966,4.0,White,0,13,...,1,0,0,0,1,0,1,0,1,0
4,11726,446,HONDA,FIT,1.3,91901,4.0,Silver,4,10,...,0,1,0,0,1,0,1,0,1,0
5,39493,891,HYUNDAI,Santa FE,2.0,160931,4.0,White,4,8,...,0,1,0,0,1,0,1,0,1,0
6,1803,761,TOYOTA,Prius,1.8,258909,4.0,White,12,14,...,0,1,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19230,470,645,TOYOTA,Prius,1.8,307325,4.0,Silver,12,13,...,0,1,0,0,1,0,1,0,1,0
19232,8467,0,MERCEDES-BENZ,CLK 200,2.0,300000,4.0,Silver,5,25,...,0,0,1,1,0,0,1,0,0,1
19233,15681,831,HYUNDAI,Sonata,2.4,161600,4.0,Red,8,13,...,0,1,0,0,1,0,1,0,1,0
19234,26108,836,HYUNDAI,Tucson,2.0,116365,4.0,Grey,4,14,...,0,1,0,0,1,0,1,0,1,0


In [47]:
label_encoder = LabelEncoder()

# Her sütuna LabelEncoder uygulama
df_encoded['Manufacturer'] = label_encoder.fit_transform(df_encoded['Manufacturer'])
df_encoded['Model'] = label_encoder.fit_transform(df_encoded['Model'])
df_encoded['Color'] = label_encoder.fit_transform(df_encoded['Color'])

In [48]:
df_encoded

Unnamed: 0,Price,Levy,Manufacturer,Model,Engine volume,Mileage,Cylinders,Color,Airbags,Age,...,Drive wheels_4x4,Drive wheels_Front,Drive wheels_Rear,Doors_02-Mar,Doors_04-May,Doors_>5,Wheel_Left wheel,Wheel_Right-hand drive,Turbo_no,Turbo_yes
2,8467,0,17,389,1.3,200000,4.0,1,2,18,...,0,1,0,0,1,0,0,1,1,0
3,3607,862,13,376,2.5,168966,4.0,14,0,13,...,1,0,0,0,1,0,1,0,1,0
4,11726,446,17,389,1.3,91901,4.0,12,4,10,...,0,1,0,0,1,0,1,0,1,0
5,39493,891,18,753,2.0,160931,4.0,14,4,8,...,0,1,0,0,1,0,1,0,1,0
6,1803,761,45,681,1.8,258909,4.0,14,12,14,...,0,1,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19230,470,645,45,681,1.8,307325,4.0,12,12,13,...,0,1,0,0,1,0,1,0,1,0
19232,8467,0,28,195,2.0,300000,4.0,12,5,25,...,0,0,1,1,0,0,1,0,0,1
19233,15681,831,18,770,2.4,161600,4.0,11,8,13,...,0,1,0,0,1,0,1,0,1,0
19234,26108,836,18,851,2.0,116365,4.0,7,4,14,...,0,1,0,0,1,0,1,0,1,0


In [49]:
x = df_encoded.drop(['Price'], axis=1)
y = df_encoded['Price']

In [50]:
x

Unnamed: 0,Levy,Manufacturer,Model,Engine volume,Mileage,Cylinders,Color,Airbags,Age,Category_Cabriolet,...,Drive wheels_4x4,Drive wheels_Front,Drive wheels_Rear,Doors_02-Mar,Doors_04-May,Doors_>5,Wheel_Left wheel,Wheel_Right-hand drive,Turbo_no,Turbo_yes
2,0,17,389,1.3,200000,4.0,1,2,18,0,...,0,1,0,0,1,0,0,1,1,0
3,862,13,376,2.5,168966,4.0,14,0,13,0,...,1,0,0,0,1,0,1,0,1,0
4,446,17,389,1.3,91901,4.0,12,4,10,0,...,0,1,0,0,1,0,1,0,1,0
5,891,18,753,2.0,160931,4.0,14,4,8,0,...,0,1,0,0,1,0,1,0,1,0
6,761,45,681,1.8,258909,4.0,14,12,14,0,...,0,1,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19230,645,45,681,1.8,307325,4.0,12,12,13,0,...,0,1,0,0,1,0,1,0,1,0
19232,0,28,195,2.0,300000,4.0,12,5,25,0,...,0,0,1,1,0,0,1,0,0,1
19233,831,18,770,2.4,161600,4.0,11,8,13,0,...,0,1,0,0,1,0,1,0,1,0
19234,836,18,851,2.0,116365,4.0,7,4,14,0,...,0,1,0,0,1,0,1,0,1,0


In [51]:
y

2         8467
3         3607
4        11726
5        39493
6         1803
         ...  
19230      470
19232     8467
19233    15681
19234    26108
19236      470
Name: Price, Length: 10701, dtype: int64

In [52]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.25, random_state=42)

In [53]:
Model_LR=LinearRegression()
Model_RF=RandomForestRegressor(n_estimators=100, max_features= 7)
Model_XGB=XGBRegressor(ax_depth = 7, n_estimators=500, learning_rate =.05)
Model_GBoosting=GradientBoostingRegressor(learning_rate =.07, max_depth =7, verbose=0)
Model_DT=DecisionTreeRegressor()

In [54]:
Algo=['LinearRegression','RandomForestRegressor','XGBRegressor','GradientBoostingRegressor','DecisionTreeRegressor']
R2=[]
RMSE=[]

In [55]:
def predict(model):
    model.fit(x_train,y_train)
    model.predict(x_test)
    r2=r2_score(y_test,model.predict(x_test))
    rmse=np.sqrt(mean_squared_error(y_test,model.predict(x_test)))
    R2.append(r2)
    RMSE.append(rmse)
    score=model.score(x_test,y_test)
    print(f'score of model is : {score}')

In [56]:
predict(Model_LR)
predict(Model_RF)
predict(Model_XGB)
predict(Model_GBoosting)
predict(Model_DT)

score of model is : 0.42995307000923366
score of model is : 0.7498677856884202


Parameters: { "ax_depth" } are not used.



score of model is : 0.7398650646209717
score of model is : 0.748373392924195
score of model is : 0.5075841734603679


In [57]:
result = pd.DataFrame({'Algorithm':Algo,'R2_core':R2,'RMSE':RMSE})

In [58]:
result

Unnamed: 0,Algorithm,R2_core,RMSE
0,LinearRegression,0.429953,8548.514345
1,RandomForestRegressor,0.749868,5662.655656
2,XGBRegressor,0.739865,5774.769639
3,GradientBoostingRegressor,0.748373,5679.545983
4,DecisionTreeRegressor,0.507584,7945.136094


In [60]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd

# Algoritma isimleri ve sonuçları için listeler
Algo = ['LinearRegression', 'RandomForestRegressor', 'XGBRegressor', 'GradientBoostingRegressor', 'DecisionTreeRegressor']
R2 = []
RMSE = []
MAE = []
MSE = []

# Performans metriklerini hesaplayan fonksiyon
def predict(model):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    # Performans metriklerinin hesaplanması
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    
    # Sonuçların listelere eklenmesi
    R2.append(r2)
    RMSE.append(rmse)
    MSE.append(mse)
    MAE.append(mae)
    
    score = model.score(x_test, y_test)
    print(f'Score of model is: {score}')

# Modellerin sırasıyla test edilmesi
predict(Model_LR)
predict(Model_RF)
predict(Model_XGB)
predict(Model_GBoosting)
predict(Model_DT)

# Sonuçların bir DataFrame'e dönüştürülmesi
result = pd.DataFrame({'Algorithm': Algo, 'R2': R2, 'RMSE': RMSE, 'MSE': MSE, 'MAE': MAE})

print(result)


Score of model is: 0.42995307000923366
Score of model is: 0.745925996334063


Parameters: { "ax_depth" } are not used.



Score of model is: 0.7398650646209717
Score of model is: 0.7488493870807693
Score of model is: 0.5153659814303184
                   Algorithm        R2         RMSE           MSE          MAE
0           LinearRegression  0.429953  8548.514345  7.307710e+07  6603.812453
1      RandomForestRegressor  0.745926  5707.099639  3.257099e+07  3751.311396
2               XGBRegressor  0.739865  5774.769639  3.334796e+07  3797.450322
3  GradientBoostingRegressor  0.748849  5674.171531  3.219622e+07  3766.367368
4      DecisionTreeRegressor  0.515366  7882.106294  6.212760e+07  5035.784006


In [61]:
result

Unnamed: 0,Algorithm,R2,RMSE,MSE,MAE
0,LinearRegression,0.429953,8548.514345,73077100.0,6603.812453
1,RandomForestRegressor,0.745926,5707.099639,32570990.0,3751.311396
2,XGBRegressor,0.739865,5774.769639,33347960.0,3797.450322
3,GradientBoostingRegressor,0.748849,5674.171531,32196220.0,3766.367368
4,DecisionTreeRegressor,0.515366,7882.106294,62127600.0,5035.784006
