In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
df=pd.read_csv("vehicles_dataset.csv")
df.head(1)

Unnamed: 0,name,description,make,model,type,year,price,engine,cylinders,fuel,mileage,transmission,trim,body,doors,exterior_color,interior_color,drivetrain
0,2024 Jeep Wagoneer Series II,"\n \n Heated Leather Seats, Nav Sy...",Jeep,Wagoneer,New,2024,74600.0,24V GDI DOHC Twin Turbo,6.0,Gasoline,10.0,8-Speed Automatic,Series II,SUV,4.0,White,Global Black,Four-wheel Drive


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            1002 non-null   object 
 1   description     946 non-null    object 
 2   make            1002 non-null   object 
 3   model           1002 non-null   object 
 4   type            1002 non-null   object 
 5   year            1002 non-null   int64  
 6   price           979 non-null    float64
 7   engine          1000 non-null   object 
 8   cylinders       897 non-null    float64
 9   fuel            995 non-null    object 
 10  mileage         968 non-null    float64
 11  transmission    1000 non-null   object 
 12  trim            1001 non-null   object 
 13  body            999 non-null    object 
 14  doors           995 non-null    float64
 15  exterior_color  997 non-null    object 
 16  interior_color  964 non-null    object 
 17  drivetrain      1002 non-null   o

In [5]:
df.isnull().sum()

name                0
description        56
make                0
model               0
type                0
year                0
price              23
engine              2
cylinders         105
fuel                7
mileage            34
transmission        2
trim                1
body                3
doors               7
exterior_color      5
interior_color     38
drivetrain          0
dtype: int64

In [6]:
missing_columns = df.columns[df.isnull().sum() > 0]

for col in missing_columns:
    if df[col].dtype == 'object': 
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:  
        df[col].fillna(df[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            1002 non-null   object 
 1   description     1002 non-null   object 
 2   make            1002 non-null   object 
 3   model           1002 non-null   object 
 4   type            1002 non-null   object 
 5   year            1002 non-null   int64  
 6   price           1002 non-null   float64
 7   engine          1002 non-null   object 
 8   cylinders       1002 non-null   float64
 9   fuel            1002 non-null   object 
 10  mileage         1002 non-null   float64
 11  transmission    1002 non-null   object 
 12  trim            1002 non-null   object 
 13  body            1002 non-null   object 
 14  doors           1002 non-null   float64
 15  exterior_color  1002 non-null   object 
 16  interior_color  1002 non-null   object 
 17  drivetrain      1002 non-null   o

In [8]:
df.isnull().sum()

name              0
description       0
make              0
model             0
type              0
year              0
price             0
engine            0
cylinders         0
fuel              0
mileage           0
transmission      0
trim              0
body              0
doors             0
exterior_color    0
interior_color    0
drivetrain        0
dtype: int64

In [9]:
print(df.dtypes)

name               object
description        object
make               object
model              object
type               object
year                int64
price             float64
engine             object
cylinders         float64
fuel               object
mileage           float64
transmission       object
trim               object
body               object
doors             float64
exterior_color     object
interior_color     object
drivetrain         object
dtype: object


In [10]:
#Muammoni aniqlash
problem_columns = df.select_dtypes(include=['object']).columns
print("Object columns to investigate:", problem_columns)

Object columns to investigate: Index(['name', 'description', 'make', 'model', 'type', 'engine', 'fuel',
       'transmission', 'trim', 'body', 'exterior_color', 'interior_color',
       'drivetrain'],
      dtype='object')


In [11]:
columns_to_clean=["engine","transmission"]

def clean_numeric_column(column):
    if column.dtype != 'object':
        column = column.astype(str)
    return (
        column.str.replace(r'[^\d.]', '', regex=True)  # Belgilarni olib tashlash
              .replace('', float('nan'))  # Bo'sh qiymatlarni NaN ga almashtirish
              .astype(float)  
    )


In [12]:
print(df.isnull().sum())

name              0
description       0
make              0
model             0
type              0
year              0
price             0
engine            0
cylinders         0
fuel              0
mileage           0
transmission      0
trim              0
body              0
doors             0
exterior_color    0
interior_color    0
drivetrain        0
dtype: int64


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            1002 non-null   object 
 1   description     1002 non-null   object 
 2   make            1002 non-null   object 
 3   model           1002 non-null   object 
 4   type            1002 non-null   object 
 5   year            1002 non-null   int64  
 6   price           1002 non-null   float64
 7   engine          1002 non-null   object 
 8   cylinders       1002 non-null   float64
 9   fuel            1002 non-null   object 
 10  mileage         1002 non-null   float64
 11  transmission    1002 non-null   object 
 12  trim            1002 non-null   object 
 13  body            1002 non-null   object 
 14  doors           1002 non-null   float64
 15  exterior_color  1002 non-null   object 
 16  interior_color  1002 non-null   object 
 17  drivetrain      1002 non-null   o

In [14]:
df['name'] = pd.to_datetime(df['name'], errors='coerce')


print(df['name'].dtypes)

datetime64[ns]


  df['name'] = pd.to_datetime(df['name'], errors='coerce')


In [15]:
df.drop("year", axis=1, inplace=True)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   name            0 non-null      datetime64[ns]
 1   description     1002 non-null   object        
 2   make            1002 non-null   object        
 3   model           1002 non-null   object        
 4   type            1002 non-null   object        
 5   price           1002 non-null   float64       
 6   engine          1002 non-null   object        
 7   cylinders       1002 non-null   float64       
 8   fuel            1002 non-null   object        
 9   mileage         1002 non-null   float64       
 10  transmission    1002 non-null   object        
 11  trim            1002 non-null   object        
 12  body            1002 non-null   object        
 13  doors           1002 non-null   float64       
 14  exterior_color  1002 non-null   object        
 15  inte

In [17]:
df['Car Year'] = df['name'].dt.year
df['Car Name'] = df['name'].dt.name

In [18]:
df.drop("name", axis=1, inplace=True)

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   description     1002 non-null   object 
 1   make            1002 non-null   object 
 2   model           1002 non-null   object 
 3   type            1002 non-null   object 
 4   price           1002 non-null   float64
 5   engine          1002 non-null   object 
 6   cylinders       1002 non-null   float64
 7   fuel            1002 non-null   object 
 8   mileage         1002 non-null   float64
 9   transmission    1002 non-null   object 
 10  trim            1002 non-null   object 
 11  body            1002 non-null   object 
 12  doors           1002 non-null   float64
 13  exterior_color  1002 non-null   object 
 14  interior_color  1002 non-null   object 
 15  drivetrain      1002 non-null   object 
 16  Car Year        0 non-null      float64
 17  Car Name        1002 non-null   o

In [20]:
df.head(1)

Unnamed: 0,description,make,model,type,price,engine,cylinders,fuel,mileage,transmission,trim,body,doors,exterior_color,interior_color,drivetrain,Car Year,Car Name
0,"\n \n Heated Leather Seats, Nav Sy...",Jeep,Wagoneer,New,74600.0,24V GDI DOHC Twin Turbo,6.0,Gasoline,10.0,8-Speed Automatic,Series II,SUV,4.0,White,Global Black,Four-wheel Drive,,name


In [21]:
categorical_col=df.select_dtypes(include=['object','category']).columns

In [22]:
categorical_col

Index(['description', 'make', 'model', 'type', 'engine', 'fuel',
       'transmission', 'trim', 'body', 'exterior_color', 'interior_color',
       'drivetrain', 'Car Name'],
      dtype='object')

In [23]:
cardinality=df[categorical_col].nunique()

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   description     1002 non-null   object 
 1   make            1002 non-null   object 
 2   model           1002 non-null   object 
 3   type            1002 non-null   object 
 4   price           1002 non-null   float64
 5   engine          1002 non-null   object 
 6   cylinders       1002 non-null   float64
 7   fuel            1002 non-null   object 
 8   mileage         1002 non-null   float64
 9   transmission    1002 non-null   object 
 10  trim            1002 non-null   object 
 11  body            1002 non-null   object 
 12  doors           1002 non-null   float64
 13  exterior_color  1002 non-null   object 
 14  interior_color  1002 non-null   object 
 15  drivetrain      1002 non-null   object 
 16  Car Year        0 non-null      float64
 17  Car Name        1002 non-null   o

In [25]:
cardinality

description       761
make               28
model             153
type                1
engine            100
fuel                7
transmission       38
trim              197
body                8
exterior_color    263
interior_color     91
drivetrain          4
Car Name            1
dtype: int64

In [26]:
encoder=LabelEncoder()

In [27]:
for col in categorical_col:
    df[col]=encoder.fit_transform(df[col])

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   description     1002 non-null   int64  
 1   make            1002 non-null   int64  
 2   model           1002 non-null   int64  
 3   type            1002 non-null   int64  
 4   price           1002 non-null   float64
 5   engine          1002 non-null   int64  
 6   cylinders       1002 non-null   float64
 7   fuel            1002 non-null   int64  
 8   mileage         1002 non-null   float64
 9   transmission    1002 non-null   int64  
 10  trim            1002 non-null   int64  
 11  body            1002 non-null   int64  
 12  doors           1002 non-null   float64
 13  exterior_color  1002 non-null   int64  
 14  interior_color  1002 non-null   int64  
 15  drivetrain      1002 non-null   int64  
 16  Car Year        0 non-null      float64
 17  Car Name        1002 non-null   i

In [29]:
X = df.drop(columns=['price'])  
y = df['price'] 

In [30]:
X_train,X_temp,y_train,y_temp=train_test_split(X,y,test_size=0.3,random_state=42)

X_val,X_test,y_val,y_test=train_test_split(X_temp,y_temp, test_size=0.5,random_state=42)

In [31]:
len(X_train)

701

In [32]:
len(y_train)

701

In [33]:
model=DecisionTreeRegressor(random_state=42)
model.fit(X_train,y_train)

In [34]:
y_pred=model.predict(X_test)

In [35]:
mse=mean_squared_error(y_test,y_pred)
r2=r2_score(y_test,y_pred)

In [36]:
print(f"mse:{mse}")
print(f"r2:{r2}")

mse:182908396.38177714
r2:0.36109125612613424
