In [507]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [508]:
df = pd.read_csv('cars_dataset_500.csv')

In [509]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        500 non-null    int64  
 1   Make                      498 non-null    object 
 2   Model                     499 non-null    object 
 3   Year                      498 non-null    float64
 4   Fuel_Type                 499 non-null    object 
 5   Engine_Capacity_cc        498 non-null    float64
 6   Horsepower                500 non-null    int64  
 7   Torque_Nm                 499 non-null    float64
 8   Transmission              499 non-null    object 
 9   Drivetrain                499 non-null    object 
 10  Doors                     499 non-null    float64
 11  Seats                     499 non-null    float64
 12  Weight_kg                 499 non-null    float64
 13  Length_mm                 499 non-null    float64
 14  Width_mm  

In [510]:
def MissingValue(df):
    for col in df.columns:
        if df[col].isnull().any():
            if df[col].dtype == 'object':
                df[col].fillna(df[col].mode()[0], inplace=True)
            else:
                df[col].fillna(df[col].mean(), inplace=True)
    return df

In [511]:
def Encoder(df):
    encoder = LabelEncoder()
    for col in df.columns:
        if df[col].dtype == 'object':
            if df[col].nunique() <= 5:
                dummies = pd.get_dummies(df[col], prefix=col, dtype=int)
                df = pd.concat([df.drop(columns=[col]), dummies], axis=1)
            else:
                df[col] = encoder.fit_transform(df[col])
    return df

In [512]:
def Scaler(df):
    scaler = MinMaxScaler()
    num_col = df.select_dtypes(include=['float64', 'int64']).columns.drop('Price')
    df[num_col] = scaler.fit_transform(df[num_col])
    return df

In [513]:
MissingValue(df)
df = Encoder(df)
df = Scaler(df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [514]:
df.head(20)

Unnamed: 0,ID,Make,Model,Year,Engine_Capacity_cc,Horsepower,Torque_Nm,Doors,Seats,Weight_kg,...,Fuel_Type_Diesel,Fuel_Type_Electric,Fuel_Type_Hybrid,Fuel_Type_Petrol,Transmission_Automatic,Transmission_CVT,Transmission_Manual,Drivetrain_AWD,Drivetrain_FWD,Drivetrain_RWD
0,0.0,0.066667,0.811111,0.884615,0.819156,0.501002,0.37871,0.666667,0.4,0.931616,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.002004,0.666667,0.8,0.269231,0.250911,0.158317,0.120778,0.333333,0.6,0.147073,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.004008,0.533333,0.033333,0.115385,0.292479,0.174349,0.143296,0.0,0.4,0.2726,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.006012,0.4,0.288889,0.5,0.496465,0.232465,0.235415,0.666667,0.6,0.561593,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.008016,0.266667,0.211111,0.076923,0.423827,0.204409,0.207779,0.666667,0.6,0.21452,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
5,0.01002,0.333333,0.844444,0.615385,0.451468,0.176353,0.131013,0.666667,0.6,0.399532,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
6,0.012024,0.866667,0.688889,0.230769,0.405335,0.146293,0.136131,0.666667,0.6,0.407494,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
7,0.014028,0.466667,0.522222,0.538462,0.299121,0.11022,0.089048,0.666667,0.6,0.217799,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
8,0.016032,0.666667,0.055556,0.923077,0.364688,0.058116,0.064483,0.666667,0.6,0.219204,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
9,0.018036,0.0,0.722222,0.5,0.349261,0.158317,0.180143,0.666667,0.6,0.277752,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [515]:
from sklearn.model_selection import train_test_split

x = df.drop('Price', axis=1)
y = df['Price']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Linear Regression

In [516]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [517]:
lr.fit(x_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [518]:
y_pred = lr.predict(x_test)

In [519]:
from sklearn.metrics import r2_score

In [520]:
lr_score = r2_score(y_test, y_pred)

In [521]:
print(lr_score)

0.9624433409746045


# Decision Tree

In [522]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()

In [523]:
dt.fit(x_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [524]:
y_pred = dt.predict(x_test)

In [525]:
dt_score = r2_score(y_test, y_pred)

In [526]:
print(dt_score)

0.8887445188684189


# Random Forest

In [527]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()

In [528]:
rf.fit(x_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [529]:
y_pred = rf.predict(x_test)

In [530]:
rf_score = r2_score(y_test, y_pred)

In [531]:
print(rf_score)

0.9514055680440958


# Tabulate

In [532]:
from tabulate import tabulate

In [533]:
result = [
    ['Linear Regression', lr_score],
    ['Decision Tree', dt_score],
    ['Random Forest', rf_score]
]

headers = ['Algorithm', 'r2_score']

table = tabulate(result, headers=headers, tablefmt='grid', floatfmt='.2f')

print(table)

+-------------------+------------+
| Algorithm         |   r2_score |
| Linear Regression |       0.96 |
+-------------------+------------+
| Decision Tree     |       0.89 |
+-------------------+------------+
| Random Forest     |       0.95 |
+-------------------+------------+


# Joblib

In [534]:
from joblib import dump, load

In [537]:
dump(lr, 'car_price_predict.joblib')

['car_price_predict.joblib']

In [538]:
dump(dt, 'car_price_predict_dt.joblib')

['car_price_predict_dt.joblib']

In [539]:
dump(rf, 'car_price_predict_rf.joblib')

['car_price_predict_rf.joblib']