In [13]:
import pandas as pd

In [14]:
# Load the car listings CSV, using its first column as the row index,
# then preview the first few rows.
data = pd.read_csv('data/refine_car.csv', index_col=[0])
data.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel


In [15]:
# Separate the regression target (Price) from the predictor columns.
y = data['Price']
X = data.drop(columns=['Price'])

In [16]:
# Preview the feature frame to confirm Price is no longer among the columns.
X.head()

Unnamed: 0,name,company,year,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,36000,Diesel
4,Ford Figo,Ford,2012,41000,Diesel


In [17]:
# Column groups; each group is sent through its own preprocessing branch.
numerical_columns = [
    'year',
    'kms_driven',
]
categorical_columns = [
    'name',
    'company',
    'fuel_type',
]

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Categorical branch: one-hot encode, then scale the resulting indicator columns.
cat_pipe = Pipeline(
    steps=[
        # handle_unknown="ignore" encodes a category that was not seen during fit
        # as an all-zero row instead of raising at transform time (e.g. a car
        # name that is absent from the fitted data).
        (
            "one_hot_encoding",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
        ),
        # with_mean=False: scale by standard deviation only, without centering.
        ("scaling", StandardScaler(with_mean=False)),
    ]
)

In [21]:
from sklearn.compose import ColumnTransformer
# Send each column group through its own transformer: plain standard scaling for
# the numerical columns, the one-hot + scaling pipeline for the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical_scaling", StandardScaler(), numerical_columns),
        ("categorical_pipeline", cat_pipe, categorical_columns),
    ],
)

In [22]:
# Build the model matrix from the raw frame rather than from `X` itself, so this
# cell can be re-run safely: after the first run `X` is a NumPy array, which the
# string column selectors in `preprocessor` cannot index.
# NOTE(review): the preprocessor is fitted on every row before the train/test
# split further down, so test rows influence the scaling statistics and the
# encoder's category list. Consider splitting first and fitting on the training
# rows only.
X = preprocessor.fit_transform(data.drop(['Price'], axis=1))

In [24]:
# (rows, encoded feature columns) of the preprocessed matrix.
X.shape

(815, 284)

In [25]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and therefore every score computed below) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [27]:
from sklearn.linear_model import LinearRegression
# Baseline model: linear regression with default settings.
lr = LinearRegression()

In [29]:
# Fit the baseline on the training split only.
lr.fit(X_train, y_train)

In [30]:
# Predict on the held-out split with the baseline fitted above. This must use
# `lr`: no top-level `model` exists at this point on a fresh kernel.
y_test_pred = lr.predict(X_test)

In [38]:
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error

In [35]:
# Score the baseline with R2. accuracy_score is a classification metric and is
# not meaningful for continuous price predictions, so it is not used here.
r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

In [36]:
# NOTE(review): the recorded output below (about -2.7e27) means the predictions
# were wildly off on the held-out split; investigate before trusting this model.
print(f"R2 Score is {r2}")

R2 Score is -2.699646810982194e+27


In [39]:
# Mean squared error of the baseline predictions on the held-out split.
mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
mse

2.672701034400147e+38

## Testing on new data

In [41]:
# One hand-built listing, laid out with the same feature columns as the training
# frame, used to smoke-test the fitted preprocessor and model.
columns = ['name', 'company', 'year', 'kms_driven', 'fuel_type']
new_df = pd.DataFrame.from_records(
    [('Hyundai Santro Xing', 'Hyundai', 2007, 45000, 'Petrol')],
    columns=columns,
)
new_df

Unnamed: 0,name,company,year,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,45000,Petrol


In [43]:
# Apply the already-fitted preprocessor (transform only, no refit) so the new row
# is encoded into the same feature layout the models were trained on.
new_df_processed = preprocessor.transform(new_df)

In [45]:
# Sanity check: one row, and the same number of columns as the training matrix.
new_df_processed.shape

(1, 284)

In [46]:
# Predict the price of the new listing with the linear-regression baseline.
out = lr.predict(new_df_processed)

In [47]:
# Show the predicted price (an array with one element, one per input row).
print(out)

[101376.]


In [48]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor
)

In [49]:
# Candidate regressors to compare, keyed by display name.
# Every estimator with a stochastic component is seeded so the comparison (and
# the "best model" chosen from it) is reproducible across kernel restarts.
# K-Neighbors takes no seed.
MODEL_SEED = 42
models = {
    "Random Forest": RandomForestRegressor(random_state=MODEL_SEED),
    "Decision Tree": DecisionTreeRegressor(random_state=MODEL_SEED),
    "Gradient Boosting": GradientBoostingRegressor(random_state=MODEL_SEED),
    "K-Neighbors": KNeighborsRegressor(),
    "XGBRegressor": XGBRegressor(random_state=MODEL_SEED),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False, random_state=MODEL_SEED),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=MODEL_SEED),
}

In [52]:
def evaluate_models(X_train, y_train, X_test, y_test, models):
    """Fit every candidate model and score it on the held-out split.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed to each model's ``fit``.
    X_test, y_test
        Held-out features and targets used for scoring.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    dict
        Maps each model name to ``r2_score(y_test, predictions)``, in the
        iteration order of ``models``.

    Notes
    -----
    Each estimator is fitted in place, so the objects stored in ``models``
    are trained once this function returns.
    """
    report = {}
    # Iterate name/estimator pairs directly instead of rebuilding the key and
    # value lists on every pass.
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_test_pred = model.predict(X_test)
        report[name] = r2_score(y_test, y_test_pred)
    return report

In [53]:
# Fit and score every candidate on the same train/test split.
model_report: dict = evaluate_models(
    X_train,
    y_train,
    X_test,
    y_test,
    models,
)

In [54]:
# Highest R2 among the evaluated models; max() needs no prior sort.
best_model_score = max(model_report.values())
print(f"Best Model Score is {best_model_score}")

Best Model Score is 0.7956695443535798


In [57]:
# Name of the top-scoring model: pick the key whose score is largest (the first
# such key on ties), without building temporary key/value lists.
best_model_name = max(model_report, key=model_report.get)
print(f"Best model = {best_model_name}")

Best model = Random Forest


In [58]:
# Retrieve the winning estimator. evaluate_models fitted the objects stored in
# `models` in place, so this instance is already trained.
best_model = models[best_model_name]

In [None]:
best_model.predict(