In [53]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e9/sample_submission.csv
/kaggle/input/playground-series-s4e9/train.csv
/kaggle/input/playground-series-s4e9/test.csv


In [54]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor


In [55]:
traindata=pd.read_csv("/kaggle/input/playground-series-s4e9/train.csv")
testdata=pd.read_csv("/kaggle/input/playground-series-s4e9/test.csv")
traindata.head(5)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [56]:
traindata.isna().sum()

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type        5083
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64

In [57]:
from sklearn.preprocessing import LabelEncoder
def preprocess (data):
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
    #filling cat coloumn nulls
    data[categorical_columns] = data[categorical_columns].fillna(data[categorical_columns].mode().iloc[0])
    label_encoder = LabelEncoder()
    for col in categorical_columns:
        data[col] = label_encoder.fit_transform(data[col])
    return data

In [58]:
traindata=preprocess(traindata)

In [59]:
traindata.isna().sum()

id              0
brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64

In [60]:
X=traindata.drop(['price','id'],axis=1)
y=traindata['price']

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [62]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [63]:
# Function to evaluate and print results
def evaluate_model(y_true, y_pred, model_name):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"\n{model_name}:")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"Mean Absolute Error: {mae:.4f}")
    print(f"R2 Score: {r2:.4f}")

    return {'MSE': mse, 'MAE': mae, 'R2': r2}

In [64]:
# Random Forest Regressor
RandomForestModel = RandomForestRegressor(random_state=42)
RandomForestModel.fit(X_train_scaled, y_train)
RF_y_pred = RandomForestModel.predict(X_test_scaled)
rf_results = evaluate_model(y_test, RF_y_pred, "Random Forest")


Random Forest:
Mean Squared Error: 5732675975.8438
Mean Absolute Error: 22144.1061
R2 Score: -0.0308


In [65]:
# Gradient Boosting Regressor
GradientBoostingModel = GradientBoostingRegressor(random_state=42)
GradientBoostingModel.fit(X_train_scaled, y_train)
GB_y_pred = GradientBoostingModel.predict(X_test_scaled)
gb_results = evaluate_model(y_test, GB_y_pred, "Gradient Boosting")


Gradient Boosting:
Mean Squared Error: 4655176161.2776
Mean Absolute Error: 19721.1239
R2 Score: 0.1629


In [66]:
# Decision Tree Regressor
DecisionTreeModel = DecisionTreeRegressor(random_state=42)
DecisionTreeModel.fit(X_train_scaled, y_train)
DT_y_pred = DecisionTreeModel.predict(X_test_scaled)
dt_results = evaluate_model(y_test, DT_y_pred, "Decision Tree")


Decision Tree:
Mean Squared Error: 13159294605.8681
Mean Absolute Error: 29421.6229
R2 Score: -1.3663


In [67]:
# K-Nearest Neighbors Regressor
KNNModel = KNeighborsRegressor()
KNNModel.fit(X_train_scaled, y_train)
KNN_y_pred = KNNModel.predict(X_test_scaled)
knn_results = evaluate_model(y_test, KNN_y_pred, "KNN")


KNN:
Mean Squared Error: 5709983213.1117
Mean Absolute Error: 22295.6368
R2 Score: -0.0268


In [68]:
# XGBoost Regressor
XGBoostModel = XGBRegressor(random_state=42)
XGBoostModel.fit(X_train_scaled, y_train)
XGB_y_pred = XGBoostModel.predict(X_test_scaled)
xgb_results = evaluate_model(y_test, XGB_y_pred, "XGBoost")


XGBoost:
Mean Squared Error: 5018516913.6896
Mean Absolute Error: 20254.6254
R2 Score: 0.0976


In [69]:
# CatBoost Regressor
CatBoostModel = CatBoostRegressor(random_state=42, verbose=0)
CatBoostModel.fit(X_train_scaled, y_train)
CatBoost_y_pred = CatBoostModel.predict(X_test_scaled)
catboost_results = evaluate_model(y_test, CatBoost_y_pred, "CatBoost")


CatBoost:
Mean Squared Error: 4644269220.7270
Mean Absolute Error: 19750.2755
R2 Score: 0.1649


In [70]:
# Voting Regressor
VotingModel = VotingRegressor([
    ('rf', RandomForestModel),
    ('gb', GradientBoostingModel),
    ('dt', DecisionTreeModel),
    ('knn', KNNModel),
    ('xgb', XGBoostModel),
    ('catboost', CatBoostModel)
])
VotingModel.fit(X_train_scaled, y_train)
Voting_y_pred = VotingModel.predict(X_test_scaled)
voting_results = evaluate_model(y_test, Voting_y_pred, "Voting Regressor")


Voting Regressor:
Mean Squared Error: 5061021410.1220
Mean Absolute Error: 20488.7391
R2 Score: 0.0899


In [71]:
testdata=preprocess(testdata)

In [72]:
testdata.head(5)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,26,1388,2015,98000,2,326,16,302,10,1,0
1,188534,26,1375,2020,9142,3,787,31,261,14,1,0
2,188535,14,636,2022,28121,2,541,3,302,57,1,0
3,188536,3,182,2016,61258,2,193,39,259,14,1,0
4,188537,3,181,2018,59000,2,365,38,127,14,1,0


In [73]:
test_predictions = CatBoostModel.predict(testdata.drop(['id'], axis=1))
# Create submission file
submission = pd.DataFrame({'id': testdata['id'], 'price': test_predictions})
submission.to_csv('submission.csv', index=False)
print("\nSubmission file created successfully!")


Submission file created successfully!
