In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import  mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.linear_model import Lasso, Ridge, SGDRegressor, LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,ExtraTreesRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import h2o
from h2o.automl import H2OAutoML

In [3]:
df=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [4]:
df.head(10)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500
5,5,Audi,A6 2.0T Sport,2018,40950,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,White,–,None reported,Yes,29950
6,6,Audi,A8 L 3.0T,2016,62200,Gasoline,333.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,8-Speed A/T,Black,Black,None reported,Yes,28500
7,7,Chevrolet,Silverado 1500 1LZ,2016,102604,E85 Flex Fuel,355.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,White,Gray,None reported,Yes,12500
8,8,Ford,F-150 XLT,2020,38352,Gasoline,2.7L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,Snowflake White Pearl Metallic,Black,None reported,Yes,62890
9,9,BMW,M4 Base,2015,74850,Gasoline,425.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Black,Blue,None reported,Yes,4000


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            188533 non-null  int64 
 1   brand         188533 non-null  object
 2   model         188533 non-null  object
 3   model_year    188533 non-null  int64 
 4   milage        188533 non-null  int64 
 5   fuel_type     183450 non-null  object
 6   engine        188533 non-null  object
 7   transmission  188533 non-null  object
 8   ext_col       188533 non-null  object
 9   int_col       188533 non-null  object
 10  accident      186081 non-null  object
 11  clean_title   167114 non-null  object
 12  price         188533 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 18.7+ MB


## Preprocessing

In [6]:
df.isnull().sum()

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type        5083
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64

In [7]:
df.dropna(subset=['fuel_type','accident','clean_title'],inplace=True)

- Dropped Null columns

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162610 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            162610 non-null  int64 
 1   brand         162610 non-null  object
 2   model         162610 non-null  object
 3   model_year    162610 non-null  int64 
 4   milage        162610 non-null  int64 
 5   fuel_type     162610 non-null  object
 6   engine        162610 non-null  object
 7   transmission  162610 non-null  object
 8   ext_col       162610 non-null  object
 9   int_col       162610 non-null  object
 10  accident      162610 non-null  object
 11  clean_title   162610 non-null  object
 12  price         162610 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 17.4+ MB


In [9]:
df.isnull().sum()

id              0
brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64

## Encoding

In [10]:
encoder=LabelEncoder()
df['brand']=encoder.fit_transform(df['brand'])
df['model']=encoder.fit_transform(df['model'])
df['fuel_type']=encoder.fit_transform(df['fuel_type'])
df['engine']=encoder.fit_transform(df['engine'])
df['transmission']=encoder.fit_transform(df['transmission'])
df['ext_col']=encoder.fit_transform(df['ext_col'])
df['int_col']=encoder.fit_transform(df['int_col'])
df['accident']=encoder.fit_transform(df['accident'])
df['clean_title']=encoder.fit_transform(df['clean_title'])

In [11]:
test['brand']=encoder.fit_transform(test['brand'])
test['model']=encoder.fit_transform(test['model'])
test['fuel_type']=encoder.fit_transform(test['fuel_type'])
test['engine']=encoder.fit_transform(test['engine'])
test['transmission']=encoder.fit_transform(test['transmission'])
test['ext_col']=encoder.fit_transform(test['ext_col'])
test['int_col']=encoder.fit_transform(test['int_col'])
test['accident']=encoder.fit_transform(test['accident'])
test['clean_title']=encoder.fit_transform(test['clean_title'])

In [12]:
df.corr(numeric_only=True)


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
id,1.0,-0.000393,0.000307,-0.000586,-0.001549,0.00282,0.000481,-0.002226,-0.002831,0.000885,-0.000461,,-0.000543
brand,-0.000393,1.0,-0.038374,-0.054778,0.027051,0.053174,-0.133473,0.050962,0.003225,-0.018418,-0.014867,,0.005627
model,0.000307,-0.038374,1.0,-0.027378,0.069445,-0.002154,-0.051063,-0.028345,0.004018,0.069796,-0.031982,,-0.034331
model_year,-0.000586,-0.054778,-0.027378,1.0,-0.64603,-0.005855,0.325681,0.037799,-0.028926,-0.010451,0.227161,,0.228388
milage,-0.001549,0.027051,0.069445,-0.64603,1.0,-0.096946,-0.374033,-0.033936,0.024609,-0.00129,-0.299756,,-0.282086
fuel_type,0.00282,0.053174,-0.002154,-0.005855,-0.096946,1.0,0.044549,0.088597,-0.015023,0.001408,0.030344,,0.014424
engine,0.000481,-0.133473,-0.051063,0.325681,-0.374033,0.044549,1.0,0.010894,-0.039274,-0.003822,0.177838,,0.22019
transmission,-0.002226,0.050962,-0.028345,0.037799,-0.033936,0.088597,0.010894,1.0,0.008568,-0.001518,-0.020126,,0.013923
ext_col,-0.002831,0.003225,0.004018,-0.028926,0.024609,-0.015023,-0.039274,0.008568,1.0,0.064974,-0.018068,,-0.01569
int_col,0.000885,-0.018418,0.069796,-0.010451,-0.00129,0.001408,-0.003822,-0.001518,0.064974,1.0,-0.017836,,0.025883


## Selection

In [13]:
X=df.drop(columns={'id','price'})
y=df.price

In [14]:
X

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,31,493,2007,213000,2,110,38,306,71,1,0
1,28,925,2002,143250,2,358,38,257,10,0,0
2,9,1567,2002,136731,1,627,38,37,71,1,0
3,16,755,2017,19500,2,846,49,28,14,1,0
4,36,1072,2021,7388,2,252,23,28,10,1,0
...,...,...,...,...,...,...,...,...,...,...,...
188527,9,402,1999,110000,2,604,38,298,71,1,0
188528,8,601,2017,49000,2,849,49,298,10,1,0
188529,36,204,2018,28600,2,757,31,298,14,0,0
188530,36,221,2021,13650,2,904,23,298,14,1,0


In [15]:
y

0          4200
1          4999
2         13900
3         45000
4         97500
          ...  
188527    14500
188528    27500
188529    30000
188530    86900
188532    28995
Name: price, Length: 162610, dtype: int64

In [16]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=33)

In [17]:
X_train = X
X_valid = X
y_train = y
y_valid = y
X_test = test.drop(columns='id')

## Scalling

In [18]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
y_train = y_train.values.reshape(-1, 1)
y_valid = y_valid.values.reshape(-1, 1)

# Machine Learning Models

# H2O ML

In [82]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,40 mins 11 secs
H2O_cluster_timezone:,Africa/Cairo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,15 days
H2O_cluster_name:,H2O_from_python_George_Hany_0900iq
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.794 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


In [89]:
data = h2o.import_file("train.csv") 

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [98]:
x=data.drop(['price','id'])
y='price'

In [99]:
x

brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes
Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes
Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capability,A/T,Blue,Gray,None reported,Yes
Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes
Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes
Audi,A6 2.0T Sport,2018,40950,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,White,–,None reported,Yes
Audi,A8 L 3.0T,2016,62200,Gasoline,333.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,8-Speed A/T,Black,Black,None reported,Yes
Chevrolet,Silverado 1500 1LZ,2016,102604,E85 Flex Fuel,355.0HP 5.3L 8 Cylinder Engine Flex Fuel Capability,A/T,White,Gray,None reported,Yes
Ford,F-150 XLT,2020,38352,Gasoline,2.7L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,Snowflake White Pearl Metallic,Black,None reported,Yes
BMW,M4 Base,2015,74850,Gasoline,425.0HP 3.0L Straight 6 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Blue,None reported,Yes


In [100]:
y

'price'

In [102]:
aml = H2OAutoML(max_models=10, seed=1)

aml.train(x=x.columns, y=y, training_frame=data)

AutoML progress: |█
20:45:16.960: AutoML: XGBoost is not available; skipping it.
20:45:16.965: _train param, Dropping bad and constant columns: [clean_title]

██████████████
20:52:20.424: XRT_1_AutoML_3_20240913_204516 [DRF XRT (Extremely Randomized Trees)] failed: java.lang.AssertionError

█
20:52:24.236: _train param, Dropping bad and constant columns: [clean_title]

███████████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),7/10
# GBM base models (used / total),4/6
# DeepLearning base models (used / total),2/2
# DRF base models (used / total),1/1
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5
Metalearner fold_column,

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,950340.56,12773.619,958451.6,944740.5,968672.1,938856.75,940981.75
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mae,19205.355,537.3933,19332.105,18855.023,20072.85,19051.488,18715.307
mean_residual_deviance,5275782700.0,1291163100.0,5781576000.0,4268717000.0,7322456000.0,4590820400.0,4415343000.0
mse,5275782700.0,1291163100.0,5781576000.0,4268717000.0,7322456000.0,4590820400.0,4415343000.0
null_deviance,234257530000000.0,49871952000000.0,254459720000000.0,195798150000000.0,313012850000000.0,207828350000000.0,200188630000000.0
r2,0.154785,0.0276335,0.1398002,0.17654,0.1130368,0.1732676,0.1712804
residual_deviance,199079430000000.0,49582755000000.0,218884690000000.0,161229450000000.0,277586990000000.0,171816040000000.0,165880030000000.0
rmse,72229.414,8565.467,76036.68,65335.418,85571.35,67755.59,66448.05
rmsle,0.5393578,0.0018816,0.5391628,0.5372526,0.5414741,0.5410617,0.537838


In [103]:
lb = aml.leaderboard
print(lb)

model_id                                                    rmse          mse      mae       rmsle    mean_residual_deviance
StackedEnsemble_AllModels_1_AutoML_3_20240913_204516     72660.6  5.27956e+09  19208.2    0.539031               5.27956e+09
StackedEnsemble_BestOfFamily_1_AutoML_3_20240913_204516  72690.9  5.28397e+09  19321.1    0.548021               5.28397e+09
DeepLearning_1_AutoML_3_20240913_204516                  72921.2  5.3175e+09   18639.9  nan                      5.3175e+09
DeepLearning_grid_1_AutoML_3_20240913_204516_model_1     73087.4  5.34177e+09  18817.5  nan                      5.34177e+09
GBM_1_AutoML_3_20240913_204516                           73127.1  5.34757e+09  19661.1  nan                      5.34757e+09
GBM_2_AutoML_3_20240913_204516                           73471.8  5.3981e+09   19622.4  nan                      5.3981e+09
GBM_3_AutoML_3_20240913_204516                           73521.2  5.40536e+09  19694.9  nan                      5.40536e+09
GB

In [104]:
best_model = aml.leader

In [105]:
test_data = h2o.import_file("test.csv")  # Replace with your test data file path
predictions = best_model.predict(test_data)

# View predictions
predictions.head()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%




predict
17541.0
73378.3
53005.6
27761.7
30557.4
20014.3
12615.5
43389.2
78304.4
33646.8


In [114]:

predictions_array = predictions.as_data_frame().values  


predictions_flat = predictions_array.ravel()  

output = pd.DataFrame({'id': test['id'], 'price': predictions_flat})
output.to_csv('submission.csv', index=False)






## Lasso

In [19]:
modelasso=Lasso()
modelasso.fit(X_train,y_train)

In [20]:
modelasso.score(X_train,y_train)*100

9.977159748664356

In [21]:
modelasso.score(X_valid,y_valid)*100

9.977159748664356

In [22]:
y_predlasso=modelasso.predict(X_valid)

In [23]:
print(mean_squared_error(y_valid,y_predlasso))

5273642440.779263


## Ridge

In [24]:
modelridge=Ridge(alpha=0.5)
modelridge.fit(X_train,y_train)

In [25]:
modelridge.score(X_valid,y_valid)*100

9.97716509619202

In [26]:
modelridge.score(X_train,y_train)*100

9.97716509619202

In [27]:
y_predridge=modelridge.predict(X_valid)

In [28]:
mean_squared_error(y_valid,y_predridge)

5273642127.514887

## SGDRegressor

In [29]:
modelSGD=SGDRegressor()
modelSGD.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [30]:
modelSGD.score(X_train,y_train)*100

9.934594351894056

In [31]:
modelSGD.score(X_valid,y_valid)*100

9.934594351894056

In [32]:
y_predSGD=modelSGD.predict(X_valid)

In [33]:
mean_squared_error(y_valid,y_predSGD)

5276135971.113228

## Linear Regressor 

In [34]:
modellinear=LinearRegression()
modellinear.fit(X_train,y_train)

In [35]:
modellinear.score(X_train,y_train)*100

9.977165280131706

In [36]:
modellinear.score(X_valid,y_valid)*100

9.977165280131706

In [37]:
y_predLinear=modellinear.predict(X_valid)

In [38]:
mean_squared_error(y_train,y_predLinear)

5273642116.739487

## Decision Tree Regressor

In [39]:
ModelDT=DecisionTreeRegressor()
ModelDT.fit(X_train,y_train)

In [40]:
ModelDT.score(X_train,y_train)*100

100.0

In [41]:
ModelDT.score(X_valid,y_valid)*100

100.0

In [42]:
y_predDT=ModelDT.predict(X_valid)

In [43]:
mean_squared_error(y_valid,y_predDT)

0.0

## RandomForest Regressor

In [44]:
ModelRF=RandomForestRegressor()
ModelRF.fit(X_train,y_train)

  return fit_method(estimator, *args, **kwargs)


In [45]:
ModelRF.score(X_train,y_train)*100

86.3807408825713

In [46]:
ModelRF.score(X_valid,y_valid)*100

86.3807408825713

In [47]:
y_predRF=ModelRF.predict(X_valid)

In [48]:
mean_squared_error(y_valid,y_predRF)

797832002.3353882

## Bagging Regressor

In [49]:
ModelB=BaggingRegressor()
ModelB.fit(X_train,y_train)

  return column_or_1d(y, warn=True)


In [50]:
ModelB.score(X_train,y_train)*100

80.68701573231529

In [51]:
ModelB.score(X_valid,y_valid)*100

80.68701573231529

In [52]:
y_predB=ModelB.predict(X_valid)


In [53]:
mean_squared_error(y_valid,y_predB)

1131377028.4053354

## Extra Trees Regressor

In [54]:
modelET=ExtraTreesRegressor()
modelET.fit(X_train,y_train)

  return fit_method(estimator, *args, **kwargs)


In [55]:
modelET.score(X_train,y_train)*100

99.9999982635968

In [56]:
modelET.score(X_valid,y_valid)*100

99.9999982635968

In [57]:
y_predET=modelET.predict(X_valid)

In [58]:
mean_squared_error(y_valid,y_predET)

101.72051491421192

## AdaBoost Regressor

In [59]:
modelADA=AdaBoostRegressor()
modelADA.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [60]:
modelADA.score(X_train,y_train)*100

11.188526827161338

In [61]:
modelADA.score(X_valid,y_valid)*100

11.188526827161338

In [62]:
y_predADA=modelADA.predict(X_valid)

In [63]:
mean_squared_error(y_valid,y_predADA)

5202679151.699637

## XGB Regressor

In [64]:
modelXGB=XGBRegressor()
modelXGB.fit(X_train,y_train)

In [65]:
modelXGB.score(X_train,y_train)*100

44.1348102093083

In [66]:
modelXGB.score(X_valid,y_valid)*100

44.1348102093083

In [67]:
y_predXGB=modelXGB.predict(X_valid)

In [68]:
mean_squared_error(y_valid,y_predXGB)

3272647641.641245

## Catboost Regressor

In [69]:
ModelCAT=CatBoostRegressor(verbose=False)
ModelCAT.fit(X_train,y_train)

<catboost.core.CatBoostRegressor at 0x181281fdd00>

In [70]:
ModelCAT.score(X_train,y_train)*100

35.78099273079608

In [71]:
ModelCAT.score(X_valid,y_valid)*100

35.78099273079608

In [72]:
y_predCAT=ModelCAT.predict(X_valid)

In [73]:
mean_squared_error(y_valid,y_predCAT)

3762023962.9637885

## LGBM Regressor 

In [74]:
modelLGB=LGBMRegressor()
modelLGB.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003847 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1209
[LightGBM] [Info] Number of data points in the train set: 162610, number of used features: 10
[LightGBM] [Info] Start training from score 40816.990253


In [75]:
modelLGB.score(X_train,y_train)*100

23.264647398048645

In [76]:
modelLGB.score(X_valid,y_valid)*100

23.264647398048645

In [77]:
y_predLGB=modelLGB.predict(X_valid)

In [78]:
mean_squared_error(y_valid,y_predLGB)

4495245996.016084

In [79]:
y_pred=modelET.predict(X_test)



In [80]:
#output = pd.DataFrame({'id': test['id'], 'Price': y_pred})
#output.to_csv('submission.csv', index=False)

In [81]:
#output

Unnamed: 0,id,Price
0,188533,145630.84
1,188534,145630.84
2,188535,145630.84
3,188536,145630.84
4,188537,145630.84
...,...,...
125685,314218,145630.84
125686,314219,145630.84
125687,314220,145630.84
125688,314221,145630.84
