In [1]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv("test.csv")

train_data.head()
test_data.head()

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,...,dereg_value,mileage,omv,arf,opc_scheme,lifespan,eco_category,features,accessories,indicative_price
0,1303772,Honda Vezel 1.5A X,honda,vezel,4614,2015.0,,29-apr-2015,suv,parf car,...,9582.0,112000.0,19229.0,9229.0,,,uncategorized,"powerful 1.5l i-vtec engine producing 128bhp, ...","pioneer touch screen with reverse camera, 16"" ...",
1,1323166,Mazda 3 1.6A SP (COE till 10/2027),mazda,3,extremely well maintained and in pristine cond...,2007.0,,26-oct-2007,mid-sized sedan,"coe car, premium ad car, low mileage car",...,13644.0,120000.0,14347.0,15782.0,,,uncategorized,fuel efficient 1.6l 4-cylinder inline 16-valve...,"multi-function steering wheel, keyless entry, ...",
2,1308405,MINI Cooper S Countryman 2.0A,mini,cooper,1 owner! beautiful island blue color! eurokars...,2019.0,,27-mar-2020,sports car,parf car,...,54818.0,43000.0,39863.0,47809.0,,,uncategorized,"output of 141kw, 189bhp at 5000rpm to 6000rpm,...","18"" sports rims, sports leather seats, navigat...",
3,1216706,Toyota Vios 1.5A G,toyota,vios,fully agent maintain! genuine low mileage at 5...,2019.0,,28-jun-2019,mid-sized sedan,"parf car, premium ad car",...,26363.0,53300.0,15573.0,15573.0,,,uncategorized,"1.5l 4 cylinder 16 valves dohc vvt-i engine, 7...","push start button, toyota factory player, reve...",
4,1298206,Mazda 3 HB 1.5A,mazda,3,workshop check/sta evaluation available. accid...,2015.0,,19-nov-2015,hatchback,"parf car, premium ad car",...,15197.0,149000.0,18097.0,13097.0,,,uncategorized,1.5l 4 cylinder inline dohc 16 valves skyactiv...,factory fitted audio with audio & multi functi...,


In [3]:
print('Training data number = {}'.format(train_data.shape[0]))
print('Test data number = {}\n'.format(test_data.shape[0]))
train_data.columns 

Training data number = 25000
Test data number = 10000



Index(['listing_id', 'title', 'make', 'model', 'description', 'manufactured',
       'original_reg_date', 'reg_date', 'type_of_vehicle', 'category',
       'transmission', 'curb_weight', 'power', 'fuel_type', 'engine_cap',
       'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value',
       'mileage', 'omv', 'arf', 'opc_scheme', 'lifespan', 'eco_category',
       'features', 'accessories', 'indicative_price', 'price'],
      dtype='object')

In [4]:
train_data.isna().sum()

listing_id               0
title                    0
make                  1316
model                    0
description            680
manufactured             7
original_reg_date    24745
reg_date                 0
type_of_vehicle          0
category                 0
transmission             0
curb_weight            307
power                 2640
fuel_type            19121
engine_cap             596
no_of_owners            18
depreciation           507
coe                      0
road_tax              2632
dereg_value            220
mileage               5304
omv                     64
arf                    174
opc_scheme           24838
lifespan             22671
eco_category             0
features               843
accessories           3813
indicative_price     25000
price                    0
dtype: int64

In [5]:
# 2. Drop unnecessary columns
train_data.drop(columns=['listing_id', 'original_reg_date', 'category', 'opc_scheme', 'lifespan', 'eco_category', 'indicative_price', 'description'], inplace=True)
test_data.drop(columns=['listing_id', 'original_reg_date', 'category', 'opc_scheme', 'lifespan', 'eco_category', 'indicative_price', 'description'], inplace=True)

# 3. Extract information from 'title' (engine capacity and COE expiry)
def extract_features(title):    
    # Extract COE expiry date (e.g., COE till 01/2027)
    coe_info = re.search(r'\(COE till (\d{2}/\d{4})\)', title)
    coe_expiry = coe_info.group(1) if coe_info else None
    
    # Extract make (first word in title, which is the car make/brand)
    make = title.split()[0] if title else None

    return pd.Series([coe_expiry, make])

# Apply the extract_features function to both train and test data
train_data[['coe_expiry', 'extracted_make']] = train_data['title'].apply(extract_features)
test_data[['coe_expiry', 'extracted_make']] = test_data['title'].apply(extract_features)

# Fill missing 'make' values with the extracted make from title
train_data['make'] = train_data['make'].fillna(train_data['extracted_make'])
test_data['make'] = test_data['make'].fillna(test_data['extracted_make'])


# Drop the 'title' column and 'extracted_make', 'extracted_engine_cap' since we no longer need them
train_data.drop(columns=['title', 'extracted_make'], inplace=True)
test_data.drop(columns=['title', 'extracted_make'], inplace=True)

def extract_year_month(date_str):
    try:
        month, year = date_str.split('/')
        return int(year), int(month)
    except:
        return None, None  # Handle any unexpected format

# Apply the function to 'coe_expiry' in train_data and test_data
train_data['coe_expiry_year'], train_data['coe_expiry_month'] = zip(*train_data['coe_expiry'].apply(extract_year_month))
test_data['coe_expiry_year'], test_data['coe_expiry_month'] = zip(*test_data['coe_expiry'].apply(extract_year_month))

# Optionally drop the original 'coe_expiry' column if it's no longer needed
train_data.drop(columns=['coe_expiry'], inplace=True)
test_data.drop(columns=['coe_expiry'], inplace=True)

# Imputation for 'road_tax' based only on 'engine_cap'
train_data['engine_cap'] = train_data.groupby(['curb_weight'])['engine_cap'].transform(lambda x: x.fillna(x.mean()))
test_data['engine_cap'] = test_data.groupby(['curb_weight'])['engine_cap'].transform(lambda x: x.fillna(x.mean()))


In [6]:
knn_imputer = KNNImputer(n_neighbors=5)
num_cols = ['curb_weight', 'engine_cap']

train_data[num_cols] = knn_imputer.fit_transform(train_data[num_cols])
test_data[num_cols] = knn_imputer.transform(test_data[num_cols])

In [7]:
train_data.isna().sum()

make                    0
model                   0
manufactured            7
reg_date                0
type_of_vehicle         0
transmission            0
curb_weight             0
power                2640
fuel_type           19121
engine_cap              0
no_of_owners           18
depreciation          507
coe                     0
road_tax             2632
dereg_value           220
mileage              5304
omv                    64
arf                   174
features              843
accessories          3813
price                   0
coe_expiry_year     19032
coe_expiry_month    19032
dtype: int64

In [8]:
# 1. Convert 'reg_date' to datetime format
train_data['reg_date'] = pd.to_datetime(train_data['reg_date'], format='%d-%b-%Y')
test_data['reg_date'] = pd.to_datetime(test_data['reg_date'], format='%d-%b-%Y')
# 2. Extract useful information from 'reg_date'
train_data['reg_year'] = train_data['reg_date'].dt.year
train_data['reg_month'] = train_data['reg_date'].dt.month
train_data['reg_day'] = train_data['reg_date'].dt.day
test_data['reg_year'] = test_data['reg_date'].dt.year
test_data['reg_month'] = test_data['reg_date'].dt.month
test_data['reg_day'] = test_data['reg_date'].dt.day

train_data.drop(columns=['reg_date'], inplace=True)
test_data.drop(columns=['reg_date'], inplace=True)

train_data['manufactured'].fillna(train_data['reg_year'], inplace=True)
test_data['manufactured'].fillna(test_data['reg_year'], inplace=True)

# 4. Create 'car_age' and 'power_to_weight' ratio
current_year = 2024
train_data['car_age'] = current_year - train_data['manufactured']
test_data['car_age'] = current_year - test_data['manufactured']

def fill_no_of_owners_based_on_manufactured(df):
    # Group by 'manufactured' and fill 'no_of_owners' based on the mode within each group
    return df['no_of_owners'].fillna(df.groupby('manufactured')['no_of_owners'].transform(lambda x: x.mode()[0] if not x.mode().empty else x.median()))

# Apply this function to both train_data and test_data
train_data['no_of_owners'] = fill_no_of_owners_based_on_manufactured(train_data)
test_data['no_of_owners'] = fill_no_of_owners_based_on_manufactured(test_data)

train_data['coe_expiry_year'] = train_data['coe_expiry_year'].fillna(train_data['reg_year'] + 20)
test_data['coe_expiry_year'] = test_data['coe_expiry_year'].fillna(test_data['reg_year'] + 20)

train_data['coe_expiry_month'] = train_data['coe_expiry_month'].fillna(train_data['reg_month'])
test_data['coe_expiry_month'] = test_data['coe_expiry_month'].fillna(test_data['reg_month'])

In [9]:
train_data.isna().sum()

make                    0
model                   0
manufactured            0
type_of_vehicle         0
transmission            0
curb_weight             0
power                2640
fuel_type           19121
engine_cap              0
no_of_owners            0
depreciation          507
coe                     0
road_tax             2632
dereg_value           220
mileage              5304
omv                    64
arf                   174
features              843
accessories          3813
price                   0
coe_expiry_year         0
coe_expiry_month        0
reg_year                0
reg_month               0
reg_day                 0
car_age                 0
dtype: int64

In [10]:
# 5. Indicator for missing values
for col in ['power', 'curb_weight', 'mileage']:
    train_data[col + '_missing'] = train_data[col].isna().astype(int)
    test_data[col + '_missing'] = test_data[col].isna().astype(int)

# 6. Group-based imputation for 'mileage' and 'road_tax' based on 'make' and 'model'
group_cols = ['type_of_vehicle', 'manufactured']

# Fill 'mileage' and 'road_tax' based on mean within the refined group
train_data['mileage'] = train_data.groupby(group_cols)['mileage'].transform(lambda x: x.fillna(x.mean()))
test_data['mileage'] = test_data.groupby(group_cols)['mileage'].transform(lambda x: x.fillna(x.mean()))

# 1. Select rows where both engine_cap and road_tax are not missing
train_non_missing = train_data.dropna(subset=['mileage', 'manufactured'])

# 2. Fit a linear regression model to find the relationship between engine_cap and road_tax
X_train = train_non_missing[['manufactured']].astype(float)  # Use engine_cap as the predictor
y_train = train_non_missing['mileage'].astype(float)      # road_tax is the target variable

model = LinearRegression()
model.fit(X_train, y_train)

# 3. Use the fitted model to predict missing road_tax values based on engine_cap
# Identify rows with missing road_tax but known engine_cap
train_missing_rt = train_data[(train_data['mileage'].isna()) & (train_data['manufactured'].notna())]

# Predict road_tax for rows where it's missing based on engine_cap
train_data.loc[train_missing_rt.index, 'mileage'] = model.predict(train_missing_rt[['manufactured']].astype(float))

# For test data (if applicable), follow the same process:
test_missing_rt = test_data[(test_data['mileage'].isna()) & (test_data['manufactured'].notna())]
test_data.loc[test_missing_rt.index, 'mileage'] = model.predict(test_missing_rt[['manufactured']].astype(float))

# Imputation for 'road_tax' based only on 'engine_cap'
train_data['road_tax'] = train_data.groupby('engine_cap')['road_tax'].transform(lambda x: x.fillna(x.mean()))
test_data['road_tax'] = test_data.groupby('engine_cap')['road_tax'].transform(lambda x: x.fillna(x.mean()))

In [11]:
train_data.isna().sum()

make                       0
model                      0
manufactured               0
type_of_vehicle            0
transmission               0
curb_weight                0
power                   2640
fuel_type              19121
engine_cap                 0
no_of_owners               0
depreciation             507
coe                        0
road_tax                 967
dereg_value              220
mileage                    0
omv                       64
arf                      174
features                 843
accessories             3813
price                      0
coe_expiry_year            0
coe_expiry_month           0
reg_year                   0
reg_month                  0
reg_day                    0
car_age                    0
power_missing              0
curb_weight_missing        0
mileage_missing            0
dtype: int64

In [12]:
# 1. Select rows where both engine_cap and road_tax are not missing
train_non_missing = train_data.dropna(subset=['engine_cap', 'road_tax'])

# 2. Fit a linear regression model to find the relationship between engine_cap and road_tax
X_train = train_non_missing[['engine_cap']].astype(float)  # Use engine_cap as the predictor
y_train = train_non_missing['road_tax'].astype(float)      # road_tax is the target variable

model = LinearRegression()
model.fit(X_train, y_train)

# 3. Use the fitted model to predict missing road_tax values based on engine_cap
# Identify rows with missing road_tax but known engine_cap
train_missing_rt = train_data[(train_data['road_tax'].isna()) & (train_data['engine_cap'].notna())]

# Predict road_tax for rows where it's missing based on engine_cap
train_data.loc[train_missing_rt.index, 'road_tax'] = model.predict(train_missing_rt[['engine_cap']].astype(float))

# For test data (if applicable), follow the same process:
test_missing_rt = test_data[(test_data['road_tax'].isna()) & (test_data['engine_cap'].notna())]
test_data.loc[test_missing_rt.index, 'road_tax'] = model.predict(test_missing_rt[['engine_cap']].astype(float))

In [13]:
train_data.isna().sum()

make                       0
model                      0
manufactured               0
type_of_vehicle            0
transmission               0
curb_weight                0
power                   2640
fuel_type              19121
engine_cap                 0
no_of_owners               0
depreciation             507
coe                        0
road_tax                   0
dereg_value              220
mileage                    0
omv                       64
arf                      174
features                 843
accessories             3813
price                      0
coe_expiry_year            0
coe_expiry_month           0
reg_year                   0
reg_month                  0
reg_day                    0
car_age                    0
power_missing              0
curb_weight_missing        0
mileage_missing            0
dtype: int64

In [14]:
# 7. Machine Learning Imputation for 'power' using RandomForest
def impute_with_model(train_data, target_col, feature_cols):
    # Impute missing values in feature columns using KNNImputer
    knn_imputer = KNNImputer(n_neighbors=5)
    train_data[feature_cols] = knn_imputer.fit_transform(train_data[feature_cols])
    
    model = LGBMRegressor()

    # Split into known (non-missing) and unknown (missing)
    known = train_data[train_data[target_col].notna()]
    unknown = train_data[train_data[target_col].isna()]

    # Train the model on non-missing values
    model.fit(known[feature_cols], known[target_col])

    # Predict missing values in target column
    train_data.loc[train_data[target_col].isna(), target_col] = model.predict(unknown[feature_cols])
    
    return train_data

feature_cols = ['curb_weight', 'engine_cap', 'road_tax', 'omv']
train_data = impute_with_model(train_data, 'power', feature_cols)
test_data = impute_with_model(test_data, 'power', feature_cols)

train_data['power_to_weight'] = train_data['power'] / train_data['curb_weight']
test_data['power_to_weight'] = test_data['power'] / test_data['curb_weight']


# 8. KNN-based imputation for continuous features
knn_imputer = KNNImputer(n_neighbors=5)
num_cols = ['depreciation', 'dereg_value', 'omv', 'arf']

train_data[num_cols] = knn_imputer.fit_transform(train_data[num_cols])
test_data[num_cols] = knn_imputer.transform(test_data[num_cols])

# Step 1: Fill all 'fuel_type' with 'diesel' for cars manufactured in or before 2005
train_data.loc[train_data['manufactured'] <= 2005, 'fuel_type'] = train_data.loc[train_data['manufactured'] <= 2005, 'fuel_type'].fillna('diesel')
test_data.loc[test_data['manufactured'] <= 2005, 'fuel_type'] = test_data.loc[test_data['manufactured'] <= 2005, 'fuel_type'].fillna('diesel')

# Step 2: Fill all 'fuel_type' with 'electric' for specific makes
electric_makes = ['aion', 'byd', 'tesla', 'dfsk', 'kyc', 'mg', 'polestar', 'shineray', 'smart', 'sokon']
train_data.loc[train_data['make'].isin(electric_makes), 'fuel_type'] = train_data.loc[train_data['make'].isin(electric_makes), 'fuel_type'].fillna('electric')
test_data.loc[test_data['make'].isin(electric_makes), 'fuel_type'] = test_data.loc[test_data['make'].isin(electric_makes), 'fuel_type'].fillna('electric')

# Step 3: Fill remaining 'NaN' values based on mode of 'fuel_type' grouped by 'make'
def fill_fuel_type_by_make(df):
    for make in df['make'].unique():
        make_mode = df.loc[df['make'] == make, 'fuel_type'].mode()
        if not make_mode.empty:
            # Fill by mode of the 'fuel_type' for that make
            df.loc[df['make'] == make, 'fuel_type'] = df.loc[df['make'] == make, 'fuel_type'].fillna(make_mode[0])
        else:
            # If all 'fuel_type' are NaN for this make, use overall mode
            overall_mode = df['fuel_type'].mode()[0]
            df.loc[df['make'] == make, 'fuel_type'] = df.loc[df['make'] == make, 'fuel_type'].fillna(overall_mode)
    return df

# Apply this logic for both train and test datasets
train_data = fill_fuel_type_by_make(train_data)
test_data = fill_fuel_type_by_make(test_data)

# 9. Fill missing values for categorical columns using mode
cat_cols = ['make', 'model', 'transmission', 'fuel_type', 'type_of_vehicle']
for col in cat_cols:
    train_data[col].fillna(train_data[col].mode()[0], inplace=True)
    test_data[col].fillna(test_data[col].mode()[0], inplace=True)

# 10. Label Encoding for categorical columns
onehot_enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Apply OneHotEncoder to both train and test data
train_encoded = onehot_enc.fit_transform(train_data[cat_cols])
test_encoded = onehot_enc.transform(test_data[cat_cols])

# Convert the encoded data to DataFrames with appropriate column names
train_encoded_df = pd.DataFrame(train_encoded, columns=onehot_enc.get_feature_names_out(cat_cols))
test_encoded_df = pd.DataFrame(test_encoded, columns=onehot_enc.get_feature_names_out(cat_cols))

# Merge the encoded columns back to the original train and test data
train_data = pd.concat([train_data.reset_index(drop=True), train_encoded_df], axis=1).drop(columns=cat_cols)
test_data = pd.concat([test_data.reset_index(drop=True), test_encoded_df], axis=1).drop(columns=cat_cols)
"""
train_data['features'].fillna('', inplace=True)
test_data['features'].fillna('', inplace=True)

train_data['accessories'].fillna('', inplace=True)
test_data['accessories'].fillna('', inplace=True)

# 2. TF-IDF Vectorization for textual columns (features and accessories)
tfidf_features = TfidfVectorizer(max_features=10)
tfidf_accessories = TfidfVectorizer(max_features=10)

# 3. Fit and transform text fields
features_matrix_train = tfidf_features.fit_transform(train_data['features']).toarray()
features_matrix_test = tfidf_features.transform(test_data['features']).toarray()

accessories_matrix_train = tfidf_accessories.fit_transform(train_data['accessories']).toarray()
accessories_matrix_test = tfidf_accessories.transform(test_data['accessories']).toarray()

# Convert the TF-IDF matrices to DataFrames with appropriate column names
features_df_train = pd.DataFrame(features_matrix_train, columns=tfidf_features.get_feature_names_out())
features_df_test = pd.DataFrame(features_matrix_test, columns=tfidf_features.get_feature_names_out())

accessories_df_train = pd.DataFrame(accessories_matrix_train, columns=tfidf_accessories.get_feature_names_out())
accessories_df_test = pd.DataFrame(accessories_matrix_test, columns=tfidf_accessories.get_feature_names_out())

# 4. Merge the encoded columns back to the original train and test data
train_data = pd.concat([train_data.reset_index(drop=True), features_df_train, accessories_df_train], axis=1)
test_data = pd.concat([test_data.reset_index(drop=True), features_df_test, accessories_df_test], axis=1)
"""

# 5. Drop the original 'features' and 'accessories' columns after vectorization
train_data.drop(columns=['features', 'accessories'], inplace=True)
test_data.drop(columns=['features', 'accessories'], inplace=True)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 941
[LightGBM] [Info] Number of data points in the train set: 22360, number of used features: 4
[LightGBM] [Info] Start training from score 140.908095
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 931
[LightGBM] [Info] Number of data points in the train set: 8914, number of used features: 4
[LightGBM] [Info] Start training from score 140.861230


In [15]:
print("Columns with missing values in train_data:")
print(train_data.isna().sum()[train_data.isna().sum() > 0])

# For test_data: Print columns that contain NaN values
print("Columns with missing values in test_data:")
print(test_data.isna().sum()[test_data.isna().sum() > 0])

Columns with missing values in train_data:
Series([], dtype: int64)
Columns with missing values in test_data:
Series([], dtype: int64)


In [16]:
train_data.head()
train_data.columns

Index(['manufactured', 'curb_weight', 'power', 'engine_cap', 'no_of_owners',
       'depreciation', 'coe', 'road_tax', 'dereg_value', 'mileage',
       ...
       'type_of_vehicle_hatchback', 'type_of_vehicle_luxury sedan',
       'type_of_vehicle_mid-sized sedan', 'type_of_vehicle_mpv',
       'type_of_vehicle_others', 'type_of_vehicle_sports car',
       'type_of_vehicle_stationwagon', 'type_of_vehicle_suv',
       'type_of_vehicle_truck', 'type_of_vehicle_van'],
      dtype='object', length=944)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(train_data.drop(columns = ['price']), train_data['price'], test_size=0.8, random_state=42)

X_train
X_test
y_train
y_test

17013     88800.0
6549       7800.0
18919     46800.0
9011      54000.0
1164      73800.0
           ...   
17306     51000.0
24494     46800.0
8800      14000.0
18904    510000.0
4326      42800.0
Name: price, Length: 5000, dtype: float64

In [18]:
print("NaNs in X_train:", X_train.isna().sum().sum())

# Check for NaNs in the test set
print("NaNs in X_test:", X_test.isna().sum().sum())

NaNs in X_train: 0
NaNs in X_test: 0


In [19]:
model2 = BaggingRegressor(random_state=50, max_samples = 950)
model2.fit(X_train, y_train)
y_pred2 = model2.predict(X_test)

In [20]:
model3 = RandomForestRegressor(random_state=50,  max_features=0.43, min_samples_leaf=1,)
model3.fit(X_train, y_train)
y_pred3 = model3.predict(X_test)

In [21]:
model4 = GradientBoostingRegressor(
    random_state=50, 
    min_samples_split = 6, 
    min_samples_leaf = 3, 
    max_depth = 7
)

model4.fit(X_train, y_train) 

y_pred4 = model4.predict(X_test)

In [22]:
training_residuals = y_train - model3.predict(X_train)
model4.fit(X_train, training_residuals)
pred_residuals = model4.predict(X_test)
y_pred5 = pred_residuals + model3.predict(X_test)

In [23]:
y_pred = [y_pred2, y_pred3, y_pred4, y_pred5]

In [24]:
def rmse_score(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))

In [25]:
mean_price = y_train.mean()
baseline_predictions = [mean_price] * len(y_test)
baseline_rmse = sqrt(mean_squared_error(y_test, baseline_predictions))

print(f"Baseline RMSE (Predicting Mean): {baseline_rmse:.2f}")

method = 1
for test in y_pred:
    r2_test = r2_score(y_test, test)
    rmse = rmse_score(y_test, test)
    print(f"R-squared (Test): {r2_test}")
    print(f"RMSE{method}: {rmse}\n")
    method+=1 

Baseline RMSE (Predicting Mean): 170308.83
R-squared (Test): 0.9103311655335076
RMSE1: 50990.200069287195

R-squared (Test): 0.9679591913011824
RMSE2: 30480.191280835472

R-squared (Test): 0.9650147426545383
RMSE3: 31849.930577881845

R-squared (Test): 0.9736991042415074
RMSE4: 27615.39271321924



In [26]:
training_residuals = y_train - model3.predict(X_train)
model4.fit(X_train, training_residuals)
pred_residuals = model4.predict(test_data)
final_predictions = pred_residuals + model3.predict(test_data)

# Step 4: Prepare the output DataFrame in the format Id, Predicted
output_df = pd.DataFrame({
    'Id': range(len(final_predictions)),  # Create 'Id' column (index-based)
    'Predicted': final_predictions        # Add predictions from model5
})

# Step 5: Round predictions to 1 decimal place
output_df['Predicted'] = output_df['Predicted'].round(1)

# Step 6: Export the predictions to a CSV file
output_df.to_csv('submission_v16.csv', index=False)
print("Done")

Done
