In [None]:
# Attach Google Drive to the Colab runtime; the CSV paths used later in this
# notebook live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# prompt:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk

import re
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.optimizers import Adam



# Load the data.
# The training frame is split into features (X) and target (y); the test
# features are stacked underneath the training features so that later cells
# transform both together. 'id' becomes the row index of the stacked frame.
df = pd.read_csv("/content/drive/My Drive/kagglex-cohort4/train.csv")
test = pd.read_csv("/content/drive/My Drive/kagglex-cohort4/test.csv")

y = df['price']
X = df.drop(columns=['price'])

data = pd.concat([X, test], axis=0).set_index('id')


# Identify outliers using the IQR method
def impute_outliers_with_median(df, column):
    """Overwrite IQR outliers in ``df[column]`` with the column median.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile. Quartiles and the median are taken
    from the column before any replacement happens. Values for which neither
    comparison is true (NaN included) are left as they are.

    Args:
        df: DataFrame that owns the column; the column is reassigned on it.
        column: Label of the numeric column to clean.

    Returns:
        None. The result is written back to ``df[column]``.
    """
    series = df[column]
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile
    floor = first_quartile - 1.5 * spread
    ceiling = third_quartile + 1.5 * spread
    replacement = series.median()

    def _replace_if_outlier(value):
        # Same test as the fences above: strictly outside [floor, ceiling].
        if value < floor or value > ceiling:
            return replacement
        return value

    df[column] = series.apply(_replace_if_outlier)

def impute_outliers_with_mean(df, column):
    """Replace IQR outliers in ``df[column]`` with the column mean.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile. The mean is computed from the
    column as it is on entry, i.e. it still includes the outliers that are
    about to be replaced. NaN entries compare false against both fences and
    are therefore left untouched.

    Args:
        df: DataFrame that owns the column; the column is reassigned on it.
        column: Label of the numeric column to clean.

    Returns:
        None. The result is written back to ``df[column]``.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    mean_value = values.mean()

    # Impute outliers with the mean. The comparison is done on the whole
    # column at once instead of calling a Python lambda per element.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    df[column] = values.mask(is_outlier, mean_value)



# Preview of the stacked frame. This is not the last expression of the cell,
# so the value is computed but not displayed.
data.head()
# ---------------------------------------------------------------------------
# Disabled experiments: every line below is commented out and does not run.
# ---------------------------------------------------------------------------
# One-hot encode the 'accident' and 'fuel_type' columns
#accident_dummies = pd.get_dummies(data['accident'], prefix='accident').astype(int)
#fuel_type_dummies = pd.get_dummies(data['fuel_type'], prefix='fuel_type').astype(int)

# Concatenate the one-hot encoded columns with the original dataset
#data = pd.concat([data, accident_dummies], axis=1)

# Drop the original 'accident' and 'fuel_type' columns
#data.drop(columns=['accident'], inplace=True)

# Display the first few rows of the updated dataset to verify changes
#data.head()



# Disabled: collapse brands that occur fewer than 10 times into 'Other'.
#brand_counts = data['brand'].value_counts()
#brands_to_group = brand_counts[brand_counts < 10].index.tolist()

# Replace these brands with 'Other'
#data['brand'] = data['brand'].apply(lambda x: 'Other' if x in brands_to_group else x)

# Extract horsepower, engine size, number of cylinders from the engine column
def extract_engine_features(engine_str):
    """Parse horsepower, displacement and cylinder count from an engine string.

    The three attributes are looked up independently:
      * horsepower  - a number immediately followed by ``HP``
      * engine size - a number immediately followed by ``L``
      * cylinders   - an integer followed by one whitespace and ``Cylinder``

    Args:
        engine_str: Free-text engine description. Anything that is not a
            ``str`` (for example NaN or None from a missing cell) is treated
            as "no information" instead of raising ``TypeError``.

    Returns:
        A ``(horsepower, liters, cylinders)`` tuple of ``float``, ``float``
        and ``int``; each element is ``None`` when its pattern is not found.
    """
    # re.search only accepts str/bytes; missing cells arrive as float NaN.
    if not isinstance(engine_str, str):
        return None, None, None

    hp = re.search(r'(\d+\.?\d*)HP', engine_str)
    liters = re.search(r'(\d+\.?\d*)L', engine_str)
    cylinders = re.search(r'(\d+)\sCylinder', engine_str)

    hp_value = float(hp.group(1)) if hp else None
    liters_value = float(liters.group(1)) if liters else None
    cylinders_value = int(cylinders.group(1)) if cylinders else None

    return hp_value, liters_value, cylinders_value

# Expand every engine description into three numeric columns. Descriptions
# that lack an attribute produce a missing value in the matching column.
data[['horsepower', 'engine_size', 'cylinders']] = data['engine'].apply(
    lambda x: pd.Series(extract_engine_features(x))
)

# Fill the gaps: mean for the continuous columns, median for the cylinder
# count. The filled Series is assigned back to the frame; calling
# `.fillna(..., inplace=True)` on `data[col]` is chained assignment, which
# warns in recent pandas and does not update `data` under Copy-on-Write.
data['horsepower'] = data['horsepower'].fillna(data['horsepower'].mean())
data['engine_size'] = data['engine_size'].fillna(data['engine_size'].mean())
data['cylinders'] = data['cylinders'].fillna(data['cylinders'].median())

# Replace IQR outliers with the column median in each numeric feature.
for numeric_col in ('milage', 'model_year', 'horsepower', 'engine_size', 'cylinders'):
    impute_outliers_with_median(data, numeric_col)

# Integer-encode each categorical column, keeping the fitted encoder per column.
categorical_cols = ['brand', 'model','fuel_type','transmission', 'ext_col', 'int_col']

label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder

# Append one 0/1 indicator column per distinct 'accident' value.
accident_dummies = pd.get_dummies(data['accident'], prefix='accident').astype(int)
data = pd.concat([data, accident_dummies], axis=1)

# Standardise the numeric features and the integer-encoded categoricals.
scaled_cols = [
    'milage', 'horsepower', 'engine_size', 'cylinders', 'model_year',
    'brand', 'model', 'fuel_type', 'transmission', 'ext_col', 'int_col',
]
scaler = StandardScaler()
data[scaled_cols] = scaler.fit_transform(data[scaled_cols])

# Drop the raw columns that are not fed to the models ('accident' is already
# represented by its indicator columns).
Xt = data.drop(columns=['engine', 'clean_title', 'accident'])

# The training rows were stacked first when `data` was built, so the number
# of target values marks where the training rows end. Derived from the data
# rather than hard-coded so it stays correct if the input files change.
split_point = len(y)

# Create training and test sets
X = Xt.iloc[:split_point]
test_data = Xt.iloc[split_point:]

# Log-transform the target. It is computed from the raw prices so that
# re-running this cell does not take the logarithm of an already logged `y`.
y = np.log(df['price'])
print("\nData with log-transformed target:")
print(y)

# Baseline: predict the mean log-price for every training row.
y_pred_baseline = [y.mean()] * len(y)
mae_baseline = mean_absolute_error(y, y_pred_baseline)
mse_baseline = mean_squared_error(y, y_pred_baseline)
print( np.sqrt(mse_baseline))  # baseline RMSE
print( mae_baseline)           # baseline MAE
X.shape




Data with log-transformed target:
0         9.305651
1         9.017968
2         9.615805
3        11.058795
4         8.968269
           ...    
54268    10.275051
54269     8.779557
54270     9.849559
54271    12.180575
54272    10.532070
Name: price, Length: 54273, dtype: float64
0.8270705486270719
0.6481301630642998


(54273, 13)

In [None]:
# Summarise dtypes and non-null counts of the training feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54273 entries, 0 to 54272
Data columns (total 13 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   brand                                            54273 non-null  int64  
 1   model                                            54273 non-null  int64  
 2   model_year                                       54273 non-null  float64
 3   milage                                           54273 non-null  float64
 4   fuel_type                                        54273 non-null  int64  
 5   transmission                                     54273 non-null  int64  
 6   ext_col                                          54273 non-null  int64  
 7   int_col                                          54273 non-null  int64  
 8   horsepower                                       54273 non-null  float64
 9   engine_size                      

In [None]:

from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Base random forest for the grid search. A fixed seed makes the bootstrap
# samples and feature draws repeatable, so the search scores and the selected
# estimator are the same on every run.
Rr = RandomForestRegressor(random_state=42)

In [None]:
# Search grid for the random forest:
#   n_estimators -> 40, 45, ..., 75
#   max_depth    -> 50, 55, ..., 95
paramsr = dict(
    n_estimators=range(40, 80, 5),
    max_depth=range(50, 100, 5),
)

In [None]:
# Exhaustive 4-fold cross-validated search over `paramsr`, run in one process.
modelRr = GridSearchCV(
    estimator=Rr,
    param_grid=paramsr,
    cv=4,
    n_jobs=1,
    verbose=1,
)
modelRr.fit(X, y)

Fitting 4 folds for each of 80 candidates, totalling 320 fits


In [None]:
# Take the estimator selected by the grid search and display it.
Rr1=modelRr.best_estimator_
Rr1

In [None]:
# Mean absolute error of the selected forest on the rows it was fitted on.
# The target is log(price), so this is a training error in log units, not a
# held-out estimate.
y_pred_trainingR = Rr1.predict(X)
mae_training = mean_absolute_error(y_true=y, y_pred=y_pred_trainingR)
mae_training

0.129799525824693

In [None]:
# Mean squared error of the forest's predictions on the training rows (log-price units).
mean_squared_error(y,y_pred_trainingR)

0.0361195489777437

In [None]:
import xgboost as xgb

# Gradient-boosted regressor trained on squared error, seeded for repeatability.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
)


In [None]:
# Search grid for the XGBoost regressor: 1 * 1 * 3 * 2 * 2 * 2 * 2 * 2 = 96
# combinations. Tree count and depth are fixed; the remaining entries vary.
# NOTE(review): subsample 0.1 trains each tree on 10% of the rows; it may have
# been meant as 1.0 (compare colsample_bytree) - confirm before changing.
param_grid = dict(
    n_estimators=[300],
    max_depth=[8],
    learning_rate=[0.05, 0.01, 0.03],
    subsample=[0.8, 0.1],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 0.05],
    reg_alpha=[0, 0.05],
    reg_lambda=[1.0, 10.0],
)


In [None]:
# 5-fold grid search over `param_grid`; candidates are ranked by negated MSE.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
)
grid_search.fit(X, y)


Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [None]:
# Take the estimator selected by the search and score it on the training rows.
best_xgb_model = grid_search.best_estimator_
y_pred = best_xgb_model.predict(X)

# Evaluate performance. The value reported is the square root of the MSE, so
# it is labelled as RMSE; it is in log-price units because `y` is log(price).
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')


Mean Squared Error: 0.4390199811726895


In [None]:
# Report the hyper-parameter combination selected by the XGBoost grid search.
print(f'Best parameters found: {grid_search.best_params_}')

Best parameters found: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.03, 'max_depth': 8, 'n_estimators': 300, 'reg_alpha': 0.05, 'reg_lambda': 10.0, 'subsample': 0.8}


In [None]:
# Build the frame of test rows to score.
#
# The test rows were already transformed together with the training rows:
# same engine-feature fill values, same outlier fences, same fitted label
# encoders and the same fitted scaler. They are stored in `test_data`.
# Re-reading test.csv and fitting fresh LabelEncoders and a fresh
# StandardScaler on the test rows alone (and grouping rare brands into
# 'Other', which was never done for training) assigns different integer codes
# and different scaling than the model saw during fitting. The jointly
# preprocessed rows are therefore reused instead of recomputed.
#
# The raw 'engine', 'accident' and 'clean_title' columns and an 'id' column
# are re-attached so that code expecting the raw test columns next to the
# features still finds them. Rows are matched on 'id'.
raw_test_columns = test.set_index('id')[['engine', 'accident', 'clean_title']]
data = test_data.join(raw_test_columns).reset_index()


In [None]:
# Score the test rows that were preprocessed together with the training rows
# (`test_data`, indexed by 'id'). They share the label encodings, scaling and
# outlier handling the model was fitted on, and have the same columns in the
# same order as `X`. A frame whose encoders and scaler were re-fitted on the
# test set alone would not.
test_ids = pd.Series(test_data.index, name='id')

# Predict prices using the trained model
predicted_prices = best_xgb_model.predict(test_data)

# The model was trained on log(price); np.exp maps predictions back to prices.
# Combine the id and predicted prices into a dataframe
results = pd.DataFrame({'id': test_ids, 'predicted_price': np.exp(predicted_prices).flatten()})

# Save the results to a CSV file
results.to_csv('/content/drive/My Drive/kagglex-cohort4/predicted_prices2.csv' , index=False)

In [None]:
# Tabulate the per-candidate cross-validation results of the random forest search.
# NOTE(review): the output saved under this cell lists n_estimators from 20 and
# only two split columns, while the grid defined in this notebook starts at 40
# and the search uses cv=4 - the saved output looks stale; re-run to refresh.
cv_resultsRr=pd.DataFrame(modelRr.cv_results_)
cv_resultsRr

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,3.43718,0.139177,0.205957,0.019548,50,20,"{'max_depth': 50, 'n_estimators': 20}",0.604286,0.615718,0.610002,0.005716,54
1,4.606909,0.685922,0.246778,0.013871,50,25,"{'max_depth': 50, 'n_estimators': 25}",0.606437,0.617725,0.612081,0.005644,49
2,5.499811,0.864687,0.309997,0.002631,50,30,"{'max_depth': 50, 'n_estimators': 30}",0.6108,0.62188,0.61634,0.00554,30
3,5.881258,0.587554,0.353531,0.034881,50,35,"{'max_depth': 50, 'n_estimators': 35}",0.611439,0.621757,0.616598,0.005159,29
4,8.519347,0.586776,0.602069,0.195628,50,40,"{'max_depth': 50, 'n_estimators': 40}",0.613894,0.623384,0.618639,0.004745,13
5,7.771398,0.831719,0.41259,0.003662,50,45,"{'max_depth': 50, 'n_estimators': 45}",0.6144,0.625068,0.619734,0.005334,3
6,3.125133,0.001588,0.212795,0.002963,55,20,"{'max_depth': 55, 'n_estimators': 20}",0.605972,0.615503,0.610738,0.004765,53
7,4.693194,0.540978,0.286289,0.055844,55,25,"{'max_depth': 55, 'n_estimators': 25}",0.607299,0.617849,0.612574,0.005275,47
8,6.451142,0.356149,0.303592,0.033382,55,30,"{'max_depth': 55, 'n_estimators': 30}",0.60792,0.619463,0.613692,0.005772,40
9,8.142634,2.626214,0.385952,0.029036,55,35,"{'max_depth': 55, 'n_estimators': 35}",0.613063,0.621794,0.617428,0.004365,23
