In [None]:
# Import necessary libraries
import warnings

import numpy as np
import pandas as pd
from scipy.special import inv_boxcox
from scipy.stats import boxcox
from sklearn.impute import KNNImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler

warnings.simplefilter("ignore")

In [None]:
# Read the training and test splits from the working directory
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
# Drop the 'id' column as it's not useful

# Keep the test identifiers aside, then remove 'id' from both frames
id_test = df_test['id']
df_train, df_test = (frame.drop('id', axis=1) for frame in (df_train, df_test))

In [None]:
# Handle missing values in 'clean_title' by labelling them 'Unknown'.
# ('fuel_type' and 'accident' are imputed separately, per model/model_year group.)
#
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: the in-place form mutates an intermediate object and no
# longer updates the DataFrame under pandas Copy-on-Write.
df_train['clean_title'] = df_train['clean_title'].fillna('Unknown')
df_test['clean_title'] = df_test['clean_title'].fillna('Unknown')

In [None]:
# Fill missing values in 'fuel_type' and 'accident' using the mode of each
# ('model', 'model_year') group. Each frame is imputed from its own groups.
def _fill_with_group_mode(values):
    """Fill the missing entries of one group with that group's most frequent value.

    Parameters
    ----------
    values : pd.Series
        The column values belonging to a single (model, model_year) group.

    Returns
    -------
    pd.Series
        Same index as ``values``; NaNs replaced by the group's mode, or by
        'Unknown' when the group has no non-missing value at all.
    """
    modes = values.mode()  # computed once per group (mode() ignores NaN)
    fallback = modes.iloc[0] if not modes.empty else 'Unknown'
    return values.fillna(fallback)


for frame in (df_train, df_test):
    for column in ('fuel_type', 'accident'):
        frame[column] = frame.groupby(['model', 'model_year'])[column].transform(_fill_with_group_mode)

In [None]:
# Derive the vehicle age (relative to 2024) and discard the raw 'model_year' column
df_train = df_train.assign(car_age=2024 - df_train['model_year']).drop(columns=['model_year'])
df_test = df_test.assign(car_age=2024 - df_test['model_year']).drop(columns=['model_year'])

In [None]:
# Target-encode 'model' (high cardinality): each model name is replaced by its
# mean training price. Models that never occur in train have no entry in the
# lookup and therefore become NaN in the test frame.
# NOTE(review): the means are computed from every training row, including rows
# that are later held out for validation, so validation scores may be
# optimistic — confirm this is acceptable.
mean_price_per_model = df_train.groupby('model')['price'].mean()

# Numeric features pulled out of the free-text 'engine' description:
#   engine_power    - the number directly followed by "HP"
#   engine_cylinder - the number directly followed by "L" (presumably the
#                     displacement in litres rather than a cylinder count —
#                     TODO confirm the column name)
engine_patterns = {
    'engine_power': r'(\d+\.?\d*)HP',
    'engine_cylinder': r'(\d+\.?\d*)L',
}

for frame in (df_train, df_test):
    frame['model'] = frame['model'].map(mean_price_per_model)
    for feature_name, pattern in engine_patterns.items():
        frame[feature_name] = frame['engine'].str.extract(pattern, expand=False).astype(float)
    # The raw text column is not needed once the features are extracted
    frame.drop(columns=['engine'], inplace=True)

In [None]:
# Collapse colours seen fewer than `color_threshold` times in the training data
# into a single 'Other' level. The rare sets are derived from train only and
# then applied to both frames.
color_threshold = 100

ext_color_counts = df_train['ext_col'].value_counts()
int_color_counts = df_train['int_col'].value_counts()
rare_ext_colors = ext_color_counts[ext_color_counts < color_threshold].index
rare_int_colors = int_color_counts[int_color_counts < color_threshold].index

rare_colors_by_column = {'ext_col': rare_ext_colors, 'int_col': rare_int_colors}
for color_column, rare_colors in rare_colors_by_column.items():
    df_train[color_column] = df_train[color_column].replace(rare_colors, 'Other')
    df_test[color_column] = df_test[color_column].replace(rare_colors, 'Other')

In [None]:
# Fill missing engine_power and engine_cylinder with the TRAINING mean.
# The same training statistic is used for the test frame so that both frames
# are imputed consistently and nothing is estimated from test data.
# Values are assigned back rather than using fillna(inplace=True) on a column
# selection, which does not update the DataFrame under pandas Copy-on-Write.
for column in ('engine_power', 'engine_cylinder'):
    train_mean = df_train[column].mean()  # taken before any filling
    df_train[column] = df_train[column].fillna(train_mean)
    df_test[column] = df_test[column].fillna(train_mean)

In [None]:
# Columns that get one-hot encoded
categorical_columns = [
    'brand',
    'fuel_type',
    'transmission',
    'ext_col',
    'int_col',
    'accident',
    'clean_title',
]

# Every non-object column is treated as numerical, minus the target
numerical_columns = list(df_train.select_dtypes(exclude=['object']).columns)
numerical_columns.remove('price')

In [None]:
# One-hot encode the categorical columns. Categories unseen during fit are
# encoded as all-zero rows (handle_unknown='ignore').
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name
# first and fall back for older installations.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on train only, then apply the fitted encoder to test
encoded_train_values = encoder.fit_transform(df_train[categorical_columns])
encoded_test_values = encoder.transform(df_test[categorical_columns])
encoded_feature_names = encoder.get_feature_names_out(categorical_columns)

# Build labelled frames that share the index of the source frames so that the
# concatenation below lines rows up correctly
encoded_train = pd.DataFrame(encoded_train_values, columns=encoded_feature_names, index=df_train.index)
encoded_test = pd.DataFrame(encoded_test_values, columns=encoded_feature_names, index=df_test.index)

# Concatenate encoded categorical columns with numerical columns
X_train = pd.concat([df_train[numerical_columns], encoded_train], axis=1)
X_test = pd.concat([df_test[numerical_columns], encoded_test], axis=1)

# Force X_test to have exactly the training columns (missing ones filled with 0)
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [None]:
# Target variable
y_train = df_train['price']

# Box-Cox transform of the target. Box-Cox needs strictly positive input, so
# the price is shifted by 1; the shift is undone after the inverse transform.
y_train_boxcox, lambda_ = boxcox(y_train + 1)

# Impute any remaining missing feature values with KNN (fit on train only)
knn_imputer = KNNImputer(n_neighbors=5)
X_train_imputed = knn_imputer.fit_transform(X_train)
X_test_imputed = knn_imputer.transform(X_test)

# Scale all feature columns with RobustScaler (fit on train only)
scaler = RobustScaler()
X_train_imputed_scaled = scaler.fit_transform(X_train_imputed)
X_test_imputed_scaled = scaler.transform(X_test_imputed)

# Hold out 20% of the training rows for evaluation.
# NOTE(review): the imputer and scaler above were fit on all training rows,
# including the ones held out here, so the validation score may be slightly
# optimistic.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train_imputed_scaled, y_train_boxcox, test_size=0.2, random_state=42
)

# Baseline model: Ridge regression on the Box-Cox-transformed target
ridge_model = Ridge()
ridge_model.fit(X_train_split, y_train_split)
y_pred_boxcox = ridge_model.predict(X_val)

# Map predictions AND validation targets back to the original price scale with
# the same inverse transform: invert Box-Cox, then remove the +1 shift.
# (y_val is Box-Cox transformed, not log transformed, so np.exp is not its
# inverse.)
y_pred = inv_boxcox(y_pred_boxcox, lambda_) - 1
y_val_original = inv_boxcox(y_val, lambda_) - 1

# RMSE evaluation in the original scale
rmse = np.sqrt(mean_squared_error(y_val_original, y_pred))
print(f'Ridge Model RMSE: {rmse}')