In [13]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer

# Seed shared by the train/test split and the random forest so that a
# Restart & Run All reproduces the same fitted model and metrics.
RANDOM_STATE = 42

# Load data
df = pd.read_csv('./data/cardekho_imputated.csv')

# Drop unnecessary columns (reassign instead of mutating in place)
df = df.drop(columns=['car_name'])

# Independent and dependent variables
X = df.drop(['selling_price'], axis=1)
y = df['selling_price']

# Columns for numerical and categorical features
num_features = ['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats']
cat_features = ['seller_type', 'fuel_type', 'transmission_type', 'brand', 'model']

# Encode categorical features.
# 'brand' and 'model' are first mapped to integer codes; the one-hot encoder
# below then treats those codes as categories like the other columns.
le_brand = LabelEncoder()
le_model = LabelEncoder()
X['brand'] = le_brand.fit_transform(X['brand'])
X['model'] = le_model.fit_transform(X['model'])

# Transformers
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

# Fit transformers
# NOTE(review): both transformers are fitted on the full dataset before the
# train/test split, so test rows influence the scaling statistics and the
# category vocabulary. Consider fitting on the training rows only.
numeric_transformer.fit(X[num_features])
oh_transformer.fit(X[cat_features])

# Transform features: scaled numeric columns followed by the dense one-hot block
X_num = numeric_transformer.transform(X[num_features])
X_cat = oh_transformer.transform(X[cat_features]).toarray()
X_transformed = np.hstack([X_num, X_cat])

# Train-test split (70% train / 30% test)
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.3, random_state=RANDOM_STATE
)

# Train model. random_state is set explicitly: without it the bootstrap
# samples and feature subsets differ on every run, so metrics would drift.
rf_model = RandomForestRegressor(
    n_estimators=100,
    min_samples_split=2,
    max_features=8,
    max_depth=15,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
rf_model.fit(X_train, y_train)

y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

# Save transformers and model (persistence is currently disabled).
# When re-enabling, use a context manager so every file handle is closed:
# artifacts = {
#     'numeric_transformer.pkl': numeric_transformer,
#     'oh_transformer.pkl': oh_transformer,
#     'rf_model.pkl': rf_model,
#     'le_brand.pkl': le_brand,
#     'le_model.pkl': le_model,
# }
# for filename, artifact in artifacts.items():
#     with open(filename, 'wb') as fh:
#         pickle.dump(artifact, fh)

# NOTE(review): the message mentions saving although the dump calls above are
# commented out; kept unchanged so it matches the recorded cell output.
print("Data preparation, model training, and saving completed.")


Data preparation, model training, and saving completed.


In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [15]:
## Create a function to evaluate a model
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted target values, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error, and R2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it instead of calling
    # mean_squared_error a second time.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [16]:
    # Evaluate Train and Test dataset
# Metrics on the training split; evaluate_model returns (mae, rmse, r2).
(
    model_train_mae,
    model_train_rmse,
    model_train_r2,
) = evaluate_model(y_train, y_train_pred)

# Metrics on the held-out test split, unpacked in the same order.
(
    model_test_mae,
    model_test_rmse,
    model_test_r2,
) = evaluate_model(y_test, y_test_pred)

In [19]:
def _print_metrics(heading, rmse, mae, r2):
    """Print a heading followed by RMSE, MAE and R2, each to four decimals."""
    print(heading)
    print("- Root Mean Squared Error: {:.4f}".format(rmse))
    print("- Mean Absolute Error: {:.4f}".format(mae))
    print("- R2 Score: {:.4f}".format(r2))


# Training-set report, a separator line, then the test-set report.
_print_metrics('Model performance for Training set',
               model_train_rmse, model_train_mae, model_train_r2)
print('----------------------------------')
_print_metrics('Model performance for Test set',
               model_test_rmse, model_test_mae, model_test_r2)

Model performance for Training set
- Root Mean Squared Error: 258409.8957
- Mean Absolute Error: 129573.8503
- R2 Score: 0.9185
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 321665.3395
- Mean Absolute Error: 153796.9345
- R2 Score: 0.8624
