In [8]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder,StandardScaler

# Train a RandomForestRegressor on the car listing data to predict
# `filtered_price`, persisting the fitted encoder, scaler and model as pickles.

# Step 1: Load and clean data
overall_details = pd.read_csv('C:/Users/mange/Desktop/MY PROJECTS/CAR DEKHO/car_dekho_files/new files/overalldetails.csv')
# Rows without a target cannot be used for training. The index is reset right
# away because the encoded/scaled frames built below come from plain NumPy
# arrays and therefore carry a fresh 0..n-1 index; pd.concat(axis=1) aligns on
# index labels, so a gap left by dropna would pair features with the wrong
# rows and introduce all-NaN rows.
overall_details = overall_details.dropna(subset=['filtered_price']).reset_index(drop=True)

# Step 2: Define categorical and numerical features
categorical_cols = ['ft', 'bt', 'transmission', 'ownerNo', 'oem', 'model', 'modelYear', 'variantName']
numerical_cols = ['km']

# Step 3: One-Hot Encoding for categorical variables
# drop='first' removes one indicator per feature; sparse_output=False returns a
# dense array that can be wrapped in a DataFrame directly.
# NOTE(review): the encoder is fit on the full dataset, before the train/test
# split in Step 6, so the test rows influence the preprocessing.
one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_cols = one_hot_encoder.fit_transform(overall_details[categorical_cols])

# Save encoder so the same category-to-column mapping can be reused later
with open('encoder.pkl', 'wb') as encoder_file:
    pickle.dump(one_hot_encoder, encoder_file)

# Convert to DataFrame, sharing the row index of the cleaned data
encoded_df = pd.DataFrame(
    encoded_cols,
    columns=one_hot_encoder.get_feature_names_out(categorical_cols),
    index=overall_details.index,
)

# Step 4: Normalize numerical columns using StandardScaler
# NOTE(review): like the encoder, the scaler is fit before the split.
scaler = StandardScaler()
scaled_numerical = scaler.fit_transform(overall_details[numerical_cols])

# Save scaler
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Convert to DataFrame, sharing the row index of the cleaned data
scaled_df = pd.DataFrame(scaled_numerical, columns=numerical_cols, index=overall_details.index)

# Step 5: Combine all features
# Any column that is neither categorical nor numerical above (including the
# target) is carried over unchanged.
# NOTE(review): every carried-over column ends up in X — confirm they are all
# numeric, otherwise model fitting will fail.
remaining_df = overall_details.drop(columns=categorical_cols + numerical_cols)
overall_details = pd.concat([remaining_df, encoded_df, scaled_df], axis=1)

# Step 6: Split into train and test sets
X = overall_details.drop(columns=['filtered_price'])
y = overall_details['filtered_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Train the RandomForestRegressor model
# NOTE(review): no random_state is passed, so the fitted forest and the metrics
# printed below will differ slightly from run to run.
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

# Step 8: Save the trained model
with open('comparison_df_rf.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

print("RandomForest model saved to comparison_df_rf.pkl")

# Step 9: Make predictions using Random Forest
y_pred_rf = rf_model.predict(X_test)

# Step 10: Evaluate the Random Forest model
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)  # RMSE is in the same units as the target
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest RMSE: {rmse_rf}")
print(f"Random Forest R² Score: {r2_rf}")

# Step 11: Compare actual and predicted prices
# y_test.values discards the shuffled index so both columns line up by position.
comparison_df_rf = pd.DataFrame({'Actual Prices': y_test.values, 'Predicted Prices (Random Forest)': y_pred_rf})
print(comparison_df_rf.head())

RandomForest model saved to comparison_df_rf.pkl
Random Forest RMSE: 1.5253277264952654
Random Forest R² Score: 0.8237337445473566
   Actual Prices  Predicted Prices (Random Forest)
0          13.00                         12.657600
1           5.00                          5.988367
2           4.45                          5.076900
3           8.45                          7.889600
4           1.60                          3.406400
