In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv('~/Desktop/RESUME_projects_datasets/DynamicPricePredictionEngine.csv')

In [3]:
print("Checking for missing values:")
print(df.isnull().sum())

df = df.drop(['name'], axis=1)

if 'category' in df.columns:
    df['category'] = pd.factorize(df['category'])[0]

X = df.drop('currentPrice', axis=1)
y = df['currentPrice']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

Checking for missing values:
name                                 0
category                             0
basePrice                            0
customerRating                       0
salesLastWeek                        0
viewsLastWeek                        0
searchesLastWeek                     0
totalPurchasesLastSixMonths          0
totalUniqueCustomersLastSixMonths    0
currentPrice                         0
dtype: int64


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regression': DecisionTreeRegressor(),
    'Random Forest Regression': RandomForestRegressor()
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results[name] = {'MSE': mse, 'R2': r2}
    print(f'{name} - MSE: {mse:.2f}, R2: {r2:.2f}')

# Hyperparameter Tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Best parameters: {best_params}')
print(f'Best score: {best_score:.2f}')

# Train the best model
best_rf_model = grid_search.best_estimator_

# Predict using the best model
y_pred_best = best_rf_model.predict(X_test)

# Evaluate the best model
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(f'Best Random Forest Model - MSE: {mse_best:.2f}, R2: {r2_best:.2f}')

import os
import pickle

directory = '/home/gaurav/Desktop/RESUME_projects_datasets&Models'
if not os.path.exists(directory):
    os.makedirs(directory)

model_path = os.path.join(directory, 'best_dppe_model.pkl')
with open(model_path, 'wb') as f:
    pickle.dump(best_rf_model, f)

print(f'Best Random Forest model saved to {model_path}')


Linear Regression - MSE: 30.72, R2: 0.99
Decision Tree Regression - MSE: 86.98, R2: 0.98
Random Forest Regression - MSE: 39.98, R2: 0.99
Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   1.6s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=5