In [1]:
import pandas as pd

In [2]:
file_path = r'C:\Users\HP\Documents\my-data/amazon_cleaned.csv'
df = pd.read_csv(file_path)
df.head(5)

Unnamed: 0,product_id,product_name,discounted_price,actual_price,discount_percentage,rating,rating_count,Main category,Sub category
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,399.0,1099.0,0.64,4.2,24269.0,Computers & Accessories,Accessories & Peripherals
1,B098NS6PVG,Ambrane Unbreakable 60W / 3A Fast Charging 1.5...,199.0,349.0,0.43,4.0,43994.0,Computers & Accessories,Accessories & Peripherals
2,B096MSW6CT,Sounce Fast Phone Charging Cable & Data Sync U...,199.0,1899.0,0.9,3.9,7928.0,Computers & Accessories,Accessories & Peripherals
3,B08HDJ86NZ,boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...,329.0,699.0,0.53,4.2,94363.0,Computers & Accessories,Accessories & Peripherals
4,B08CF3B7N1,Portronics Konnect L 1.2M Fast Charging 3A 8 P...,154.0,399.0,0.61,4.2,16905.0,Computers & Accessories,Accessories & Peripherals


In [13]:
# import the models 
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [14]:
# check for null values
df.isna().sum()

product_id             0
product_name           0
discounted_price       0
actual_price           0
discount_percentage    0
rating                 0
rating_count           0
Main category          0
Sub category           0
dtype: int64

In [15]:
# fill the null values with the most common rating
df['rating_count'].fillna(df['rating_count'].mode()[0], inplace=True)
df.isna().sum()

product_id             0
product_name           0
discounted_price       0
actual_price           0
discount_percentage    0
rating                 0
rating_count           0
Main category          0
Sub category           0
dtype: int64

In [16]:
# define the x and y columns to be used
X = df[['discounted_price', 'rating_count', 'discount_percentage', 'rating']]
y = df['actual_price']

# split the values using train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=28)

In [17]:
# scale the train and test values
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
# create a dict for models evaluation
models = {
    'SVR': SVR(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Random Forest Regressor': RandomForestRegressor()
}

In [24]:
# create a function for the evaluation of the models
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    print(f'The scoring for {name}: {model.score(X_test_scaled, y_test)}')

The scoring for SVR: -0.11808040185335433
The scoring for Decision Tree Regressor: 0.9411688607705365
The scoring for Random Forest Regressor: 0.9554148421387924


In [26]:
# check for the best parameteres
param_grid = {
    'n_estimators': [20, 50, 75, 100, 150, 200],
    'max_depth': [None, 10, 20, 40, 50, 100],
    'min_samples_split': [1, 2, 4, 5, 10],
    'min_samples_leaf': [1, 3, 6, 9, 12]
}

grid = GridSearchCV(RandomForestRegressor(), param_grid, cv=5, n_jobs=-1)
grid.fit(X_train_scaled, y_train)

best_model = grid.best_estimator_
print(f'The best model was: {best_model}')
print(f'The best scoring was: {grid.best_score_}')

The best model was: RandomForestRegressor(n_estimators=75)
The best scoring was: 0.9635764026382849


In [27]:
# train the model with the data
rf_model = RandomForestRegressor(n_estimators=75)
rf_model.fit(X_train_scaled, y_train)
y_pred = rf_model.predict(X_test_scaled)

In [28]:
# evaluate this model with the varous scorings
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
r2_scoring = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print('The r2 score was: ', r2_scoring)
print('The mean absolute error was: ', mae)
print('The mean squared error was: ', mse)

The r2 score was:  0.956640715616455
The mean absolute error was:  504.5202495912807
The mean squared error was:  5162677.423397943


In [29]:
import joblib

In [30]:
# save the model
joblib.dump(rf_model, r"C:\Users\HP\Documents\Models/rf_model.pkl")

['C:\\Users\\HP\\Documents\\Models/rf_model.pkl']