In [1]:
import pandas as pd

In [2]:
# Locate the cleaned Amazon dataset. The directory defaults to the original
# location but can be overridden with the AMAZON_DATA_DIR environment variable,
# so the notebook is not tied to a single machine. pathlib joins the parts with
# the platform's separator instead of mixing '\' and '/' by hand.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get('AMAZON_DATA_DIR', r'C:\Users\HP\Documents\my-data'))
# Kept as a plain string so any later string handling of file_path still works.
file_path = str(DATA_DIR / 'amazon_cleaned.csv')

In [3]:
# Load the CSV found at `file_path` into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,product_id,product_name,discounted_price,actual_price,discount_percentage,rating,rating_count,Main category,Sub category
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,399.0,1099.0,0.64,4.2,24269.0,Computers & Accessories,Accessories & Peripherals
1,B098NS6PVG,Ambrane Unbreakable 60W / 3A Fast Charging 1.5...,199.0,349.0,0.43,4.0,43994.0,Computers & Accessories,Accessories & Peripherals
2,B096MSW6CT,Sounce Fast Phone Charging Cable & Data Sync U...,199.0,1899.0,0.9,3.9,7928.0,Computers & Accessories,Accessories & Peripherals
3,B08HDJ86NZ,boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...,329.0,699.0,0.53,4.2,94363.0,Computers & Accessories,Accessories & Peripherals
4,B08CF3B7N1,Portronics Konnect L 1.2M Fast Charging 3A 8 P...,154.0,399.0,0.61,4.2,16905.0,Computers & Accessories,Accessories & Peripherals


In [5]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import accuracy_score, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [10]:
# Count the missing values in each column.
df.isnull().sum()

product_id             0
product_name           0
discounted_price       0
actual_price           0
discount_percentage    0
rating                 0
rating_count           0
Main category          0
Sub category           0
dtype: int64

In [11]:
# Fill missing rating_count values with the column's most frequent value.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected Series: the in-place form is a chained
# assignment, which pandas warns about and which no longer updates `df` once
# Copy-on-Write is enabled.
most_common_count = df['rating_count'].mode()[0]
df['rating_count'] = df['rating_count'].fillna(most_common_count)

In [12]:
# Re-check the per-column null counts after the fill.
df.isnull().sum()

product_id             0
product_name           0
discounted_price       0
actual_price           0
discount_percentage    0
rating                 0
rating_count           0
Main category          0
Sub category           0
dtype: int64

In [13]:
# Build the feature matrix and the target for the price models.
feature_columns = ['rating', 'rating_count', 'actual_price']
X = df[feature_columns]
y = df['discounted_price']

# Split X and y: 75% of the rows go to training, the remainder to testing.
# The seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=28,
)

In [14]:
# Fit a plain linear regression on the training split (fit() returns the
# estimator itself) and predict the held-out rows.
lr_model = LinearRegression().fit(X_train, y_train)
y1_pred = lr_model.predict(X_test)

In [15]:
# Score the linear regression predictions against the test targets.
r2_scoring = r2_score(y_true=y_test, y_pred=y1_pred)
print('The r2 score for the Linear Regression model was: ', r2_scoring)

The r2 score for the Linear Regression model was:  0.8888176340484897


In [16]:
# Fit the Lasso and Ridge variants (both with alpha=1) on the same training
# split, then predict the held-out rows with each.
lasso = Lasso(alpha=1).fit(X_train, y_train)
ridge = Ridge(alpha=1).fit(X_train, y_train)

lasso_ypred = lasso.predict(X_test)
ridge_ypred = ridge.predict(X_test)

In [17]:
# Compute the r2 score of each regularised model on the test split and report both.
lasso_r2, ridge_r2 = (
    r2_score(y_test, predictions) for predictions in (lasso_ypred, ridge_ypred)
)
print('The r2 score for Lasso model was: ', lasso_r2)
print('The r2 score for the Ridge model was: ', ridge_r2)

The r2 score for Lasso model was:  0.8888403917062356
The r2 score for the Ridge model was:  0.8888219760762661


In [18]:
# Second experiment: the same features and target, to be standardised with a
# StandardScaler before fitting.
X2 = df.loc[:, ['rating', 'rating_count', 'actual_price']]
y2 = df.loc[:, 'discounted_price']
scaler = StandardScaler()

In [19]:
# Split the second copy of the data with the same seed as the first experiment.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    random_state=28,
)
# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both splits.
scaler.fit(X2_train)
X2_train_scaled = scaler.transform(X2_train)
X2_test_scaled = scaler.transform(X2_test)

In [20]:
# Fit the three model types again, this time on the standardised features.
# Separate estimator instances are used so that lr_model, lasso and ridge keep
# their raw-feature fits. Re-fitting those objects in place would turn the
# `lasso` that is saved later into a model that expects standardised inputs,
# i.e. one that is unusable without the scaler and is not the best-scoring fit.
lr_scaled = LinearRegression().fit(X2_train_scaled, y2_train)
lasso_scaled = Lasso(alpha=1).fit(X2_train_scaled, y2_train)
ridge_scaled = Ridge(alpha=1).fit(X2_train_scaled, y2_train)
# Predict the held-out rows with each scaled-feature model.
lr_y2pred = lr_scaled.predict(X2_test_scaled)
lasso_y2pred = lasso_scaled.predict(X2_test_scaled)
ridge_y2pred = ridge_scaled.predict(X2_test_scaled)
# Evaluate the models with the r2 score on the test targets.
lr_r2score = r2_score(y2_test, lr_y2pred)
lasso_r2score = r2_score(y2_test, lasso_y2pred)
ridge_r2score = r2_score(y2_test, ridge_y2pred)
print('The scaled Linear Regression model r2 score was: ', lr_r2score)
print('The scaled Lasso model r2 score was: ', lasso_r2score)
print('The scaled Ridge model r2 score was: ', ridge_r2score)

The scaled Linear Regression model r2 score was:  0.8888176340484897
The scaled Lasso model r2 score was:  0.8888216119461667
The scaled Ridge model r2 score was:  0.8888022411879632


In [21]:
# The Lasso model had the best r2 score, so it is written to disk for the
# Streamlit app to load.
# NOTE(review): confirm which fit `lasso` holds at this point (raw or
# standardised features) - a model fit on standardised inputs needs the
# matching scaler at prediction time.
import joblib
joblib.dump(value=lasso, filename='lasso_model.pkl')

['lasso_model.pkl']