In [None]:
# Import the required Libraries
import pandas as pd
import numpy as np

# Import data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Import libraries to split the dataset into train data and test data
import sklearn
from sklearn.model_selection import train_test_split

# Import libraries for rescaling the features
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as metrics
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.svm import SVR

from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers import Dropout

from scipy.sparse import csr_matrix

# Import stats to check skew and kurtosis
from scipy import stats

# Import library to ignore the warnings
import warnings
warnings.filterwarnings('ignore')

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
# Dataset Link: https://www.kaggle.com/datasets/lokeshparab/amazon-products-dataset/
# From all the datasets, I chose Amazon-Products.csv as it contains all types of products in a single dataset.
# Read the dataset
ecom_data = pd.read_csv('Amazon-Products.csv')
ecom_data.head()

In [None]:
# Check the shape of the dataset
ecom_data.shape

In [None]:
# Check the datatypes and missing values in the dataset
ecom_data.info()

In [None]:
# Dropping the columns which are having unique values
ecom_data.drop(['Unnamed: 0'], axis=1, inplace=True)
ecom_data.head()

In [None]:
# Remove the Rupee symbol from the columns for easy calculation
ecom_data['actual_price'] = ecom_data['actual_price'].str.replace('₹','')
ecom_data['discount_price'] = ecom_data['discount_price'].str.replace('₹','')

In [None]:
# Check the missing value percentage
round(100*((ecom_data.isnull().sum())/ecom_data.shape[0]).sort_values(ascending=False),2)

In [None]:
# Some rows carry price / delivery text in the rating columns instead of numbers.
# Replace those entries (and empty strings) with NaN so the later numeric
# conversion treats them as missing.
# na=False keeps rows that are already NaN out of the match mask.
bad_ratings = (ecom_data['ratings'].str.contains('₹|Get|FREE', na=False) |
               (ecom_data['ratings'] == ''))
# Series.mask puts NaN wherever the mask is True and keeps every other value as is.
ecom_data['ratings'] = ecom_data['ratings'].mask(bad_ratings)

# Same treatment for the no_of_ratings column.
bad_counts = (ecom_data['no_of_ratings'].str.contains('FREE|Only|This|Usually', na=False) |
              (ecom_data['no_of_ratings'] == ''))
ecom_data['no_of_ratings'] = ecom_data['no_of_ratings'].mask(bad_counts)

In [None]:
# Rename the columns
ecom_data = ecom_data.rename(columns = {'actual_price':'actual_price_in_INR', 'discount_price':'discount_price_in_INR'})

In [None]:
# Drop the columns which are not required for the analysis
ecom_data.drop(['image','link'], axis=1, inplace=True)

In [None]:
# Dropping the rows in actual_price_in_INR column which has NA Value (Products were not given the actual price)
ecom_data.dropna(subset=['actual_price_in_INR'], axis=0, inplace=True)

In [None]:
# Change the column type to int/float
# Replacing the rows with '0' for discount_price_in_INR as discount is not compulsory for every product.
# Replacing the rows with '0' for no_of_ratings as for each product, rating is import but what if the product launched was new and yet to get the ratings.
ecom_data['no_of_ratings'] = ecom_data['no_of_ratings'].str.replace(',','').fillna(0).astype(float).astype(int)
ecom_data['actual_price_in_INR'] = ecom_data['actual_price_in_INR'].str.replace(',','').astype(float).astype(int)
ecom_data['discount_price_in_INR'] = ecom_data['discount_price_in_INR'].str.replace(',','').fillna(0).astype(float).astype(int)

In [None]:
ecom_data = ecom_data[ecom_data['actual_price_in_INR'] >= 0]

In [None]:
# Create new column called "brand"
ecom_data['brand'] = ecom_data['name'].str.split(' ').str[0]

In [None]:
# Create new column called "selling_price_in_INR"
# NOTE(review): this subtracts discount_price_in_INR from actual_price_in_INR, i.e. it treats
# the discount column as a discount *amount*. If the dataset's discount_price is actually the
# discounted (final) price — TODO confirm against the dataset description — this column holds
# the discount amount rather than the selling price.
# Rows with a missing discount were filled with 0 above, so for them the result equals the actual price.
ecom_data['selling_price_in_INR'] = ecom_data['actual_price_in_INR'] - ecom_data['discount_price_in_INR']
ecom_data.head()

In [None]:
ecom_data['ratings'] = pd.to_numeric(ecom_data['ratings'], errors='coerce').astype(float)

In [None]:
ecom_data['ratings'] = ecom_data.groupby(['sub_category', 'brand'])['ratings'].transform(lambda x:x.fillna(x.median()))

In [None]:
ecom_data['ratings'] = ecom_data.ratings.fillna(0)

In [None]:
# Check the missing values in the dataset
ecom_data.isnull().sum().sort_values(ascending=False)

In [None]:
ecom_data.info()

In [None]:
ecom_data.columns.tolist()

In [None]:
ecom_data.size

In [None]:
ecom_data.describe()

In [None]:
# Create 6 evenly spaced bin edges (5 bins) spanning the observed ratings range,
# so that we can calculate demand based on ratings later.
# Use the vectorised Series.min()/max() instead of iterating the Series with the Python builtins.
bins = np.linspace(ecom_data['ratings'].min(), ecom_data['ratings'].max(), 6)
bins

In [None]:
sns.histplot(data=ecom_data, x='ratings', bins=5, color='skyblue', edgecolor='black', element='bars', kde=True)
plt.show()

In [None]:
# Let us check the unique values
ecom_data.nunique().sort_values(ascending=False)

In [None]:
ecom_data.describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.99])

In [None]:
# Calculate the min value for a sub category of a particular brand based on actual price
# The aggregation is passed by name: handing the Python builtin `min` to transform is deprecated in recent pandas.
ecom_data['min_actual'] = ecom_data.groupby(['sub_category', 'brand'])['actual_price_in_INR'].transform('min')

In [None]:
# Calculate the 25% value for a sub category of a particular brand based on actual price
ecom_data['mid_actual'] = ecom_data.groupby(['sub_category', 'brand'])['actual_price_in_INR'].transform(lambda x:x.quantile(0.25))

In [None]:
# Calculate the 75% value for a sub category of a particular brand based on actual price
ecom_data['mod_actual'] = ecom_data.groupby(['sub_category', 'brand'])['actual_price_in_INR'].transform(lambda x:x.quantile(0.75))

In [None]:
# Calculate the min value for a sub category of a particular brand based on discount price
# The aggregation is passed by name: handing the Python builtin `min` to transform is deprecated in recent pandas.
ecom_data['min_discount'] = ecom_data.groupby(['sub_category', 'brand'])['discount_price_in_INR'].transform('min')

In [None]:
# Calculate the 25% value for a sub category of a particular brand based on discount price
ecom_data['mid_discount'] = ecom_data.groupby(['sub_category', 'brand'])['discount_price_in_INR'].transform(lambda x:x.quantile(0.25))

In [None]:
# Calculate the 75% value for a sub category of a particular brand based on discount price
ecom_data['mod_discount'] = ecom_data.groupby(['sub_category', 'brand'])['discount_price_in_INR'].transform(lambda x:x.quantile(0.75))

In [None]:
# Calculate the min value for a sub category of a particular brand based on selling price
# The aggregation is passed by name: handing the Python builtin `min` to transform is deprecated in recent pandas.
ecom_data['min_selling'] = ecom_data.groupby(['sub_category', 'brand'])['selling_price_in_INR'].transform('min')

In [None]:
# Calculate the 25% value for a sub category of a particular brand based on selling price
ecom_data['mid_selling'] = ecom_data.groupby(['sub_category', 'brand'])['selling_price_in_INR'].transform(lambda x:x.quantile(0.25))

In [None]:
# Calculate the 75% value for a sub category of a particular brand based on selling price
ecom_data['mod_selling'] = ecom_data.groupby(['sub_category', 'brand'])['selling_price_in_INR'].transform(lambda x:x.quantile(0.75))

In [None]:
ecom_data.head()

In [None]:
# Function to calculate demand for actual_price_in_INR based on sub_category and brand columns by taking min, 25%, 75%
def actualCategorize(row):
    """Label one row by where its actual price sits inside its (sub_category, brand) group.

    Reads 'actual_price_in_INR', 'min_actual', 'mid_actual' and 'mod_actual' from
    `row` (the notebook fills the last three with the group minimum, 25th and
    75th percentile).

    Returns "Low Demand" when min_actual <= price <= mid_actual,
    "Moderate Demand" when mid_actual < price < mod_actual,
    and "High Demand" in every other case.
    """
    price = row['actual_price_in_INR']
    if row['min_actual'] <= price <= row['mid_actual']:
        return "Low Demand"
    if row['mid_actual'] < price < row['mod_actual']:
        return "Moderate Demand"
    return "High Demand"

In [None]:
# Function to calculate demand for discount_price_in_INR based on sub_category and brand columns by taking min, 25%, 75%
def discountCategorize(row):
    """Label one row by where its discount price sits inside its (sub_category, brand) group.

    Reads 'discount_price_in_INR', 'min_discount', 'mid_discount' and
    'mod_discount' from `row` (the notebook fills the last three with the group
    minimum, 25th and 75th percentile).

    Returns "Low Demand" when min_discount <= price <= mid_discount,
    "Moderate Demand" when mid_discount < price < mod_discount,
    and "High Demand" in every other case.
    """
    price = row['discount_price_in_INR']
    if row['min_discount'] <= price <= row['mid_discount']:
        return "Low Demand"
    if row['mid_discount'] < price < row['mod_discount']:
        return "Moderate Demand"
    return "High Demand"

In [None]:
# Function to calculate demand for selling_price_in_INR based on sub_category and brand columns by taking min, 25%, 75%
def sellingCategorize(row):
    """Label one row by where its selling price sits inside its (sub_category, brand) group.

    Reads 'selling_price_in_INR', 'min_selling', 'mid_selling' and 'mod_selling'
    from `row` (the notebook fills the last three with the group minimum, 25th
    and 75th percentile).

    Returns "Low Demand" when min_selling <= price <= mid_selling,
    "Moderate Demand" when mid_selling < price < mod_selling,
    and "High Demand" in every other case.
    """
    price = row['selling_price_in_INR']
    if row['min_selling'] <= price <= row['mid_selling']:
        return "Low Demand"
    if row['mid_selling'] < price < row['mod_selling']:
        return "Moderate Demand"
    return "High Demand"

In [None]:
# Calculate the demand for actual_price_in_INR based on sub_Category and brand columns
ecom_data['actual_demand'] = ecom_data.apply(actualCategorize, axis=1)

In [None]:
# Calculate the demand for discount_price_in_INR based on sub_Category and brand columns
ecom_data['discount_demand'] = ecom_data.apply(discountCategorize, axis=1)

In [None]:
# Calculate the demand for selling_price_in_INR based on sub_Category and brand columns
ecom_data['selling_demand'] = ecom_data.apply(sellingCategorize, axis=1)

In [None]:
ecom_data.head()

In [None]:
# Dropping the columns as we have calculated the demand based on these columns
ecom_data.drop(['min_actual', 'mid_actual', 'mod_actual', 'min_discount', 'mid_discount', 'mod_discount', 'min_selling', 'mid_selling', 'mod_selling'], axis=1, inplace=True)
ecom_data.head()

In [None]:
# Numerical columns in dataset
# Select by the generic 'number' kind: the columns created with astype(int) are int64 on
# most platforms (int32 only on Windows with NumPy < 2), so listing 'int32' explicitly
# silently drops them elsewhere.
ecom_data_num = list(ecom_data.select_dtypes(include='number').columns)
ecom_data_num

In [None]:
# Categorical columns in dataset
ecom_data_cat = list(ecom_data.select_dtypes(include=['object']).columns)
ecom_data_cat

In [None]:
# Box plot for numerical columns, one panel per column on a 3x3 grid
plt.figure(figsize=(20,30))
for position, column in enumerate(ecom_data_num, start=1):
    plt.subplot(3, 3, position)
    sns.boxplot(x=column, data=ecom_data)

In [None]:
ecom_data.shape

In [None]:
# From the above box plot, actual_price_in_INR, discount_price_in_INR and selling_price_in_INR
# may contain infinite values.
# Replace +/-inf with NaN in those three columns only (the previous form blanked every column
# of a matching row); the dropna in the next cell then removes the affected rows.
price_cols = ['actual_price_in_INR', 'discount_price_in_INR', 'selling_price_in_INR']
ecom_data[price_cols] = ecom_data[price_cols].replace([np.inf, -np.inf], np.nan)

In [None]:
# Dropping the values which are having NaN values
ecom_data.dropna(subset=['actual_price_in_INR', 'discount_price_in_INR', 'selling_price_in_INR'], axis=0, inplace=True)

In [None]:
ecom_data.shape

In [None]:
# Function to cap outliers at the IQR fences and plot the result
def check_outliers(list):
    """Cap each named column of the global ``ecom_data`` at its IQR fences and box-plot it.

    For every column name in `list`, values at or below Q1 - 1.5*IQR are set to
    that lower fence and values at or above Q3 + 1.5*IQR are set to that upper
    fence. The fences are printed and a box plot of the capped column is shown.

    Parameters
    ----------
    list : iterable of str
        Column names of ``ecom_data`` to process. (The name shadows the builtin;
        it is kept so existing callers are unaffected.)

    Returns
    -------
    None. ``ecom_data`` is modified in place.
    """
    for col in list:
        q1 = ecom_data[col].quantile(0.25)
        q3 = ecom_data[col].quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - 1.5 * iqr
        upper_fence = q3 + 1.5 * iqr
        # Assign the clipped column back in one step. The previous chained form
        # `ecom_data[col][mask] = value` writes through a temporary Series and is
        # not guaranteed to reach the DataFrame (it never does under copy-on-write).
        ecom_data[col] = ecom_data[col].clip(lower=lower_fence, upper=upper_fence)
        print("Outliers : ", col, lower_fence, upper_fence)
        plt.figure(1, figsize=(10,5))
        sns.boxplot(x=col, data=ecom_data)
        plt.xticks(rotation=90, fontsize=10)
        plt.show()

In [None]:
check_outliers(ecom_data_num)

In [None]:
# Box plot for numerical columns, one panel per column on a 3x3 grid
plt.figure(figsize=(20,30))
for position, column in enumerate(ecom_data_num, start=1):
    plt.subplot(3, 3, position)
    sns.boxplot(x=column, data=ecom_data)

In [None]:
# Bar plot for main_category and the target variable "selling_price_in_INR"
plt.figure(figsize=(15,10))
sns.barplot(x='main_category', y='selling_price_in_INR', data=ecom_data)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Bar plot for sub_category and the target variable "selling_price_in_INR"
plt.figure(figsize=(15,10))
sns.barplot(x='sub_category', y='selling_price_in_INR', data=ecom_data)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Bar plot for actual_demand and the target variable "selling_price_in_INR"
plt.figure(figsize=(8,5))
sns.barplot(x='actual_demand', y='selling_price_in_INR', data=ecom_data)
plt.show()

In [None]:
# Bar plot for discount_demand and the target variable "selling_price_in_INR"
plt.figure(figsize=(8,5))
sns.barplot(x='discount_demand', y='selling_price_in_INR', data=ecom_data)
plt.show()

In [None]:
# Bar plot for selling_demand and the target variable "selling_price_in_INR"
plt.figure(figsize=(8,5))
sns.barplot(x='selling_demand', y='selling_price_in_INR', data=ecom_data)
plt.show()

In [None]:
# Count plot for sub category and actual demand
plt.figure(figsize=(20,10))
sns.countplot(x='sub_category', hue='actual_demand', data=ecom_data)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Count plot for main category and actual demand as I have calculated the demand based on the sub category and brand for actual_price_in_INR
plt.figure(figsize=(20,10))
sns.countplot(x='main_category', hue='actual_demand', data=ecom_data)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Count plot for main category and discount demand as I have calculated the demand based on the sub category and brand for discount_price_in_INR
plt.figure(figsize=(20,10))
sns.countplot(x='main_category', hue='discount_demand', data=ecom_data)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Count plot for main category and selling demand as I have calculated the demand based on the sub category and brand for discount_price_in_INR
plt.figure(figsize=(20,10))
sns.countplot(x='main_category', hue='selling_demand', data=ecom_data)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Now I chose the top brands and plot the bar graph
# Count the brands once and reuse the result for both the labels and the heights.
top_brands = ecom_data['brand'].value_counts().head(10)
values = top_brands.index.tolist()
counts = top_brands.tolist()

In [None]:
# Bar plot for top 10 brands
plt.figure(figsize=(8,5))
# `values` and `counts` are plain 10-element lists, so `data=` must not be passed:
# seaborn rejects list vectors whose length differs from the DataFrame given as `data`.
sns.barplot(x=values, y=counts)
plt.show()

In [None]:
# Regression plots of the target selling_price_in_INR against each numeric feature.
# Each entry pairs the column to plot on the x axis with the label used in the title and x axis.
regplot_features = [
    ('actual_price_in_INR', "Actual Price"),
    ('discount_price_in_INR', "Discount Price"),
    ('ratings', "Ratings"),
    ('no_of_ratings', "No. of Ratings"),
]

for feature, label in regplot_features:
    plt.figure(figsize=(5,3), dpi=110)
    plt.title(f"{label} vs Selling Price", fontsize=14)
    sns.regplot(data=ecom_data, x=feature, y="selling_price_in_INR", line_kws={'color':'red'})
    plt.xlabel(label)
    plt.show()

In [None]:
# Check the regplot between ratings (x axis) and no_of_ratings (y axis)
plt.figure(figsize=(5,3), dpi=110)
plt.title("No. of Ratings vs Ratings", fontsize=14)
sns.regplot(data=ecom_data, y='no_of_ratings', x="ratings", line_kws={'color':'red'})
plt.xlabel("Ratings")
plt.show()

In [None]:
# Scatter Plot for ratings and no_of_ratings by comparing with actual_demand
plt.figure(figsize=(20,10))
sns.scatterplot(x='no_of_ratings', y='ratings', hue='actual_demand', data=ecom_data)
plt.show()

In [None]:
# Scatter Plot for ratings and no_of_ratings by comparing with discount_demand
plt.figure(figsize=(20,10))
sns.scatterplot(x='no_of_ratings', y='ratings', hue='discount_demand', data=ecom_data)
plt.show()

In [None]:
# Scatter Plot for ratings and no_of_ratings by comparing with selling_demand
plt.figure(figsize=(20,10))
sns.scatterplot(x='no_of_ratings', y='ratings', hue='selling_demand', data=ecom_data)
plt.show()

In [None]:
# Scatter Plot for actual_price_in_INR and selling_price_in_INR by comparing with actual_demand
plt.figure(figsize=(20,10))
sns.scatterplot(x='actual_price_in_INR', y='selling_price_in_INR', hue='actual_demand', data=ecom_data)
plt.show()

In [None]:
# Scatter Plot for discount_price_in_INR and selling_price_in_INR by comparing with discount_demand
plt.figure(figsize=(20,10))
sns.scatterplot(x='discount_price_in_INR', y='selling_price_in_INR', hue='discount_demand', data=ecom_data)
plt.show()

In [None]:
plt.figure(figsize=(20,10))
sns.scatterplot(x='no_of_ratings', y='ratings', data=ecom_data)
plt.show()

In [None]:
# plotting heat map to find correlation among continuous variables
plt.figure(figsize=(10,10))
sns.heatmap(ecom_data[ecom_data_num].corr(), annot=True, cmap="RdYlGn")
plt.show()

In [None]:
# Dropping the name column, and the brand column derived from it: they have a large number of
# unique values and are not used for model building
ecom_data.drop(['name', 'brand'], axis=1, inplace=True)

In [None]:
ecom_data.shape

In [None]:
# Before proceeding further with model building, Get dummies for categorical variables

# One-hot encode main_category (prefix "mc") and cast the indicator columns to int
mc = pd.get_dummies(ecom_data['main_category'], prefix='mc', sparse=True).astype(int)
# Append the indicators to ecom_data and remove the original categorical column
ecom_data = pd.concat([ecom_data, mc], axis=1).drop('main_category', axis=1)

In [None]:
# One-hot encode sub_category (prefix "sc") and cast the indicator columns to int
sc = pd.get_dummies(ecom_data['sub_category'], prefix='sc', sparse=True).astype(int)
# Append the indicators to ecom_data and remove the original categorical column
ecom_data = pd.concat([ecom_data, sc], axis=1).drop('sub_category', axis=1)

In [None]:
#br = pd.get_dummies(ecom_final['brand'], prefix = 'br', sparse=True)
#br = br.astype(int)
# Adding result to master ecom_data
#ecom_final = pd.concat([ecom_final, br], axis=1)
# Dropping original variable
#ecom_final = ecom_data.drop('brand', axis=1)

In [None]:
# One-hot encode actual_demand (prefix "acDmd") and cast the indicator columns to int
acDmd = pd.get_dummies(ecom_data['actual_demand'], prefix='acDmd', sparse=True).astype(int)
# Append the indicators to ecom_data and remove the original categorical column
ecom_data = pd.concat([ecom_data, acDmd], axis=1).drop('actual_demand', axis=1)

In [None]:
# One-hot encode discount_demand (prefix "dsDmd") and cast the indicator columns to int
dsDmd = pd.get_dummies(ecom_data['discount_demand'], prefix='dsDmd', sparse=True).astype(int)
# Append the indicators to ecom_data and remove the original categorical column
ecom_data = pd.concat([ecom_data, dsDmd], axis=1).drop('discount_demand', axis=1)

In [None]:
# One-hot encode selling_demand (prefix "slDmd") and cast the indicator columns to int
slDmd = pd.get_dummies(ecom_data['selling_demand'], prefix='slDmd', sparse=True).astype(int)
# Append the indicators to ecom_data and remove the original categorical column
ecom_data = pd.concat([ecom_data, slDmd], axis=1).drop('selling_demand', axis=1)

In [None]:
ecom_data.head()

In [None]:
ecom_data.shape

In [None]:
#Putting feature variables in X and response/target variables in y
# NOTE(review): X still contains actual_price_in_INR and discount_price_in_INR, and the target
# was computed earlier in this notebook as their difference; the slDmd_* dummies were also
# derived from the target. Scores of the models below may therefore be inflated by leakage —
# confirm that these columns are meant to be predictors.
X = ecom_data.drop(['selling_price_in_INR'], axis=1)
y = ecom_data['selling_price_in_INR']

In [None]:
# Split the dataset into train data and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

In [None]:
# Scaling
scaler = MinMaxScaler()

#ecom_num = ['ratings', 'no_of_ratings', 'actual_price_in_INR', 'discount_price_in_INR', 'selling_price_in_INR']

# Fitting and transforming the scale on train
#ecom_train[ecom_num] = scaler.fit_transform(ecom_train[ecom_num])

#ecom_test[ecom_num] = scaler.transform(ecom_test[ecom_num])

X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

In [None]:
## PCA

In [None]:
# Reduce the dimensionality of the data with PCA, then fit Linear Regression first and move on to the other models
# Import PCA module
from sklearn.decomposition import PCA
# No n_components is given here, so the number of components is not reduced yet;
# this first PCA is only fitted to inspect the explained variance.
pca = PCA(svd_solver='randomized', random_state=42)

In [None]:
#Let us fit the model with PCA
pca.fit(X_train)

#List of PCA components, it would be the same as number of variables
pca.components_

In [None]:
#Let's check the variance ratios
pca.explained_variance_ratio_[:50]

In [None]:
#Visualise the pca variance ration in a bar graph
plt.figure(figsize=(6,5))
plt.bar(range(1,len(pca.explained_variance_ratio_[:50])+1), pca.explained_variance_ratio_[:50])
plt.show()

In [None]:
#Most of the data is in the 0 to 2 range. Let's see the cummulative variance ratio
var_cum = np.cumsum(pca.explained_variance_ratio_)

In [None]:
#We can see most of the data is lying between 0 to 10
#Make the scree plots clearly for choosing the number of PCA
plt.figure(figsize=(8,6))
plt.title('Scree Plots')
plt.xlabel('Number of Components')
plt.ylabel('Cummulative explained variance')
plt.plot(range(1,len(var_cum)+1), var_cum)
plt.show()

We will keep 50 principal components, which together describe about 95% of the variance in the dataset.

In [None]:
#Once again we will apply PCA with components
pca = PCA(n_components=50, random_state=42)

In [None]:
#Let us fit the data
X_train_pca = pca.fit_transform(X_train)
X_train_pca.shape

In [None]:
#Creating the correlation matrix for the principal components
corr_mat = np.corrcoef(X_train_pca.transpose())
corr_mat_nodiag = corr_mat - np.diagflat(corr_mat.diagonal())
print("Max corr: ", corr_mat_nodiag.max(),", min corr: ",corr_mat_nodiag.min(),)

In [None]:
plt.figure(figsize=(20,10))
sns.heatmap(corr_mat, annot=True)
plt.show()

In [None]:
# Project the test data onto the same 50 components that were fitted on the training data
X_test_pca = pca.transform(X_test)
X_test_pca.shape

In [None]:
## Linear Regression

In [None]:
# Perform Linear Regression
lm = LinearRegression()

# Fit a line
lm.fit(X_train_pca, y_train)

In [None]:
# Print the coefficients and intercept
print('Intercepts are: ',lm.intercept_)
print('Coefficients are:',lm.coef_)

In [None]:
# Predict the model with train set
y_train_pred = lm.predict(X_train_pca)

# r2 score for the train model
metrics.r2_score(y_true = y_train, y_pred = y_train_pred)

In [None]:
# Predict the model with test set
y_test_pred = lm.predict(X_test_pca)

# r2 score for the test model
metrics.r2_score(y_true = y_test, y_pred = y_test_pred)

In [None]:
# Create empty results tables.
# The column names must match the keys of the rows added after each model exactly
# (spelling, spacing and case); otherwise the R2 values land in a new column and
# the one declared here stays empty.
train_results = pd.DataFrame(columns=['Algorithm', 'Train R2 Score', 'Train MSE', 'Train RMSE', 'Train MAE'])
test_results = pd.DataFrame(columns=['Algorithm', 'Test R2 Score', 'Test MSE', 'Test RMSE', 'Test MAE'])

In [None]:
# Calculate the scores for Train Dataset
train_r2_score = r2_score(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_mae = mean_absolute_error(y_train, y_train_pred)

# Calculate the scores for Test Dataset
test_r2_score = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_test_pred)

# Add one row per table with the public pd.concat API; `DataFrame._append` is a
# private pandas method and may change without notice.
train_row = {'Algorithm': 'Linear Regression', 'Train R2 Score': train_r2_score, 'Train MSE': train_mse,
             'Train RMSE': train_rmse, 'Train MAE': train_mae}
train_results = pd.concat([train_results, pd.DataFrame([train_row])], ignore_index=True)

test_row = {'Algorithm': 'Linear Regression', 'Test R2 Score': test_r2_score,
            'Test MSE': test_mse, 'Test RMSE': test_rmse, 'Test MAE': test_mae}
test_results = pd.concat([test_results, pd.DataFrame([test_row])], ignore_index=True)

In [None]:
test_results

In [None]:
## Decision Tree

In [None]:
dTree = DecisionTreeRegressor()

In [None]:
dTree.fit(X_train_pca, y_train)

In [None]:
# Predict the model with train set
y_train_pred = dTree.predict(X_train_pca)

# r2 score for the train model
metrics.r2_score(y_true = y_train, y_pred = y_train_pred)

In [None]:
# Predict the model with test set
y_test_pred = dTree.predict(X_test_pca)

In [None]:
# Create the parameter grid for the decision tree
param_grid = {'max_depth': range(5,15,3), 'min_samples_leaf':range(100,200,50), 'min_samples_split':range(100,200,50), 'max_features':range(5,50,5)}
dTree = DecisionTreeRegressor(random_state=10)
# 'r2' is a regression scorer. 'accuracy' is only defined for classifiers, so with a
# continuous target every fold fails to score and the search cannot rank the candidates.
grid_search = GridSearchCV(estimator=dTree, param_grid=param_grid, cv=5, n_jobs=4, verbose=1, scoring='r2', return_train_score=True)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train_pca, y_train)

In [None]:
# Print the best cross-validated score and the hyper parameters that produced it
# (best_score_ is the mean CV value of whatever `scoring` the search was built with)
print("Best CV score", grid_search.best_score_)
print("Best parameters", grid_search.best_params_)

In [None]:
# Model with the best hyperparameters
# Take the values from the fitted search instead of typing them in by hand, so the
# final model always matches what the search actually selected.
dTree_final = DecisionTreeRegressor(random_state=10, **grid_search.best_params_)

In [None]:
dTree_final.fit(X_train_pca, y_train)

In [None]:
# Predict the model with train set
y_train_pred = dTree_final.predict(X_train_pca)

In [None]:
# Predict the model with test set
y_test_pred = dTree_final.predict(X_test_pca)

In [None]:
# Calculate the scores for Train Dataset
train_r2_score = r2_score(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_mae = mean_absolute_error(y_train, y_train_pred)

# Calculate the scores for Test Dataset
test_r2_score = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_test_pred)

# Add one row per table with the public pd.concat API; `DataFrame._append` is a
# private pandas method and may change without notice.
train_row = {'Algorithm': 'Decision Tree', 'Train R2 Score': train_r2_score, 'Train MSE': train_mse,
             'Train RMSE': train_rmse, 'Train MAE': train_mae}
train_results = pd.concat([train_results, pd.DataFrame([train_row])], ignore_index=True)

test_row = {'Algorithm': 'Decision Tree', 'Test R2 Score': test_r2_score,
            'Test MSE': test_mse, 'Test RMSE': test_rmse, 'Test MAE': test_mae}
test_results = pd.concat([test_results, pd.DataFrame([test_row])], ignore_index=True)

In [None]:
train_results

In [None]:
## Random Forest Regressor

In [None]:
parameters = {'max_depth': range(10,30,5), 'max_features':range(5,50,5), 'n_estimators':range(30,90,15), 'min_samples_split':range(50,300,50), 'min_samples_leaf':range(100,400,50)}

In [None]:
# Seed the forest so repeated runs give the same model.
rfr = RandomForestRegressor(random_state=100)
# Build the search over the random-forest grid `parameters`. Without this assignment the
# following cells re-fit and report the decision-tree search that `grid_search` still
# refers to. 'r2' is used because 'accuracy' is not defined for regressors.
# NOTE: the full grid is 4*9*4*5*6 = 4320 candidates (x5 folds); trim `parameters`
# if the run time is not acceptable.
grid_search = GridSearchCV(estimator=rfr, param_grid=parameters, cv=5, n_jobs=-1, scoring='r2', return_train_score=True)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train_pca, y_train)

In [None]:
# Print the best cross-validated score and the hyper parameters that produced it
# (best_score_ is the mean CV value of whatever `scoring` the search was built with)
print("Best CV score", grid_search.best_score_)
print("Best parameters", grid_search.best_params_)

In [None]:
# Model with the best hyperparameters
# The keyword values were left empty, which is a syntax error. Take them from the
# fitted search instead of typing them in by hand.
# NOTE(review): `grid_search` must be the random-forest search at this point — confirm
# it was rebuilt for the forest before relying on these values.
rf_final = RandomForestRegressor(n_jobs=-1, random_state=100, **grid_search.best_params_)

In [None]:
#rf_final.fit(X_train_pca, y_train)
rfr.fit(X_train_pca, y_train)

In [None]:
# Predict the model with train set
y_train_pred = rfr.predict(X_train_pca)

In [None]:
# Predict the model with test set
y_test_pred = rfr.predict(X_test_pca)

In [None]:
# Calculate the scores for Train Dataset
train_r2_score = r2_score(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_mae = mean_absolute_error(y_train, y_train_pred)

# Calculate the scores for Test Dataset
test_r2_score = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_test_pred)

# Add one row per table with the public pd.concat API; `DataFrame._append` is a
# private pandas method and may change without notice.
train_row = {'Algorithm': 'Random Forest', 'Train R2 Score': train_r2_score, 'Train MSE': train_mse,
             'Train RMSE': train_rmse, 'Train MAE': train_mae}
train_results = pd.concat([train_results, pd.DataFrame([train_row])], ignore_index=True)

test_row = {'Algorithm': 'Random Forest', 'Test R2 Score': test_r2_score,
            'Test MSE': test_mse, 'Test RMSE': test_rmse, 'Test MAE': test_mae}
test_results = pd.concat([test_results, pd.DataFrame([test_row])], ignore_index=True)

In [None]:
## XGBoosting

In [None]:
# Fit the XGBRegressor 
XGb = XGBRegressor(learning_rate=0.1, n_estimators=140, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective='reg:squarederror', nthread=4, scale_pos_weight=1, seed=27)

In [None]:
XGb.fit(X_train_pca, y_train)

In [None]:
# lets tune the hyper parameters
# Only XGBoost parameters belong in the grid. min_samples_leaf and max_features are
# scikit-learn tree options that XGBRegressor does not use, so they only multiplied
# the number of fits (by 5 * 9) without changing any model.
param_t1 = {'max_depth':range(3,10,2), 'min_child_weight':range(1,6,2), 'gamma': [i/10.0 for i in range(0,5)]}
# 'r2' is a regression scorer; 'accuracy' is only defined for classifiers.
gsearch1 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1, n_estimators=140, gamma=0, subsample=0.8, objective='reg:squarederror', nthread=4, scale_pos_weight=1, seed=27), param_grid=param_t1, scoring='r2', n_jobs=4, cv=5)
gsearch1.fit(X_train_pca, y_train)

In [None]:
# Check the score and n_estimators
print("The best features: {}".format(gsearch1.best_params_))
print("The best score: {}".format(gsearch1.best_score_))

In [None]:
# Final model for Gradient Boosting
# The tuned keyword values were left empty, which is a syntax error. Start from the
# fixed settings the search estimator was built with and overlay the values the
# search selected (a dict merge avoids passing the same keyword twice).
xgb_final_params = {'learning_rate': 0.1, 'n_estimators': 140, 'subsample': 0.8,
                    'objective': 'reg:squarederror', 'nthread': 4, 'scale_pos_weight': 1, 'seed': 27}
xgb_final_params.update(gsearch1.best_params_)
XGb_final = XGBRegressor(**xgb_final_params)
XGb_final.fit(X_train_pca, y_train)

In [None]:
# Predict the model with train set
y_train_pred = XGb_final.predict(X_train_pca)

In [None]:
# Predict the model with test set
y_test_pred = XGb_final.predict(X_test_pca)

In [None]:
# Calculate the scores for Train Dataset
train_r2_score = r2_score(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_mae = mean_absolute_error(y_train, y_train_pred)

# Calculate the scores for Test Dataset
test_r2_score = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_test_pred)

# Add one row per table with the public pd.concat API; `DataFrame._append` is a
# private pandas method and may change without notice.
train_row = {'Algorithm': 'XGBoosting', 'Train R2 Score': train_r2_score, 'Train MSE': train_mse,
             'Train RMSE': train_rmse, 'Train MAE': train_mae}
train_results = pd.concat([train_results, pd.DataFrame([train_row])], ignore_index=True)

test_row = {'Algorithm': 'XGBoosting', 'Test R2 Score': test_r2_score,
            'Test MSE': test_mse, 'Test RMSE': test_rmse, 'Test MAE': test_mae}
test_results = pd.concat([test_results, pd.DataFrame([test_row])], ignore_index=True)

In [None]:
## SVM

In [None]:
sv = SVR(C=1, kernel='linear')
sv.fit(X_train_pca, y_train)

In [None]:
# Predict the model with train set
y_train_pred = sv.predict(X_train_pca)

In [None]:
# Predict the model with test set
y_test_pred = sv.predict(X_test_pca)

In [None]:
# Candidate values for the SVR regularisation strength C
params = {"C": [0.1,1,10,100,1000]}
svm = SVR(kernel='linear')

# 'r2' is a valid regression scorer; the previous value was a misspelt classification
# metric, which makes fit() fail with an invalid-scoring error.
model_cv= GridSearchCV(estimator=svm, param_grid=params, scoring='r2', cv=5, verbose=1, n_jobs=4, return_train_score=True)
model_cv.fit(X_train_pca, y_train)

In [None]:
# Check the score
print("The best params: {}".format(model_cv.best_params_))
print("The best score: {}".format(model_cv.best_score_))

In [None]:
# Refit with the best C found by the grid search. The value was left empty, which is a
# syntax error; the linear kernel is set explicitly so the final model matches the one
# that was searched (SVR defaults to an RBF kernel otherwise).
sv_final = SVR(kernel='linear', C=model_cv.best_params_['C'])
sv_final.fit(X_train_pca, y_train)

In [None]:
# Predict the model with train set
# An estimator is not callable; predictions come from its predict() method.
y_train_pred = sv_final.predict(X_train_pca)

In [None]:
# Predict the model with test set
# An estimator is not callable; predictions come from its predict() method.
y_test_pred = sv_final.predict(X_test_pca)

In [None]:
# Calculate the scores for Train Dataset
train_r2_score = r2_score(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_mae = mean_absolute_error(y_train, y_train_pred)

# Calculate the scores for Test Dataset
test_r2_score = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_test_pred)

# Add one row per table with the public pd.concat API; `DataFrame._append` is a
# private pandas method and may change without notice.
train_row = {'Algorithm': 'SVM', 'Train R2 Score': train_r2_score, 'Train MSE': train_mse,
             'Train RMSE': train_rmse, 'Train MAE': train_mae}
train_results = pd.concat([train_results, pd.DataFrame([train_row])], ignore_index=True)

test_row = {'Algorithm': 'SVM', 'Test R2 Score': test_r2_score,
            'Test MSE': test_mse, 'Test RMSE': test_rmse, 'Test MAE': test_mae}
test_results = pd.concat([test_results, pd.DataFrame([test_row])], ignore_index=True)

In [None]:
## LSTM

In [None]:
ecom_num = ['ratings', 'no_of_ratings', 'actual_price_in_INR','discount_price_in_INR','selling_price_in_INR']

In [None]:
data = ecom_data[ecom_num]

In [None]:
# Scale a copy of the numeric columns into `data`; ecom_data itself is left untouched.
# Overwriting ecom_data in place made this cell non-repeatable and meant the scaler fitted
# on selling_price_in_INR in the next cell saw already-scaled values, so its
# inverse_transform could no longer return prices in INR.
scaler = MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(ecom_data[ecom_num]), columns=ecom_num, index=ecom_data.index)

In [None]:


# Preprocess the data
scaler = MinMaxScaler(feature_range=(0, 1))
data_scaled = scaler.fit_transform(ecom_data[['selling_price_in_INR']])

# Create sequences for LSTM
sequence_length = 10
sequences = [data_scaled[i:i+sequence_length+1] for i in range(len(data_scaled)-sequence_length)]

# Convert sequences to arrays
sequences = np.array(sequences)

# Split the data into features (X) and target variable (y)
X = sequences[:, :-1]
y = sequences[:, -1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape data for LSTM (assuming univariate time series)
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Build the LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Make predictions
y_pred = model.predict(X_test)

# Reverse scaling for both true and predicted values
y_test = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred = scaler.inverse_transform(y_pred)

# Calculate R-squared (R2) score
r2 = r2_score(y_test, y_pred)
print(f'R-squared (R2) score: {r2:.4f}')


In [None]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

In [None]:
print('MSE', mse)
print('RMSE', rmse)
print('MAE', mae)

In [None]:
# Feed Forward Neural Networks

In [None]:
fnn_model = Sequential()

In [None]:
fnn_model.add(Dense(64, input_dim=X_train_pca.shape[1], activation='relu'))

In [None]:
fnn_model.add(Dense(32, activation='relu'))

In [None]:
fnn_model.add(Dense(1))

In [None]:
fnn_model.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
fnn_model.fit(X_train_pca, y_train, epochs=10, batch_size=32, validation_split=0.1)

In [None]:
fnn_predictions = fnn_model.predict(X_test_pca)
fnn_mse = mean_squared_error(y_test, fnn_predictions)
fnn_rmse = np.sqrt(fnn_mse)
fnn_r2 = r2_score(y_test, fnn_predictions)
fnn_mae = mean_absolute_error(y_test, fnn_predictions)
print(f'FNN MSE: {fnn_mse}')
print(f'FNN RMSE: {fnn_rmse}')
print(f'FNN r2: {fnn_r2}')
print(f'FNN MAE: {fnn_mae}')