In [None]:
#checking for outliers

import pandas as pd

# Read the raw location workbook into a DataFrame
file_path = 'location data.xlsx'
data = pd.read_excel(io=file_path)

# Define a function to detect outliers using z-score method
def detect_outliers_zscore(data, threshold=3):
    """Flag rows that contain at least one value with |z-score| above ``threshold``.

    Z-scores are computed column by column (axis 0).

    Parameters
    ----------
    data : pandas.DataFrame or 2-D array-like
        Numeric table; every column must be convertible to float.
    threshold : float, default 3
        A row is flagged when any of its absolute z-scores is strictly
        greater than this value.

    Returns
    -------
    pandas.Series or numpy.ndarray
        Boolean mask with one entry per row. For a DataFrame input it is a
        Series sharing ``data.index``; otherwise a plain boolean array.
    """
    # Imported locally so the function does not depend on imports made in
    # other notebook cells.
    import numpy as np
    from scipy.stats import zscore

    values = np.asarray(data, dtype=float)
    # nan_policy='omit' derives each column's mean/std from its non-missing
    # values. The default ('propagate') turns the whole column into NaN as
    # soon as one cell is missing, which silently hides that column's outliers.
    z_scores = zscore(values, nan_policy='omit')
    # NaN z-scores (missing cells, constant columns) compare as False here,
    # so they never flag a row on their own.
    outliers = (np.abs(z_scores) > threshold).any(axis=1)
    if isinstance(data, pd.DataFrame):
        # Keep the mask aligned with the caller's index for boolean selection.
        return pd.Series(outliers, index=data.index)
    return outliers

# Run the z-score check on the numeric columns only
numerical_columns = data.select_dtypes(include=['number']).columns
outliers = detect_outliers_zscore(data.loc[:, numerical_columns])

# Show every row that was flagged
outlier_rows = data.loc[outliers]
print("Rows with outliers:", outlier_rows, sep="\n")


Rows with outliers:
     Average store monthly revenue  \
2                         0.083066   
5                         0.085778   
7                         0.025031   
9                         0.079767   
12                        0.041694   
..                             ...   
735                       0.800919   
737                       0.548154   
738                       0.387109   
739                       0.668827   
741                       0.559818   

     Number of competitive top jewellery brand stores located within the same shopping mall  \
2                                                    0                                        
5                                                    0                                        
7                                                    1                                        
9                                                    1                                        
12                                              

In [None]:
# removing outliers
import pandas as pd
import numpy as np
from scipy import stats

# Load the dataset
file_path = 'location data.xlsx'
data = pd.read_excel(file_path)

# Remove outliers based on z-score.
# Only numeric columns can be z-scored; passing the whole frame makes
# stats.zscore raise as soon as a text or date column is present.
numeric_data = data.select_dtypes(include='number')
# nan_policy='omit' keeps a single missing cell from turning its whole column
# into NaN z-scores (which would hide every outlier in that column).
z_scores = np.abs(stats.zscore(numeric_data.to_numpy(dtype=float), nan_policy='omit'))
threshold = 3
# One boolean per row: True when any column exceeds the threshold.
outlier_mask = (z_scores > threshold).any(axis=1)
# Positional row numbers of the flagged rows (kept for inspection).
outlier_rows = np.flatnonzero(outlier_mask)
# Filter with the boolean mask rather than data.drop(positions): drop()
# interprets its argument as index *labels*, which only coincide with row
# positions while the frame still has its default 0..n-1 index.
cleaned_data_zscore = data.loc[~outlier_mask]


# Print summary statistics after outlier handling
print("\nCleaned Dataset (Z-score):")
print(cleaned_data_zscore.describe())

# Save the cleaned dataset to a new file if needed
cleaned_data_zscore.to_excel('cleaned_data_zscore.xlsx', index=False)




Cleaned Dataset (Z-score):
       Average store monthly revenue  \
count                   4.130000e+02   
mean                    1.997432e-01   
std                     1.417943e-01   
min                     6.244722e-12   
25%                     1.011212e-01   
50%                     1.610950e-01   
75%                     2.515438e-01   
max                     6.941368e-01   

       Number of competitive top jewellery brand stores located within the same shopping mall  \
count                                         413.000000                                        
mean                                            2.062954                                        
std                                             1.666608                                        
min                                             0.000000                                        
25%                                             1.000000                                        
50%                          

In [None]:
#standardizing

from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load dataset from a file
file_path = 'cleaned_data_zscore.xlsx'
data = pd.read_excel(file_path)

# Separate features
# Assuming the target variable is in the first column
# NOTE(review): the upper bound 104 is hard-coded; columns beyond it (if any)
# are silently left out -- confirm against the actual column count.
X = data.iloc[:, 1:104]  # Feature
print("features: ",X)

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler to the features and transform the features
# NOTE(review): the scaler is fitted on every row, before the train/test
# splits performed in the modelling cells.
X_scaled = scaler.fit_transform(X)


# Write the standardized features back to a file.
# fit_transform returns a bare array, so the original column names are
# re-attached here; otherwise the saved sheet only carries 0..n-1 headers and
# the features can no longer be identified.
standardized_file_path = 'standardized_dataset.xlsx'
X_standardized = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
X_standardized.to_excel(standardized_file_path, index=False, header=True)

print("Dataset standardized and saved to", standardized_file_path)


features:       Number of competitive top jewellery brand stores located within the same shopping mall  \
0                                                    1                                        
1                                                    4                                        
2                                                    1                                        
3                                                    1                                        
4                                                    0                                        
..                                                 ...                                        
408                                                  0                                        
409                                                  1                                        
410                                                  4                                        
411                                    

In [None]:
# PCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load dataset
file_path = 'standardized_dataset.xlsx'
data = pd.read_excel(file_path)
X = data.iloc[:, :]  # Features

# Standardize the features
# (the input file already holds StandardScaler output, so this pass leaves the
# values essentially unchanged; it is kept so the cell also works on raw input)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize PCA with a fixed number of components
pca = PCA(n_components=20)

# Fit and transform the data to the chosen number of principal components
X_pca = pca.fit_transform(X_scaled)
# Now X_pca contains the transformed dataset with reduced dimensions
# the explained variance ratio of each principal component
print("Explained variance ratio:", pca.explained_variance_ratio_)

# the cumulative explained variance ratio
print("Cumulative explained variance ratio:", sum(pca.explained_variance_ratio_))

# convert the transformed data back to a DataFrame
transformed_data = pd.DataFrame(X_pca)

# Re-attach the target. The standardized file contains features only, but
# every modelling cell reads 'Average store monthly revenue' from the PCA
# file; without this column those cells fail with a KeyError on a fresh run.
# 'cleaned_data_zscore.xlsx' is the file the standardized features were
# derived from, row for row, so the target is aligned by position.
target_column = 'Average store monthly revenue'
target_source = pd.read_excel('cleaned_data_zscore.xlsx')
if len(target_source) != len(transformed_data):
    raise ValueError(
        "cleaned_data_zscore.xlsx and standardized_dataset.xlsx have different "
        "row counts; cannot align the target with the PCA components"
    )
transformed_data[target_column] = target_source[target_column].to_numpy()

transformed_data.to_excel('pca_transformed_dataset.xlsx', index=False)
print("PCA transformation complete and saved to pca_transformed_dataset.xlsx")


Explained variance ratio: [0.23405748 0.11178556 0.07068419 0.05446744 0.03685824 0.02951672
 0.02918121 0.02607549 0.0229531  0.02138522 0.017416   0.01597121
 0.01525486 0.01379396 0.01310589 0.01247373 0.01174425 0.0112781
 0.01070375 0.00998654]
Cumulative explained variance ratio: 0.7686929458537903
PCA transformation complete and saved to pca_transformed_dataset.xlsx


In [None]:
# PCR
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# Read the table of principal components written by the PCA step
file_path = 'pca_transformed_dataset.xlsx'
data = pd.read_excel(file_path)

# 'Average store monthly revenue' is the response; all other columns are predictors
y = data['Average store monthly revenue']
X = data.drop(columns='Average store monthly revenue')

# Hold out 20% of the rows for evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=45
)

# Least-squares regression on the components, then score the hold-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate NDCG
def get_dcg(y_pred, y_true, k):
    """Discounted cumulative gain of ``y_true`` ranked by descending ``y_pred``.

    The item placed at 1-based rank ``i`` contributes ``y_true_i / log2(i + 1)``;
    only the ``k`` highest-scored items are counted.

    Parameters
    ----------
    y_pred : array-like
        Scores used solely to order the items.
    y_true : array-like
        Gains, paired with ``y_pred`` by position.
    k : int
        Cut-off applied as a slice (``[:k]``), so a value larger than the
        input length simply keeps every item.

    Returns
    -------
    float
        DCG@k.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    # Strip any pandas index first: the inputs are paired by position, and two
    # Series carrying different indexes would otherwise be re-aligned by label,
    # producing NaN-padded rows and a wrong (or failing) result.
    scores = np.asarray(y_pred, dtype=float).ravel()
    gains = np.asarray(y_true, dtype=float).ravel()
    if scores.size != gains.size:
        raise ValueError("y_pred and y_true must have the same length")
    # Stable sort of the negated scores = descending order, ties in input order.
    top = np.argsort(-scores, kind="stable")[:k]
    discounts = np.log2(np.arange(1, top.size + 1) + 1)
    return np.sum(gains[top] / discounts)

def get_ndcg(y_pred, y_true, k):
    """Normalised DCG at cut-off ``k``.

    Divides the DCG obtained when ranking by ``y_pred`` by the DCG obtained
    when ranking by ``y_true`` itself (the best achievable ordering).

    Parameters
    ----------
    y_pred : array-like
        Predicted scores used for ranking.
    y_true : array-like
        True gains, paired with ``y_pred`` by position.
    k : int
        Number of top-ranked items taken into account.

    Returns
    -------
    float
        ``dcg / idcg``; ``0.0`` when the ideal DCG is zero.
    """
    dcg = get_dcg(y_pred, y_true, k)
    # Ranking the true values by themselves yields the ideal ordering.
    idcg = get_dcg(y_true, y_true, k)
    # With all-zero gains (or an empty selection) the ratio would be 0/0 = NaN;
    # report 0.0 instead, the same convention sklearn's ndcg_score follows.
    if idcg == 0:
        return 0.0
    return dcg / idcg

# Ranking quality over the 30 highest-predicted hold-out rows
ndcg = get_ndcg(y_pred=y_pred, y_true=y_test, k=30)

# Root of the mean squared error on the hold-out rows
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Calculate MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (not multiplied by 100).

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, paired with ``y_true`` by position.

    Returns
    -------
    float
        Mean of ``|y_true - y_pred| / max(|y_true|, eps)``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Clamp the denominator to machine epsilon so an observed value of exactly
    # zero cannot cause a division by zero (NaN/inf result). The metric is
    # unchanged for every non-zero target.
    denominator = np.maximum(np.abs(y_true), np.finfo(np.float64).eps)
    return np.mean(np.abs(y_true - y_pred) / denominator)

mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)

# Report the three hold-out metrics, one per line
for label, value in (("RMSE:", rmse), ("MAPE:", mape), ("NDCG:", ndcg)):
    print(label, value)


RMSE: 0.10540784436078021
MAPE: 0.6308170235972347
NDCG: 0.8268976898422989


In [None]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Pull in the principal-component table saved by the PCA step
file_path = 'pca_transformed_dataset.xlsx'
data = pd.read_excel(file_path)

# Response column vs. predictor columns
y = data['Average store monthly revenue']
X = data.drop(columns=['Average store monthly revenue'])

# 80/20 train/test partition with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=45,
)

# 100-tree random forest, seeded so the fit is repeatable
model = RandomForestRegressor(n_estimators=100, random_state=44).fit(X_train, y_train)

# Score the hold-out rows
y_pred = model.predict(X_test)

# Calculate NDCG
def get_dcg(y_pred, y_true, k):
    """Discounted cumulative gain of ``y_true`` ordered by descending ``y_pred``.

    Each of the ``k`` best-scored items adds ``y_true / log2(rank + 1)`` with
    ``rank`` starting at 1.

    Parameters
    ----------
    y_pred : array-like
        Scores that define the ranking.
    y_true : array-like
        Gains, matched to ``y_pred`` by position.
    k : int
        Cut-off; applied as a slice, so values beyond the input length keep
        all items.

    Returns
    -------
    float
        DCG@k.
    """
    # Convert to plain arrays before building the frame. Passing Series
    # directly lets pandas align the two columns by index label, which breaks
    # the positional pairing whenever the indexes differ.
    df = pd.DataFrame({
        "y_pred": np.asarray(y_pred, dtype=float).ravel(),
        "y_true": np.asarray(y_true, dtype=float).ravel(),
    })
    # mergesort is stable, so tied scores keep their input order.
    df = df.sort_values(by="y_pred", ascending=False, kind="mergesort")
    df = df.iloc[0:k, :]
    ranks = np.arange(1, df["y_true"].count() + 1)
    dcg = df["y_true"].to_numpy() / np.log2(ranks + 1)
    return np.sum(dcg)

def get_ndcg(y_pred, y_true, k):
    """Return NDCG@k: DCG of the predicted ranking over DCG of the ideal ranking.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores used to rank the items.
    y_true : array-like
        True gains, matched to ``y_pred`` by position.
    k : int
        Number of top-ranked items considered.

    Returns
    -------
    float
        ``dcg / idcg``, or ``0.0`` when the ideal DCG equals zero.
    """
    # The ideal ordering is obtained by ranking the true values by themselves.
    idcg = get_dcg(y_true, y_true, k)
    if idcg == 0:
        # Avoid 0/0 (NaN) when every gain in the selection is zero.
        return 0.0
    return get_dcg(y_pred, y_true, k) / idcg

# NDCG restricted to the top 30 predictions
ndcg = get_ndcg(y_pred, y_test, 30)

# RMSE = square root of the hold-out mean squared error
squared_error = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(squared_error)

# Calculate MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error as a fraction (0.25 means 25 %).

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predictions, matched to ``y_true`` by position.

    Returns
    -------
    float
        Average of ``|y_true - y_pred| / max(|y_true|, eps)``.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # A target of exactly zero would make the plain ratio divide by zero;
    # flooring the denominator at machine epsilon keeps the result defined
    # without altering it for non-zero targets.
    floor = np.finfo(np.float64).eps
    relative_error = np.abs(actual - predicted) / np.maximum(np.abs(actual), floor)
    return np.mean(relative_error)

mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)

# Print each metric on its own line, label first
report = {"RMSE:": rmse, "MAPE:": mape, "NDCG:": ndcg}
for label, value in report.items():
    print(label, value)


RMSE: 0.10861607374670373
MAPE: 0.6883775042558138
NDCG: 0.811753386994917


In [None]:
# Lasso
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from sklearn import metrics

from sklearn.model_selection import train_test_split

# Load the principal components produced by the PCA step
file_path = 'pca_transformed_dataset.xlsx'
data = pd.read_excel(file_path)

# Split columns into response and predictors
y = data['Average store monthly revenue']
X = data.drop(labels='Average store monthly revenue', axis=1)

# Reserve 20% of the rows for testing (seeded)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=45
)

# L1-regularised linear regression; alpha sets the penalty strength
alpha = 0.003
model = Lasso(alpha=alpha).fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = model.predict(X_test)

# Calculate NDCG
def get_dcg(y_pred, y_true, k):
    """Compute DCG@k: gains of the ``k`` best-scored items, log2-discounted by rank.

    The item at 1-based rank ``i`` (ordering by descending ``y_pred``)
    contributes ``y_true_i / log2(i + 1)``.

    Parameters
    ----------
    y_pred : array-like
        Ranking scores.
    y_true : array-like
        Gains, matched to ``y_pred`` by position.
    k : int
        Slice-style cut-off on the ranked list.

    Returns
    -------
    float
        The discounted cumulative gain.

    Raises
    ------
    ValueError
        If ``y_pred`` and ``y_true`` have different lengths.
    """
    # Use positional arrays: building a DataFrame straight from two Series
    # aligns them on their index labels, which corrupts the pairing when the
    # indexes are not identical.
    pred = np.asarray(y_pred, dtype=float).ravel()
    true = np.asarray(y_true, dtype=float).ravel()
    if pred.shape != true.shape:
        raise ValueError("y_pred and y_true must have the same length")
    # Descending order by score; the stable sort keeps ties in input order.
    ranked_gains = true[np.argsort(-pred, kind="stable")][:k]
    positions = np.arange(1, ranked_gains.size + 1)
    return np.sum(ranked_gains / np.log2(positions + 1))

def get_ndcg(y_pred, y_true, k):
    """NDCG@k: DCG of the ranking induced by ``y_pred`` relative to the ideal DCG.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores used to rank the items.
    y_true : array-like
        True gains, matched to ``y_pred`` by position.
    k : int
        Number of top-ranked items considered.

    Returns
    -------
    float
        ``dcg / idcg``; ``0.0`` if the ideal DCG is zero.
    """
    dcg = get_dcg(y_pred, y_true, k)
    # Best possible DCG: rank the items by their true values.
    idcg = get_dcg(y_true, y_true, k)
    # Guard the 0/0 case (all gains zero) instead of returning NaN.
    ndcg = dcg / idcg if idcg != 0 else 0.0
    return ndcg

# NDCG over the 30 highest predictions
ndcg = get_ndcg(y_pred=y_pred, y_true=y_test, k=30)

# RMSE on the hold-out rows
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# MAPE via scikit-learn's implementation
mape = metrics.mean_absolute_percentage_error(y_test, y_pred)

# One "label value" line per metric
for label, value in zip(("RMSE:", "MAPE:", "NDCG:"), (rmse, mape, ndcg)):
    print(label, value)



RMSE: 0.10458871271265284
MAPE: 0.6460013765255291
NDCG: 0.8302418629451971


In [None]:
#GBDT
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np
from sklearn.metrics import ndcg_score

# Load data from Excel file with PCA-transformed features
excel_file_path = 'pca_transformed_dataset.xlsx'
data_pca = pd.read_excel(excel_file_path)

# Assuming target column is named 'Average store monthly revenue'
X_pca = data_pca.drop('Average store monthly revenue', axis=1)
y_pca = data_pca['Average store monthly revenue']

# Split the PCA-transformed data into training and testing sets.
# random_state=45 is the seed the other model cells use, so every model is
# evaluated on the same hold-out rows and the metrics are comparable.
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y_pca, test_size=0.2, random_state=45)

# Train Gradient Boosting model on PCA-transformed data
# (seeded so the fit, and therefore the printed metrics, are reproducible)
model_pca = GradientBoostingRegressor(random_state=45)
model_pca.fit(X_train_pca, y_train_pca)

# Make predictions on the test set
predictions_pca = model_pca.predict(X_test_pca)

# Calculate metrics on the PCA-transformed data

rmse = np.sqrt(mean_squared_error(y_test_pca, predictions_pca))
# MAPE must use the percentage-error metric; mean_absolute_error yields MAE,
# which is on a different scale and was being reported under the MAPE label.
mape = mean_absolute_percentage_error(y_test_pca, predictions_pca)

# Calculate NDCG over the top 30 predictions, the cut-off used for the other
# models. ndcg_score expects 2-D input of shape (n_queries, n_items).
ndcg = ndcg_score(np.array([y_test_pca]), np.array([predictions_pca.reshape(-1)]), k=30)

# Print the evaluation metrics
print("RMSE:", rmse)
print("MAPE:", mape)
print("NDCG:", ndcg)




RMSE: 0.1559475041390776
MAPE: 0.11290390939371821
NDCG: 0.7931014343096824


In [None]:
#NN
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import ndcg_score
from sklearn.neural_network import MLPRegressor
from sklearn import metrics

# Load the Excel file containing the dataset
file_path = 'pca_transformed_dataset.xlsx'
df = pd.read_excel(file_path)

# The target is selected by name; every other column is a feature
X = df.drop('Average store monthly revenue', axis=1)
y = df['Average store monthly revenue']   # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

# Standardize the features (scaler fitted on the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the neural network regressor
mlp_regressor = MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', random_state=45)

# Train the model
mlp_regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = mlp_regressor.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Calculate MAPE
mape = metrics.mean_absolute_percentage_error(y_test, y_pred)

# Calculate NDCG over the top 30 predictions, the cut-off used for the other
# models. ndcg_score is imported in this cell so it runs on a fresh kernel
# without relying on an import made by another cell; it expects 2-D input of
# shape (n_queries, n_items).
ndcg = ndcg_score(np.array([y_test]), np.array([y_pred.reshape(-1)]), k=30)

# Print the evaluation metrics
print("RMSE:", rmse)
print("MAPE:", mape)
print("NDCG:", ndcg)



RMSE: 0.17544530999074168
MAPE: 1.132835154035199
NDCG: 0.8274047650047966
