In [None]:
#PCA with TGIs
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import StandardScaler

df = pd.read_excel('scaled_location_data.xlsx')

X = df.drop('Average store monthly revenue', axis=1)
Y = df['Average store monthly revenue']

scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X_standardized, Y, test_size=0.2, random_state=42)

pca = PCA()
X_train_pca = pca.fit_transform(X_train)

explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance_ratio.cumsum()

threshold = 0.95
n_components = next(idx for idx, value in enumerate(cumulative_explained_variance) if value >= threshold) + 1

pca = PCA(n_components=15)
X_train_pca = pca.fit_transform(X_train)

X_test_pca = pca.transform(X_test)

model = LinearRegression()
model.fit(X_train_pca, Y_train)

predictions = model.predict(X_test_pca)

rmse = mean_squared_error(Y_test, predictions, squared=False)
print(f'Root Mean Squared Error (RMSE): {rmse}')

mape = mean_absolute_error(Y_test, predictions) / abs(Y_test).mean() * 100
print(f'Mean Absolute Percentage Error (MAPE): {mape}')

ndcg = ndcg_score([Y_test.values], [predictions])
print(f'Normalized Discounted Cumulative Gain (NDCG): {ndcg}')


Root Mean Squared Error (RMSE): 0.1330515980987947
Mean Absolute Percentage Error (MAPE): 49.32697683465225
Normalized Discounted Cumulative Gain (NDCG): 0.9240267812058243


In [None]:
#PCR with TGIs
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import ndcg_score
import numpy as np

df = pd.read_excel('scaled_location_data.xlsx')

X = df.drop('Average store monthly revenue', axis=1)
Y = df['Average store monthly revenue']

scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

def ndcg_scorer(y_true, y_pred):
    return ndcg_score([y_true.values], [y_pred])

scoring = {'RMSE': 'neg_root_mean_squared_error',
           'MAPE': make_scorer(mean_absolute_error),
           'NDCG': ndcg_scorer}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

rmse_scores = []
mape_scores = []
ndcg_scores = []

for train_index, test_index in kf.split(X_standardized):
    X_train, X_test = X_standardized[train_index], X_standardized[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    pca = PCA()
    X_train_pca = pca.fit_transform(X_train)

    threshold = 0.95
    n_components = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= threshold) + 1

    pca = PCA(n_components=3)
    X_train_pca = pca.fit_transform(X_train)

    X_test_pca = pca.transform(X_test)

    model = LinearRegression()
    model.fit(X_train_pca, Y_train)

    pcr_predictions = model.predict(X_test_pca)

    rmse = np.sqrt(mean_squared_error(Y_test, pcr_predictions))
    mape = mean_absolute_error(Y_test, pcr_predictions)
    ndcg = ndcg_score([Y_test.values], [pcr_predictions])

    rmse_scores.append(rmse)
    mape_scores.append(mape)
    ndcg_scores.append(ndcg)

avg_rmse = np.mean(rmse_scores)
avg_mape = np.mean(mape_scores)
avg_ndcg = np.mean(ndcg_scores)

print(f'Average RMSE across folds: {avg_rmse}')
print(f'Average MAPE across folds: {avg_mape}')
print(f'Average NDCG across folds: {avg_ndcg}')


Average RMSE across folds: 0.15481415509922383
Average MAPE across folds: 0.11748763277233679
Average NDCG across folds: 0.8526129075839692


In [None]:
#RandomForest with TGIs
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer
from sklearn.metrics import ndcg_score
import numpy as np

df = pd.read_excel('scaled_location_data1.xlsx')

X = df.drop('Average store monthly revenue', axis=1)
Y = df['Average store monthly revenue']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

rf_model = RandomForestRegressor(random_state=42)

def ndcg_scorer(y_true, y_pred):
    return ndcg_score([y_true.values], [y_pred])

scoring = {'RMSE': 'neg_root_mean_squared_error',
           'MAPE': make_scorer(mean_absolute_error),
           'NDCG': ndcg_scorer}

rf_predictions = cross_val_predict(rf_model, X, Y, cv=5)

rmse = np.sqrt(mean_squared_error(Y, rf_predictions))
mape = mean_absolute_error(Y, rf_predictions)
ndcg = ndcg_score([Y.values], [rf_predictions])

print(f'RMSE on the entire dataset: {rmse}')
print(f'MAPE on the entire dataset: {mape}')
print(f'NDCG on the entire dataset: {ndcg}')


RMSE on the entire dataset: 0.15052019065661348
MAPE on the entire dataset: 0.11043210291112603
NDCG on the entire dataset: 0.9000590540645967


In [None]:
#Lasso with TGIs
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import ndcg_score
import numpy as np

df = pd.read_excel('scaled_location_data1.xlsx')

X = df.drop('Average store monthly revenue', axis=1)
Y = df['Average store monthly revenue']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

lasso_model = Lasso(random_state=42)

def ndcg_scorer(y_true, y_pred):
    return ndcg_score([y_true.values], [y_pred])

scoring = {'RMSE': 'neg_root_mean_squared_error',
           'MAPE': make_scorer(mean_absolute_error),
           'NDCG': ndcg_scorer}

lasso_predictions = cross_val_predict(lasso_model, X, Y, cv=5)

rmse = np.sqrt(mean_squared_error(Y, lasso_predictions))
mape = mean_absolute_error(Y, lasso_predictions)
ndcg = ndcg_score([Y.values], [lasso_predictions])

print(f'RMSE on the entire dataset: {rmse}')
print(f'MAPE on the entire dataset: {mape}')
print(f'NDCG on the entire dataset: {ndcg}')


RMSE on the entire dataset: 0.19799317493420046
MAPE on the entire dataset: 0.15428603724788642
NDCG on the entire dataset: 0.7261080720531122


In [None]:
#GBDT with TGIs
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import ndcg_score
import numpy as np

df = pd.read_excel('scaled_location_data1.xlsx')

X = df.drop('Average store monthly revenue', axis=1)
Y = df['Average store monthly revenue']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

gbdt_model = GradientBoostingRegressor(random_state=42)

def ndcg_scorer(y_true, y_pred):
    return ndcg_score([y_true.values], [y_pred])

scoring = {'RMSE': 'neg_root_mean_squared_error',
           'MAPE': make_scorer(mean_absolute_error),
           'NDCG': ndcg_scorer}

gbdt_predictions = cross_val_predict(gbdt_model, X, Y, cv=5)

rmse = np.sqrt(mean_squared_error(Y, gbdt_predictions))
mape = mean_absolute_error(Y, gbdt_predictions)
ndcg = ndcg_score([Y.values], [gbdt_predictions])

print(f'RMSE on the entire dataset: {rmse}')
print(f'MAPE on the entire dataset: {mape}')
print(f'NDCG on the entire dataset: {ndcg}')


RMSE on the entire dataset: 0.1485082796413721
MAPE on the entire dataset: 0.10948367510720751
NDCG on the entire dataset: 0.8988195706957863


In [None]:
#NeuralNetwoks with TGIs
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import ndcg_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np

df = pd.read_excel('scaled_location_data.xlsx')

X = df.drop('Average store monthly revenue', axis=1)
Y = df['Average store monthly revenue']

scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

def ndcg_scorer(y_true, y_pred):
    return ndcg_score([y_true.values], [y_pred])

scoring = {'RMSE': 'neg_root_mean_squared_error',
           'MAPE': make_scorer(mean_absolute_error),
           'NDCG': ndcg_scorer}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

rmse_scores = []
mape_scores = []
ndcg_scores = []

for train_index, test_index in kf.split(X_standardized):
    X_train, X_test = X_standardized[train_index], X_standardized[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    nn_model = Sequential()
    nn_model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
    nn_model.add(Dense(1, activation='linear'))
    nn_model.compile(optimizer='adam', loss='mean_squared_error')

    nn_model.fit(X_train, Y_train, epochs=10, batch_size=32, verbose=0)

    nn_predictions = nn_model.predict(X_test).flatten()

    rmse = np.sqrt(mean_squared_error(Y_test, nn_predictions))
    mape = mean_absolute_error(Y_test, nn_predictions)
    ndcg = ndcg_score([Y_test.values], [nn_predictions])

    rmse_scores.append(rmse)
    mape_scores.append(mape)
    ndcg_scores.append(ndcg)

avg_rmse = np.mean(rmse_scores)
avg_mape = np.mean(mape_scores)
avg_ndcg = np.mean(ndcg_scores)

print(f'Average RMSE across folds: {avg_rmse}')
print(f'Average MAPE across folds: {avg_mape}')
print(f'Average NDCG across folds: {avg_ndcg}')


Average RMSE across folds: 0.3368418463865684
Average MAPE across folds: 0.2558913334615541
Average NDCG across folds: 0.8194289896722402


In [None]:
preprocessed_data_path = 'preprocessed_data1.xlsx'
X_preprocessed = preprocessor.transform(X)
preprocessed_df = pd.DataFrame(X_preprocessed, columns=X.columns)
preprocessed_df['Average store monthly revenue'] = y
preprocessed_df.to_excel(preprocessed_data_path, index=False)
print(f'Preprocessed data saved to {preprocessed_data_path}')