In [2]:
import pandas as pd
import numpy as np
import itertools
import statsmodels.api as sm

# Read the store-level dataset from disk.
file_path = 'store_sales.csv'  # Update with your local file path if necessary
data = pd.read_csv(file_path)

# Strip stray whitespace from the header labels before any column lookup.
data.columns = data.columns.str.strip()

# Tukey fences for "competing_stores": anything further than 1.5 * IQR
# outside the quartiles is treated as an outlier.
Q1, Q3 = data['competing_stores'].quantile([0.25, 0.75])  # First and third quartiles
IQR = Q3 - Q1  # Interquartile range
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only the rows whose "competing_stores" value lies within the fences
# (both ends inclusive).
cleaned_data = data.loc[data['competing_stores'].between(lower_bound, upper_bound)]

# Target variable and candidate explanatory variables, taken from the cleaned rows.
y = cleaned_data['sales']
X = cleaned_data[[
    'area',
    'walking_time',
    'competing_stores',
    'population_density',
    'parking',
    'dining',
    'main_street',
]]

# Function to calculate BIC
def calculate_bic(y, X):
    """Return the BIC of an OLS regression of ``y`` on ``X`` with an intercept.

    Args:
        y: Response values, passed to ``sm.OLS`` as the dependent variable.
        X: Explanatory variables; a constant term is added with
            ``sm.add_constant`` before the model is fitted.

    Returns:
        The ``bic`` attribute of the fitted OLS results.
    """
    design = sm.add_constant(X)  # Regressors plus the intercept term
    return sm.OLS(y, design).fit().bic

# Enumerate every non-empty subset of the explanatory variables,
# ordered by subset size (1 variable first, all variables last).
explanatory_variables = X.columns
all_combinations = list(
    itertools.chain.from_iterable(
        itertools.combinations(explanatory_variables, size)
        for size in range(1, len(explanatory_variables) + 1)
    )
)

# Fit one model per subset and record its BIC next to the variables used.
results = [
    {'formula': subset, 'BIC': calculate_bic(y, X[list(subset)])}
    for subset in all_combinations
]

# Tabulate the results and order them so the lowest BIC is the first row.
results_df = pd.DataFrame(results).sort_values(by='BIC', ascending=True)

# The first row after sorting is the model with the lowest BIC.
best_combination = results_df.iloc[0]

print("\nBest Combination of Explanatory Variables:")
print("Variables:", best_combination['formula'])
print("Lowest BIC:", best_combination['BIC'])


Best Combination of Explanatory Variables:
Variables: ('area', 'walking_time', 'competing_stores', 'parking', 'dining', 'main_street')
Lowest BIC: 2896.126071456377
