In [1]:
import sys
import pandas as pd
import statsmodels.formula.api as smf
import itertools

In [2]:
# Define functions
# Function to get a list of regression equations without interaction terms
def get_model_list_interaction_none(descriptors, name):
    """Build every main-effects-only regression formula for ``name``.

    Args:
        descriptors: Sequence of predictor names (strings).
        name: Name of the response variable placed left of ``~``.

    Returns:
        A list of formula strings such as ``'y ~ a + b'``, one for each
        non-empty combination of ``descriptors``, ordered by subset size and
        then in ``itertools.combinations`` order. Empty when ``descriptors``
        is empty.
    """
    return [
        name + ' ~ ' + ' + '.join(subset)
        for size in range(1, len(descriptors) + 1)
        for subset in itertools.combinations(descriptors, size)
    ]

# Function to get a list of regression equations containing one interaction term
def get_model_list_interaction_one(descriptors, name):
    """Build formulas that contain at most one pairwise interaction term.

    For each non-empty combination of ``descriptors`` the plain additive
    formula is emitted first, followed by one variant per pair of descriptors
    in that combination with a single ``'a * b'`` term appended.

    Args:
        descriptors: Sequence of predictor names (strings).
        name: Name of the response variable placed left of ``~``.

    Returns:
        A list of formula strings, e.g. ``'y ~ a + b'`` followed by
        ``'y ~ a + b + a * b'``.
    """
    formulas = []
    for size in range(1, len(descriptors) + 1):
        for subset in itertools.combinations(descriptors, size):
            base = name + ' ~ ' + ' + '.join(subset)
            formulas.append(base)
            # One extra formula per descriptor pair within this subset.
            formulas.extend(
                base + ' + ' + ' * '.join(pair)
                for pair in itertools.combinations(subset, 2)
            )
    return formulas

# Function to get a list of regression equations including all interaction terms
def get_model_list_interaction_all(descriptors, name):
    """Build formulas covering every subset of pairwise interaction terms.

    For each non-empty combination of ``descriptors``, all pairwise
    ``'a * b'`` terms within that combination are formed, and one formula is
    emitted for every subset of those terms (including the empty subset,
    which yields the plain additive formula).

    Args:
        descriptors: Sequence of predictor names (strings).
        name: Name of the response variable placed left of ``~``.

    Returns:
        A list of formula strings. The list grows very quickly with the
        number of descriptors.
    """
    formulas = []
    for size in range(1, len(descriptors) + 1):
        for subset in itertools.combinations(descriptors, size):
            base = name + ' ~ ' + ' + '.join(subset)
            pair_terms = [
                ' * '.join(pair) for pair in itertools.combinations(subset, 2)
            ]
            for count in range(len(pair_terms) + 1):
                for chosen in itertools.combinations(pair_terms, count):
                    # With no chosen terms this reduces to the base formula.
                    formulas.append(' + '.join((base,) + chosen))
    return formulas

def get_model_list(descriptors, target, interaction='none'):
    """Return candidate regression formulas for ``target``.

    Args:
        descriptors: Sequence of predictor names (strings).
        target: Name of the response variable.
        interaction: Which generator to use: ``'none'`` (main effects only),
            ``'one'`` (at most one pairwise interaction term) or ``'all'``
            (every subset of pairwise interaction terms).

    Returns:
        The list of formula strings produced by the selected generator.

    Raises:
        ValueError: If ``interaction`` is not one of the three modes.
    """
    interactions = ['none', 'one', 'all']
    # Reject unknown modes up front so the dispatch below is exhaustive.
    if interaction not in interactions:
        raise ValueError(f"interaction must be one of {', '.join(interactions)}")

    if interaction == 'none':
        return get_model_list_interaction_none(descriptors, target)
    if interaction == 'one':
        return get_model_list_interaction_one(descriptors, target)
    return get_model_list_interaction_all(descriptors, target)

In [3]:
# Load data
# Place store_sales.csv in the same folder as this notebook
df = pd.read_csv('store_sales.csv')
display(df)

Unnamed: 0,sales,area,walking_time,competing_stores,population_density,parking,dining,main_street
0,87.6,58.121727,6.941218,2,11.966089,0,0,0
1,92.0,53.186809,5.700467,5,11.127574,0,1,1
2,96.3,55.668847,4.500544,3,16.835161,1,0,0
3,85.1,44.352024,5.749738,4,39.470058,0,1,0
4,86.4,46.541696,8.016232,2,17.731017,0,0,0
...,...,...,...,...,...,...,...,...
494,84.7,52.010861,11.387535,3,51.603845,1,1,0
495,72.2,44.100876,14.949785,3,23.235896,1,0,0
496,80.7,48.662116,14.808499,3,56.799533,1,1,0
497,96.1,55.478438,8.929852,2,39.143037,1,1,0


# Analysis #2

In [7]:
# Exhaustive best-subset search: fit an OLS model for every non-empty
# combination of descriptors and keep the one with the lowest BIC.
min_ec = float('inf')
opt_model = ''
opt_results = None

# Strip any whitespace from column names (idempotent, safe to re-run)
df.columns = df.columns.str.strip()

# Descriptor names, written exactly as the cleaned column names
# (no leading/trailing spaces)
descriptors = ['area', 'walking_time', 'competing_stores', 'population_density',
               'parking', 'dining', 'main_street']
target = 'sales'

# Fail fast with a clear message instead of a formula-evaluation error
# part-way through the search
missing_columns = [col for col in descriptors + [target] if col not in df.columns]
if missing_columns:
    raise KeyError(f"columns not found in df: {', '.join(missing_columns)}")

# Generate main-effects-only formulas for every descriptor combination
models = get_model_list(descriptors, target)

print('BIC\t formula')

for model in models:
    # No descriptor is forced into the formulas: a variable such as
    # population_density appears in the optimum model only if including it
    # lowers BIC.
    results = smf.ols(formula=model, data=df).fit()
    ec = results.bic
    print('{:.0f}\t {}'.format(ec, model))

    # Track the best (lowest-BIC) model seen so far
    if ec < min_ec:
        min_ec = ec
        opt_model = model
        opt_results = results

print('\nOptimum model:', opt_model)
display(opt_results.summary())


BIC	 formula
3953	 sales ~ area 
3732	 sales ~ walking_time
3876	 sales ~ competing_stores 
3959	 sales ~ population_density
3936	 sales ~ parking
3947	 sales ~ dining
3774	 sales ~ main_street
3729	 sales ~ area  + walking_time
3875	 sales ~ area  + competing_stores 
3959	 sales ~ area  + population_density
3936	 sales ~ area  + parking
3946	 sales ~ area  + dining
3776	 sales ~ area  + main_street
3567	 sales ~ walking_time + competing_stores 
3738	 sales ~ walking_time + population_density
3729	 sales ~ walking_time + parking
3720	 sales ~ walking_time + dining
3381	 sales ~ walking_time + main_street
3882	 sales ~ competing_stores  + population_density
3849	 sales ~ competing_stores  + parking
3861	 sales ~ competing_stores  + dining
3689	 sales ~ competing_stores  + main_street
3943	 sales ~ population_density + parking
3954	 sales ~ population_density + dining
3780	 sales ~ population_density + main_street
3930	 sales ~ parking + dining
3749	 sales ~ parking + main_street
3761	 s

0,1,2,3
Dep. Variable:,sales,R-squared:,0.888
Model:,OLS,Adj. R-squared:,0.887
Method:,Least Squares,F-statistic:,652.8
Date:,"Mon, 11 Nov 2024",Prob (F-statistic):,1.2499999999999999e-230
Time:,18:35:24,Log-Likelihood:,-1426.3
No. Observations:,499,AIC:,2867.0
Df Residuals:,492,BIC:,2896.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,97.4885,2.115,46.099,0.000,93.333,101.644
area,0.2408,0.039,6.213,0.000,0.165,0.317
walking_time,-1.6629,0.041,-40.205,0.000,-1.744,-1.582
competing_stores,-5.1078,0.195,-26.139,0.000,-5.492,-4.724
parking,3.1736,0.388,8.188,0.000,2.412,3.935
dining,4.9924,0.385,12.975,0.000,4.236,5.748
main_street,16.6951,0.482,34.638,0.000,15.748,17.642

0,1,2,3
Omnibus:,20.789,Durbin-Watson:,2.072
Prob(Omnibus):,0.0,Jarque-Bera (JB):,37.647
Skew:,0.264,Prob(JB):,6.69e-09
Kurtosis:,4.237,Cond. No.,572.0


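왜 population density가 없지? → 위 코드는 모든 변수 조합 중 BIC가 가장 낮은 모델을 선택한다. population_density는 추가해도 BIC가 낮아지지 않으므로(예: `sales ~ walking_time` 3732 → `sales ~ walking_time + population_density` 3738) 최적 모델에서 제외되었다.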