In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [3]:
sales_data_raw = pd.read_csv("sale-data-raw.csv").iloc[:, 1:]

In [4]:
sales_data_raw.head()

Unnamed: 0,acctAge,visitsMonth,spendToDate,spendMonth,satSite,satQuality,satPrice,satOverall,region,coupon,purchase
0,21,9,21,21,6,5,6,6,Mideast,0,0
1,9,9,55,55,4,5,4,4,Southwest,0,0
2,16,7,64,17,6,6,7,7,Mideast,1,0
3,4,8,61,8,6,6,6,6,Southwest,1,0
4,16,6,27,9,5,5,6,6,Midwest,0,0


In [5]:
from sklearn.decomposition import PCA


# Assuming sales.data.raw is a pandas DataFrame
sales_data_subset = sales_data_raw.iloc[:, [4, 5, 6, 7]]
pca = PCA()
pc_sat = pca.fit_transform(sales_data_subset)

In [6]:
sales_data_rmcor = sales_data_raw.drop(["satSite", "satQuality", "satPrice","satOverall"], axis='columns')
sales_data_rmcor['sat'] = pc_sat[:, 0]
sales_data_rmcor.head()

Unnamed: 0,acctAge,visitsMonth,spendToDate,spendMonth,region,coupon,purchase,sat
0,21,9,21,21,Mideast,0,0,0.025656
1,9,9,55,55,Southwest,0,0,3.11977
2,16,7,64,17,Mideast,1,0,-1.464107
3,4,8,61,8,Southwest,1,0,-0.423617
4,16,6,27,9,Midwest,0,0,0.532224


In [7]:
from sklearn.preprocessing import StandardScaler

# Assuming sales.data.rmcor is a pandas DataFrame
sales_data_subset = sales_data_rmcor.drop(sales_data_rmcor.columns[4], axis=1)
scaler = StandardScaler()
sales_data_sc = pd.DataFrame(scaler.fit_transform(sales_data_subset), columns=sales_data_subset.columns)

sales_data_sc['region'] = sales_data_raw['region']
sales_data_sc.head()

Unnamed: 0,acctAge,visitsMonth,spendToDate,spendMonth,coupon,purchase,sat,region
0,1.413673,0.815049,-0.516865,-0.197683,-0.716018,-0.179272,0.008743,Mideast
1,-0.664221,0.815049,-0.349409,0.3115,-0.716018,-0.179272,1.063087,Southwest
2,0.547884,-0.081022,-0.305083,-0.257587,1.396614,-0.179272,-0.498906,Mideast
3,-1.53001,0.367014,-0.319858,-0.392371,1.396614,-0.179272,-0.144351,Southwest
4,0.547884,-0.529058,-0.487314,-0.377395,-0.716018,-0.179272,0.18136,Midwest


In [8]:
from scipy.stats import boxcox

sales_data_bc = sales_data_raw.copy()

# Transform the specified columns using autoTransform function
cols_to_transform = [col for col in sales_data_bc.columns if col not in ['region', 'coupon', 'purchase']]
for col in cols_to_transform:
    sales_data_bc[col], _ = boxcox(sales_data_bc[col])
    scaler = StandardScaler()
    sales_data_bc[col] = scaler.fit_transform(sales_data_bc[[col]])

# Select the required columns
sales_data_bc = sales_data_bc[['region', 'coupon', 'purchase'] + cols_to_transform]

# Add the 'sat' column
sales_data_bc['sat'] = pc_sat[:, 0]

sales_data_bc.head()

Unnamed: 0,region,coupon,purchase,acctAge,visitsMonth,spendToDate,spendMonth,satSite,satQuality,satPrice,satOverall,sat
0,Mideast,0,0,1.378137,0.82534,-1.269914,0.275749,0.188668,-0.645058,0.143403,0.185873,0.025656
1,Southwest,0,0,-0.636833,0.82534,0.115819,1.169225,-1.119975,-0.645058,-1.098463,-1.029447,3.11977
2,Mideast,1,0,0.56311,-0.040222,0.282106,0.036496,0.188668,-0.026132,0.774815,0.792157,-1.464107
3,Southwest,1,0,-1.568385,0.399377,0.23071,-0.974861,0.188668,-0.026132,0.143403,0.185873,-0.423617
4,Midwest,0,0,0.56311,-0.495957,-0.845496,-0.798915,-0.464341,-0.645058,0.143403,0.185873,0.532224


In [9]:
sales_data_bc['region'] = sales_data_bc['region'].astype('category')
# sales_data_bc['region'] = sales_data_bc['region'].cat.reorder_categories(['old', 'ren', 'new'], ordered=True)
sales_data_bc['region'] = sales_data_bc['region'].cat.codes

In [10]:
sales_data_bc.drop(["satSite", "satQuality", "satPrice","satOverall"], axis='columns', inplace=True)

In [11]:
from sklearn.linear_model import LinearRegression

# Assuming sales.data.bc is a pandas DataFrame
X = sales_data_bc.drop('spendMonth', axis=1)
y = sales_data_bc['spendMonth']

linear_model = LinearRegression()
linear_model.fit(X, y)

In [13]:
from statsmodels.formula.api import ols
from statsmodels.stats.api import anova_lm

model = ols('spendMonth ~ ' + ' + '.join(X.columns), data=sales_data_bc).fit()
print(model.summary())

# Get the ANOVA table
print(anova_lm(model))

                            OLS Regression Results                            
Dep. Variable:             spendMonth   R-squared:                       0.086
Model:                            OLS   Adj. R-squared:                  0.078
Method:                 Least Squares   F-statistic:                     11.09
Date:                Thu, 15 Aug 2024   Prob (F-statistic):           1.88e-13
Time:                        00:22:53   Log-Likelihood:                -1147.4
No. Observations:                 835   AIC:                             2311.
Df Residuals:                     827   BIC:                             2349.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      -0.0427      0.061     -0.694      

In [15]:
from statsmodels.formula.api import logit
from statsmodels.genmod.families import Binomial

# Assuming sales.data.bc is a pandas DataFrame
log_model = logit('purchase ~ coupon', data=sales_data_bc).fit()
print(log_model.summary())

Optimization terminated successfully.
         Current function value: 0.128378
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:               purchase   No. Observations:                  835
Model:                          Logit   Df Residuals:                      833
Method:                           MLE   Df Model:                            1
Date:                Thu, 15 Aug 2024   Pseudo R-squ.:                 0.07425
Time:                        00:23:46   Log-Likelihood:                -107.20
converged:                       True   LL-Null:                       -115.79
Covariance Type:            nonrobust   LLR p-value:                 3.371e-05
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -4.3549      0.380    -11.449      0.000      -5.100      -3.609
coupon         1.7234      0.

In [17]:
log_model = logit('purchase ~ coupon*sat', data=sales_data_bc).fit()
print(log_model.summary())

Optimization terminated successfully.
         Current function value: 0.110071
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:               purchase   No. Observations:                  835
Model:                          Logit   Df Residuals:                      831
Method:                           MLE   Df Model:                            3
Date:                Thu, 15 Aug 2024   Pseudo R-squ.:                  0.2063
Time:                        00:24:51   Log-Likelihood:                -91.909
converged:                       True   LL-Null:                       -115.79
Covariance Type:            nonrobust   LLR p-value:                 2.385e-10
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -4.5950      0.479     -9.597      0.000      -5.533      -3.657
coupon         0.7021      0.

In [18]:
odds_ratios = np.exp(log_model.params)
print(odds_ratios)

Intercept     0.010102
coupon        2.018052
sat           0.678133
coupon:sat    0.936362
dtype: float64


In [22]:
conj_df = pd.read_csv('bag-data.csv')

In [23]:
conj_df.head()

Unnamed: 0.1,Unnamed: 0,resp.id,rating,price,color,zipper,finish
0,1,1,3,17,gray,silver,patent
1,2,1,10,15,navy,silver,matte
2,3,1,9,15,black,gold,patent
3,4,1,6,19,navy,gold,matte
4,5,1,2,19,navy,silver,patent


In [25]:
from statsmodels.formula.api import mixedlm

# Assuming conjoint.df is a pandas DataFrame
mixed_model = mixedlm("rating ~ price + color + zipper + finish",
                     data=conj_df,
                     groups="resp.id",
                     re_formula="~price + color + zipper + finish")

mixed_model_fit = mixed_model.fit(max_iter=100000)
print(mixed_model_fit.summary())



                       Mixed Linear Model Regression Results
Model:                      MixedLM         Dependent Variable:         rating     
No. Observations:           4500            Method:                     REML       
No. Groups:                 300             Scale:                      3.0837     
Min. group size:            15              Log-Likelihood:             -10446.3739
Max. group size:            15              Converged:                  No         
Mean group size:            15.0                                                   
-----------------------------------------------------------------------------------
                                        Coef.  Std.Err.    z    P>|z| [0.025 0.975]
-----------------------------------------------------------------------------------
Intercept                               13.248    0.276  47.990 0.000 12.707 13.790
color[T.gray]                           -2.028    0.079 -25.617 0.000 -2.183 -1.873
color[T.navy]  



In [44]:
min = 0
max = 0
minmax = [1,1]
for i in range(1,301):
    if mixed_model_fit.random_effects[i]['color[T.navy]'] < min :
        min = mixed_model_fit.random_effects[i]['color[T.navy]']
        minmax[0] = i
    elif mixed_model_fit.random_effects[i]['color[T.navy]'] > max :
        max = mixed_model_fit.random_effects[i]['color[T.navy]']
        minmax[1] = i
print(minmax)