In [1]:
import pandas as pd
import numpy as np; np.random.seed(0)
from scipy import stats
import statsmodels
import statsmodels.api as sm
import seaborn as sns; sns.set()
from collections import Counter
import matplotlib.pyplot as plt
import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this is a blanket filter, so deprecation and convergence
# warnings raised by later cells are hidden as well.
warnings.filterwarnings('ignore')
# Pandas display options for the wide tables printed below.
# None means "never truncate cell contents"; the old -1 sentinel for this
# option is deprecated and is not accepted by newer pandas releases.
pd.set_option('display.max_colwidth', None)
# Show floats with three decimal places.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Apply matplotlib's 'classic' style sheet to figures drawn after this point.
# NOTE(review): this runs after sns.set() in the import lines, so it
# presumably changes the seaborn defaults set there; confirm the intended look.
plt.style.use('classic')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Load selection data
data = pd.read_csv('selections.csv')
# Exclude Caddies category (rows whose Category is 0) and renumber the rows
# 0..n-1 so that the pair arithmetic below can use the index labels directly.
data = data[data['Category'] != 0].reset_index(drop=True)
# Rows 2k and 2k+1 are treated as one selection pair. An odd row count means
# one pair is incomplete; stop with a clear message instead of failing with a
# KeyError only when the random draw happens to pick the missing partner row.
# NOTE(review): this assumes the file lists pairs on adjacent rows and that
# the Category filter removes whole pairs; confirm against the data.
if len(data) % 2 != 0:
    raise ValueError(
        'Expected an even number of rows (two per selection pair), got %d' % len(data)
    )
# Randomly select one outcome from each selection pair: draw 0 or 1 per pair
# (one np.random.randint call per pair, in pair order, so the seeded draws are
# the same as before) and drop that member of the pair. All labels are dropped
# in a single call rather than rebuilding the frame once per pair.
dropped_rows = [first + np.random.randint(2) for first in range(0, len(data), 2)]
data = data.drop(index=dropped_rows)
# Split the dependent variable off from the predictors.
ytrain = data['Outcome']
data = data.drop(columns=['Outcome'])

In [3]:
# Pre-process independent variables.
# The first 7 columns of `data` are dummy-encoded, dropping the first level of
# each as the reference category; the original columns are then removed and the
# dummy columns appended after the remaining predictors, in column order.
# NOTE(review): the columns are chosen by position, so re-running this cell
# without re-running the loading cell would encode the *next* seven columns.
categorical_columns = list(data.columns[:7])
# dtype is pinned to uint8 (the default before pandas 2.0). Newer pandas
# returns bool dummies, which turn the mixed frame into object dtype when it is
# converted to an array for model fitting.
dummy_frames = [
    pd.get_dummies(data[column], prefix=column, drop_first=True, dtype='uint8')
    for column in categorical_columns
]
data = pd.concat([data.drop(columns=categorical_columns)] + dummy_frames, axis=1)
# Independent snapshot of the full design matrix used by the all-predictor model.
Xtrain = data.copy()

## MODEL 1

In [4]:
# Model 1: logit of the outcome on the Treatment and Baseline columns only.
# NOTE(review): no constant column is passed, so the fit has no intercept
# term; confirm this is intended.
log_reg = sm.Logit(
    endog=ytrain,
    exog=data.loc[:, ['Treatment', 'Baseline']],
).fit()
# Print the plain-text fit summary.
print(log_reg.summary().as_text())

Optimization terminated successfully.
         Current function value: 0.688245
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  477
Model:                          Logit   Df Residuals:                      475
Method:                           MLE   Df Model:                            1
Date:                Mon, 26 Jul 2021   Pseudo R-squ.:                0.005682
Time:                        19:25:12   Log-Likelihood:                -328.29
converged:                       True   LL-Null:                       -330.17
Covariance Type:            nonrobust   LLR p-value:                   0.05275
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Treatment      0.3369      0.182      1.852      0.064      -0.020       0.693
Baseline      -0.0011      0.

## MODEL 2

In [5]:
# Model 2: Model 1's predictors plus the Altruism, SSI and Interest columns.
# NOTE(review): as written, no constant column is passed, so the fit has no
# intercept term; confirm this is intended.
log_reg = sm.Logit(
    endog=ytrain,
    exog=data.loc[:, ['Treatment', 'Baseline', 'Altruism', 'SSI', 'Interest']],
).fit()
# Print the plain-text fit summary.
print(log_reg.summary().as_text())

Optimization terminated successfully.
         Current function value: 0.686083
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  477
Model:                          Logit   Df Residuals:                      472
Method:                           MLE   Df Model:                            4
Date:                Mon, 26 Jul 2021   Pseudo R-squ.:                0.008805
Time:                        19:25:12   Log-Likelihood:                -327.26
converged:                       True   LL-Null:                       -330.17
Covariance Type:            nonrobust   LLR p-value:                    0.2134
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Treatment      0.3224      0.184      1.755      0.079      -0.038       0.682
Baseline      -0.0081      0.

In [6]:
# Logit of the outcome on the two Gender dummy columns alone.
# NOTE(review): no constant column is passed, so the dropped reference level
# has no intercept term of its own; confirm this is intended.
log_reg = sm.Logit(
    endog=ytrain,
    exog=data.loc[:, ['Gender_1', 'Gender_2']],
).fit()
# Print the plain-text fit summary.
print(log_reg.summary().as_text())

Optimization terminated successfully.
         Current function value: 0.691783
         Iterations 3
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  477
Model:                          Logit   Df Residuals:                      475
Method:                           MLE   Df Model:                            1
Date:                Mon, 26 Jul 2021   Pseudo R-squ.:               0.0005707
Time:                        19:25:12   Log-Likelihood:                -329.98
converged:                       True   LL-Null:                       -330.17
Covariance Type:            nonrobust   LLR p-value:                    0.5393
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Gender_1       0.0591      0.130      0.455      0.649      -0.196       0.314
Gender_2       0.1370      0.

## MODEL 3

In [7]:
# Model 3: logit of the outcome on every column of the pre-processed design
# matrix Xtrain.
# NOTE(review): no constant is added here; confirm Xtrain is meant to be used
# without an intercept term.
log_reg = sm.Logit(endog=ytrain, exog=Xtrain).fit()
# Print the plain-text fit summary.
print(log_reg.summary().as_text())

Optimization terminated successfully.
         Current function value: 0.661978
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  477
Model:                          Logit   Df Residuals:                      446
Method:                           MLE   Df Model:                           30
Date:                Mon, 26 Jul 2021   Pseudo R-squ.:                 0.04363
Time:                        19:25:12   Log-Likelihood:                -315.76
converged:                       True   LL-Null:                       -330.17
Covariance Type:            nonrobust   LLR p-value:                    0.5276
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Interest         0.1063      0.166      0.639      0.523      -0.220       0.433
Baseline         0.0079