In [38]:
from sklearn import metrics
import numpy as np
from scipy.stats import chi2
import matplotlib.pyplot as plt
from sklearn import metrics
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy import stats
from datetime import datetime
from scipy.stats import norm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [39]:
# Dataset selector read by the loading cell. The values handled there are:
# 'communities', 'housing', 'heloc', 'speeddating', 'default', 'mnist',
# 'fraud' and 'churn'. Any other value leaves `df` and `Y` undefined.
dataname = 'churn'

In [40]:
def remove_outs(df, std_treshold: float = 3.5):
    """Remove outliers using two successive standard-deviation filters.

    Pass 1 keeps entries whose absolute deviation from the mean is at most
    ``std_treshold`` standard deviations. Pass 2 recomputes mean and standard
    deviation on the result of pass 1 and rejects entries that are strictly
    further away than the new limit. Finally every row that contains a NaN is
    dropped.

    For a DataFrame the statistics are computed per column and indexing with
    a boolean frame turns rejected cells into NaN, so a row is removed as soon
    as any one of its cells was rejected.

    Parameters
    ----------
    df : pandas object supporting ``mean``/``std`` and boolean indexing.
    std_treshold : float
        Number of standard deviations tolerated around the mean.

    Returns
    -------
    The filtered data with NaN-containing rows removed.
    """
    def _deviation_and_limit(frame):
        # Absolute distance to the mean, and the allowed distance.
        return np.abs(frame - frame.mean()), std_treshold * frame.std()

    # Pass 1: keep what lies within the limit.
    deviation, limit = _deviation_and_limit(df)
    df = df[deviation <= limit]

    # Pass 2: statistics are recomputed on the already-filtered data.
    deviation, limit = _deviation_and_limit(df)
    df = df[~(deviation > limit)]

    return df.dropna()

def remove_colls(df, corr_threshold: float = 0.8):
    """Drop multicollinear features.

    For every pair of columns whose absolute correlation (``df.corr()``)
    exceeds ``corr_threshold``, the column that appears later in ``df`` is
    removed; the earlier one is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame; must be convertible to float.
    corr_threshold : float
        Absolute correlation above which the later column of a pair is dropped.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the dropped columns, cast to float.
    """
    # Absolute pairwise correlation matrix.
    corr_matrix = df.corr().abs()

    # Strict upper triangle (k=1 excludes the diagonal) so each pair is looked
    # at once and a column is never compared with itself. The builtin `bool`
    # is used because the `np.bool` alias was removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # A column is dropped when it correlates too strongly with any earlier column.
    to_drop = [column for column in upper.columns if (upper[column] > corr_threshold).any()]

    return df.drop(columns=to_drop).astype(float)

In [41]:
# Load and preprocess the dataset selected by `dataname`, producing a feature
# frame `df` and a target `Y`. Every branch reads from `data/` relative to the
# working directory.
# NOTE(review): there is no `else`, so an unknown `dataname` silently leaves
# `df`/`Y` unset (or stale from an earlier run).
# NOTE(review): several branches call `np.random.choice` without a seed, so the
# selected rows differ between runs.
if dataname == 'communities':
    # Headerless CSV: columns are addressed by integer position.
    df = pd.read_csv(f'data/{dataname}.csv', header=None)
    df = df.drop_duplicates()
    # '?' marks a missing value in the raw file.
    df = df.replace('?', np.nan)
    df = df.drop(columns=[0, 1, 2, 3, 4, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 126])
    df = df.dropna().astype(float)
    # Target: column 127 binarised at its median (strictly above -> 1).
    Y = df[127] 
    Y = (Y > np.median(Y)) * 1
    df = df.drop(columns=[127])
    df = remove_colls(df, corr_threshold=0.9)

elif dataname == 'housing':
    df = pd.read_csv(f'data/{dataname}.csv')
    df = df.drop_duplicates()
    df = df.replace('?', np.nan)
    df = df.dropna()
    # Target: house value strictly above the median. Unlike the other branches
    # there is no `* 1`, so Y stays boolean here.
    Y = df['median_house_value'] 
    Y = (Y > np.median(Y))
    df = df.drop(columns=['median_house_value'])
    # Encode the categorical column as integer codes; the three coastal
    # categories share code 0.
    df['ocean_proximity'] = df['ocean_proximity'].replace(['NEAR BAY', 'ISLAND', 'NEAR OCEAN', '<1H OCEAN', 'INLAND'], [0, 0, 0, 1, 2])
    df = df.astype(float)
    df = remove_colls(df, corr_threshold=0.9)

elif dataname == "heloc":
    df = pd.read_csv(f'data/{dataname}.csv')
    df = df.drop_duplicates()
    df = df.replace('?', np.nan)
    df = df.dropna()
    df = df.drop(columns=['RiskPerformance'])
    # Second dropna is a no-op: nothing since the first one introduced NaNs.
    df = df.dropna()
    # Target: ExternalRiskEstimate binarised at its median (>= median -> 1).
    Y = df['ExternalRiskEstimate'] 
    Y = (Y >= np.median(Y)) * 1
    df = df.drop(columns=['ExternalRiskEstimate'])
    df = df.astype(float)
    df = remove_colls(df, corr_threshold=0.9)
    
elif dataname == 'speeddating':
    df = pd.read_csv(f'data/{dataname}.csv', header=None)
    df = df.drop_duplicates()
    df = df.replace('?', np.nan)
    # Rename the positional columns to their string form ('0', '1', ...).
    names = np.arange(df.columns.shape[0])
    names = list(map(str, names))
    df.columns = names
    Y = df['120'] # whether someone matched
    df = df.drop(columns=['2', '7', '8', '120'])
    # Map bucket labels to ordinal codes; labels missing from a dict become NaN.
    df['6'] = df['6'].map({'[0-1]': 1, '[2-3]': 2, '[4-5]': 3, '[4-6]': 4, '[7-37]': 5})
    df['12'] = df['12'].map({'[0-1]': 1, '[2-5]': 2, '[6-10]': 3})
    df['13'] = df['13'].map({'[0-1]': 1, '[2-5]': 2, '[6-10]': 3})
    # NOTE(review): unlike the other branches, no dropna()/astype(float) is
    # applied here, so NaNs (from '?' or unmapped labels) remain in `df`.
    
elif dataname == 'default':
    df = pd.read_csv(f'data/{dataname}.csv', header=None)
    df = df.drop_duplicates()
    df = df.replace('?', np.nan)
    df = df.dropna()
    # Balance the classes defined by the last column: keep every row with a
    # positive value and an equally sized random subset of the zero rows.
    ind_nondefault = np.where(df.values[:,-1] == 0)[0]
    # NOTE(review): despite its name this is the count of zero (non-default)
    # rows, and it is not used afterwards.
    n_default = ind_nondefault.shape[0]
    ind_default = np.where(df.values[:,-1] > 0)[0]
    # Unseeded sampling without replacement.
    ind_nondefault_subset = np.random.choice(ind_nondefault, ind_default.shape[0], replace=False)
    inds = np.r_[ind_default, ind_nondefault_subset]
    df = df.iloc[inds]
    # Y is a NumPy array here (not a Series), built from the last column.
    Y = (df.values[:,-1] >= np.median(df.values[:,-1])) * 1
    df = df.drop(columns=df.columns[-1])
    df = df.astype(float)
    df = remove_colls(df, corr_threshold=0.95)

elif dataname == 'mnist':
    # Train and test files are merged into one frame; the split is redone later.
    df_train = pd.read_csv(f'data/{dataname}_train.csv')
    df_test = pd.read_csv(f'data/{dataname}_test.csv')
    df = pd.concat([ df_train,  df_test], ignore_index=True)
    # Binary task: keep only digits 3 (-> class 0) and 8 (-> class 1).
    y0_ind = np.where(df['label']==3)[0]
    y1_ind = np.where(df['label']==8)[0]
    inds = np.r_[y0_ind , y1_ind]
    df = df.iloc[inds]
    # NOTE(review): assigning into the result of `.iloc[...]` may emit a
    # SettingWithCopyWarning depending on the pandas version - confirm.
    df['label'] = df['label'].map({3:0, 8:1})
    # Y is a NumPy array here (not a Series).
    Y = (df['label'].values) * 1
    df = df.drop(columns=['label'])
    #df = df.iloc[np.random.choice(df.shape[0], int(0.30*df.shape[0]), replace=False)]
    #df['label'] = df['label'].map({0:0, 1:0, 2:0, 4:0, 7:0,  5:1, 3:1, 6:1, 8:1, 9:1})
    
elif dataname == 'fraud':
    df = pd.read_csv(f'data/{dataname}.csv')
    df = df.drop_duplicates()
    df = df.replace('?', np.nan)
    df = df.dropna()
    # The string literal below is disabled code (class balancing on
    # `fraud_bool`); as a bare string expression it has no effect.
    '''
    ind_fraud = np.where(df['fraud_bool'] == 1)[0]
    n_fraud = ind_fraud.shape[0]
    ind_nonfraud = np.where(df['fraud_bool'] == 0)[0]
    ind_nonfraud_subset = np.random.choice(ind_nonfraud, n_fraud, replace=False)
    inds = np.r_[ind_fraud, ind_nonfraud_subset]
    df = df.iloc[inds]
    Y = df['fraud_bool']
    df = df.drop(columns=['fraud_bool'])
    '''
    # One-hot encode each categorical column, then drop the originals.
    for i in ["payment_type", "employment_status", "housing_status", "source", "device_os"]:
        dummies = pd.get_dummies(df[i])
        df[dummies.columns] = dummies
    df = df.drop(columns=["payment_type", "employment_status", "housing_status", "source", "device_os"])
    # Keep an unseeded random 5% of the rows.
    df = df.iloc[np.random.choice(df.shape[0], int(0.05*df.shape[0]), replace=False)]
    # Target: `income` binarised at its median (>= median -> 1).
    # NOTE(review): the active code never drops `fraud_bool`, so if that column
    # exists it stays among the features - confirm this is intended.
    Y = df['income'] 
    Y = (Y >= np.median(Y)) * 1
    df = df.drop(columns=['income'])
elif dataname == 'churn':
    df = pd.read_csv(f'data/{dataname}.csv', header=None)
    df = df.drop_duplicates()
    df = df.replace('?', np.nan)
    # Recode the label column from {-1, 1} to {0, 1}.
    df[230] = df[230].map({-1:0, 1:1})
    # Balance the classes: keep every positive row and an equally sized random
    # subset of the zero rows.
    # NOTE(review): classes are read from the LAST column while the label is
    # taken from column 230 - this assumes 230 is the last column; confirm.
    ind_nondefault = np.where(df.values[:,-1] == 0)[0]
    # NOTE(review): count of zero-class rows despite the name; unused.
    n_default = ind_nondefault.shape[0]
    ind_default = np.where(df.values[:,-1] > 0)[0]
    # Unseeded sampling without replacement.
    ind_nondefault_subset = np.random.choice(ind_nondefault, ind_default.shape[0], replace=False)
    inds = np.r_[ind_default, ind_nondefault_subset]
    df = df.iloc[inds]
    Y = df[230]
    df = df.drop(columns=[230])
    df = df.iloc[:,0:190]         # keep numerical features only
    df = df.fillna(df.median())   # fill missing with median
    # Columns that still contain NaN after the fill (e.g. a column with no
    # observed value has a NaN median) are removed entirely.
    cols = df.columns[df.isna().any()].tolist()
    df = df.drop(columns=cols)
    # No-op at this point: every column containing NaN was just dropped.
    df = df.dropna()

In [51]:
# Peek at the first target label.
# NOTE(review): this cell's execution count (51) is out of sequence with its
# neighbours (41, 42), so it was run after the rest of the notebook.
# NOTE(review): `.values` needs a pandas object; the 'default' and 'mnist'
# branches build Y as a NumPy array, for which this line fails.
Y.values[0]

1

In [42]:
# (number of rows, number of feature columns) after preprocessing.
df.shape

(7344, 174)

In [43]:
# Mean of the target. For a 0/1 target this is the share of positives, i.e. the
# accuracy obtained by always predicting class 1; it is used below as the
# reference score.
baseline = np.mean(Y)
print('Baseline score:', baseline)

Baseline score: 0.5


In [44]:
# Standardise every feature column; `scaler` keeps the fitted statistics.
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split, so test-set statistics influence the training features.
scaler = StandardScaler()
X = scaler.fit_transform(df.values)

In [45]:
# split data
# Hold out half of the rows for testing; the fixed random_state makes the
# partition itself repeatable for a given X and Y.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y.astype(float), test_size=0.5, random_state=10
)

In [46]:
# L2-penalised logistic regression. C is the inverse regularisation strength,
# so C=100 regularises only weakly. `fit` returns the estimator itself.
clf = LogisticRegression(
    penalty='l2',
    C=100,
    fit_intercept=True,
    max_iter=2000,
).fit(X_train, Y_train)

# Mean accuracy on each half of the split.
print('training set accuracy:', clf.score(X_train, Y_train))
print('test set accuracy:', clf.score(X_test, Y_test))

training set accuracy: 0.6399782135076253
test set accuracy: 0.5955882352941176


In [47]:
# Test accuracy minus the baseline score.
# NOTE(review): uses whichever model is currently bound to `clf`; a later cell
# rebinds `clf` to a random forest, so re-running this cell afterwards reports
# that model instead of the logistic regression.
print('Improvement over baseline:', clf.score(X_test, Y_test) - baseline)

Improvement over baseline: 0.09558823529411764


In [48]:
# Random forest with depth-limited trees. The forest is seeded so that, for a
# given train/test split, the fitted model and its scores are reproducible
# (bootstrap samples and feature subsets are otherwise drawn at random).
# The seed value mirrors the one used for the train/test split.
# Note: this rebinds `clf`, replacing the logistic-regression model.
clf = RandomForestClassifier(max_depth=10, random_state=10)
clf.fit(X_train, Y_train)
print('training set accuracy:', clf.score(X_train, Y_train))
print('test set accuracy:', clf.score(X_test, Y_test))

training set accuracy: 0.8619281045751634
test set accuracy: 0.6244553376906318
