In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Column-name groups referenced by the modelling cells below.

# Columns listed as numeric features.
num_cols = ['ClientPeriod', 'MonthlySpending', 'TotalSpent']

# Columns listed as categorical features (order preserved from the original list).
cat_cols = [
    'Sex', 'IsSeniorCitizen', 'HasPartner', 'HasChild',
    'HasPhoneService', 'HasMultiplePhoneNumbers',
    'HasInternetService', 'HasOnlineSecurityService', 'HasOnlineBackup',
    'HasDeviceProtection', 'HasTechSupportAccess',
    'HasOnlineTV', 'HasMovieSubscription',
    'HasContractPhone', 'IsBillingPaperless', 'PaymentMethod',
]

# Name of the label column the models are fitted against.
target_col = 'Churn'

In [172]:
def prepare_data(data, train=False):
    """Impute zero tenures and add a binned tenure column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'HasContractPhone' and 'ClientPeriod'.
    train : bool, default False
        Accepted for call-site compatibility; this version does not use it.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which zero 'ClientPeriod' values are replaced by
        the median 'ClientPeriod' of their 'HasContractPhone' group, plus a new
        string column 'ClientPeriodG' holding the interval label of each value.
    """
    # Hand-specified bin edges; intervals are right-closed, e.g. (0, 2], (2, 5].
    period_bins = [0,2,5,10,17,25,33,43,53,62,70,72,100]

    # Work on a copy so the caller's frame is not modified as a side effect.
    data = data.copy()

    # transform() keeps the original row index. groupby.apply() prepends the
    # group keys on pandas >= 2.0, which makes the column assignment fail.
    data['ClientPeriod'] = data.groupby('HasContractPhone')['ClientPeriod'].transform(
        lambda x: x.replace(0, x.median())
    )
    # Values outside the bins become NaN and therefore the string 'nan'.
    data['ClientPeriodG'] = pd.cut(data['ClientPeriod'], bins=period_bins).astype(str)
    return data

# Load the training file and add the binned tenure feature.
data = prepare_data(pd.read_csv('./train.csv'), train=True)
# Stratified 60/20/20 split: hold out 20% as test, then 25% of the remaining 80% as validation.
train_valid_data, test_data = train_test_split(data, test_size=0.2, stratify=data[target_col], random_state=42)
train_data, valid_data = train_test_split(train_valid_data, test_size=0.25, stratify=train_valid_data[target_col], random_state=42)

# Categorical columns plus the binned tenure, minus the excluded columns.
# An order-preserving filter is used instead of set arithmetic: set iteration
# order over strings is not stable between interpreter runs, so the feature
# order (and hence the coefficient layout) would differ on each kernel restart.
feature_cols = [col for col in cat_cols + ['ClientPeriodG'] if col not in ('Sex','HasPartner','HasDeviceProtection')]

# One-hot encode every feature; categories unseen during fit are encoded as all zeros.
ohe = OneHotEncoder(handle_unknown='ignore')
# L1-penalised logistic regression.
lr = LogisticRegression(C=0.4, penalty='l1', solver='liblinear', random_state=42)
model = Pipeline([('ohe', ohe), ('clf', lr)])
model.fit(train_data[feature_cols], train_data[target_col])

# Report the absolute coefficient range, how many coefficients are exactly zero,
# and ROC AUC on the validation and test splits.
c = np.abs(model['clf'].coef_)
valid_probas = model.predict_proba(valid_data[feature_cols])[:,1]
test_probas = model.predict_proba(test_data[feature_cols])[:,1]
print(f'coef_: {c.min():.3f} (min), {c.max():.3f} (max), {(c==0).sum()} / {len(c[0])} (zeros)')
print('roc_auc_score (valid): ', roc_auc_score(valid_data[target_col], valid_probas))
print('roc_auc_score (test): ', roc_auc_score(test_data[target_col], test_probas))

# Score the competition file and write the predicted probabilities into the submission template.
# NOTE: `data` is rebound here from the training frame to the prepared ./test.csv frame.
data = prepare_data(pd.read_csv('./test.csv'), train=False)
submission = pd.read_csv('./submission.csv')
submission['Churn'] = model.predict_proba(data[feature_cols])[:,1]
submission.to_csv('./my_submission.csv', index=False)

coef_: 0.000 (min), 1.780 (max), 11 / 47 (zeros)
roc_auc_score (valid):  0.8629709339998148
roc_auc_score (test):  0.858039433490697


In [192]:
def get_target_rates(data, cat_cols, target_col):
    """Compute the mean of the target for every value of each listed column.

    Returns a dict of the form ``{column: {value: mean of target_col}}`` with
    one entry per name in ``cat_cols``.
    """
    rates = {}
    for col in cat_cols:
        per_value_mean = data.groupby(col)[target_col].mean()
        rates[col] = per_value_mean.to_dict()
    return rates

def map_target_rates(data, cat_cols, target_rates):
    """Replace each value in the listed columns with its rate from ``target_rates``.

    ``target_rates`` is a ``{column: {value: rate}}`` mapping. Values without
    an entry for their column come back as NaN. Returns a new frame holding
    only the columns in ``cat_cols``.
    """
    def _encode(column):
        # Look up the per-column mapping by the column's own name.
        return column.map(target_rates[column.name])

    selected = data[cat_cols]
    return selected.apply(_encode)

def prepare_data(data, train=False):
    """Clean, bin and target-encode the feature columns.

    Steps:
      1. Coerce 'TotalSpent' to numeric; unparsable entries become 0.
      2. Replace zeros in 'TotalSpent' and 'ClientPeriod' with the median of
         their 'HasContractPhone' group.
      3. Cut both columns into fixed, right-closed bins and turn the interval
         labels into strings.
      4. Replace every value in ``cat_cols`` + the two binned columns with the
         mean of ``target_col`` observed for that value.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame. Must contain ``target_col`` when ``train`` is True.
    train : bool, default False
        When True, the per-value target rates are computed from ``data`` and
        stored on ``prepare_data.target_rates``. When False, the stored rates
        from the most recent ``train=True`` call are reused.

    Returns
    -------
    pandas.DataFrame
        A transformed copy of ``data``. Values that have no stored rate are
        NaN in the result.

    Raises
    ------
    AttributeError
        If called with ``train=False`` before any ``train=True`` call.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    data = data.copy()

    data['TotalSpent'] = pd.to_numeric(data['TotalSpent'], errors='coerce').fillna(0)

    def _zeros_to_group_median(values):
        return values.replace(0, values.median())

    # transform() keeps the original row index. groupby.apply() prepends the
    # group keys on pandas >= 2.0, which makes the column assignment fail.
    for col in ('TotalSpent', 'ClientPeriod'):
        data[col] = data.groupby('HasContractPhone')[col].transform(_zeros_to_group_median)

    # Hand-specified bin edges; values outside them become the string 'nan'.
    period_bins = [0,2,5,10,17,25,33,43,53,62,70,72,100]
    spent_bins = [0,18.8,198.2,679.1,1400.55,2754.3,4917.8,8684.8]
    data['ClientPeriod'] = pd.cut(data['ClientPeriod'], bins=period_bins).astype(str)
    data['TotalSpent'] = pd.cut(data['TotalSpent'], bins=spent_bins).astype(str)

    rates_cols = cat_cols + ['ClientPeriod','TotalSpent']
    if train:
        # The fitted encoding is kept on the function object for later train=False calls.
        prepare_data.target_rates = get_target_rates(data, rates_cols, target_col)
    elif not hasattr(prepare_data, 'target_rates'):
        raise AttributeError(
            "prepare_data.target_rates is not set: call prepare_data(..., train=True) "
            "on the training data before calling it with train=False"
        )
    data[rates_cols] = map_target_rates(data, rates_cols, prepare_data.target_rates)
    return data

# Load the training file and target-encode its features.
# NOTE(review): prepare_data(train=True) computes the per-value churn rates on the
# whole file, before the split below, so the validation/test scores are computed
# on rows whose labels contributed to the encoding (target leakage).
data = prepare_data(pd.read_csv('./train.csv'), train=True)
# Stratified 60/20/20 split: hold out 20% as test, then 25% of the remaining 80% as validation.
train_valid_data, test_data = train_test_split(data, test_size=0.2, stratify=data[target_col], random_state=42)
train_data, valid_data = train_test_split(train_valid_data, test_size=0.25, stratify=train_valid_data[target_col], random_state=42)

redundant_cols = ['Sex','HasPartner','HasDeviceProtection','MonthlySpending']
# An order-preserving filter is used instead of set arithmetic: set iteration
# order over strings is not stable between interpreter runs, so the feature
# order (and hence the coefficient layout) would differ on each kernel restart.
feature_cols = [col for col in cat_cols + num_cols if col not in redundant_cols]

# Standardise the encoded features, then fit an L1-penalised logistic regression.
sc = StandardScaler()
lr = LogisticRegression(C=1, penalty='l1', solver='liblinear', random_state=42)
model = Pipeline([('sca', sc), ('clf', lr)])
model.fit(train_data[feature_cols], train_data[target_col])

# Report the absolute coefficient range, how many coefficients are exactly zero,
# and ROC AUC on the validation and test splits.
c = np.abs(model['clf'].coef_)
valid_probas = model.predict_proba(valid_data[feature_cols])[:,1]
test_probas = model.predict_proba(test_data[feature_cols])[:,1]
print(f'coef_: {c.min():.3f} (min), {c.max():.3f} (max), {(c==0).sum()} / {len(c[0])} (zeros)')
print('roc_auc_score (valid): ', roc_auc_score(valid_data[target_col], valid_probas))
print('roc_auc_score (test): ', roc_auc_score(test_data[target_col], test_probas))

# Score the competition file with the stored encoding and write the submission.
# NOTE: `data` is rebound here from the training frame to the prepared ./test.csv frame.
data = prepare_data(pd.read_csv('./test.csv'), train=False)
submission = pd.read_csv('./submission.csv')
submission['Churn'] = model.predict_proba(data[feature_cols])[:,1]
submission.to_csv('./my_submission.csv', index=False)

coef_: 0.006 (min), 1.009 (max), 0 / 15 (zeros)
roc_auc_score (valid):  0.8646116819402019
roc_auc_score (test):  0.8591270943256503


In [1357]:
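# Experiment log: configuration tried, followed (where recorded) by two ROC AUC scores,
# presumably validation / test (TODO confirm).
# all features, drop='first', C=100000, penalty='l2'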
# with outliers, C=10000000, penalty='l2'
# with outliers, C=0.4, penalty='l1'
# C=1000, penalty='l1'
# with outliers, C=1, penalty='l2' 0.8591780061094142 / 0.8564148847542349
# with outliers, C=0.33, penalty='l1' 0.8621239470517449 / 0.8579098398592981
# C=100000, penalty='l1' 0.8515544284775054 / 0.852812998966845
# no replacing zeros, no removing outliers, C=0.4, penalty='l1' 0.862054521892067 / 0.8577848745718781
# with outliers, without 'Sex','HasPartner','HasDeviceProtection', C=0.33, penalty='l1', solver='liblinear' 
# 0.8623530500786818 / 0.8580741460705361

In [114]:
# Cutting numeric features into bins
# Quantile bin edges are derived from the train and test files combined, after the
# same zero-imputation that prepare_data applies.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
data = pd.concat([train, test]).reset_index()
# Non-numeric TotalSpent entries become NaN, then 0, then the group median below.
data['TotalSpent'] = pd.to_numeric(data['TotalSpent'], errors='coerce')
data['TotalSpent'] = data['TotalSpent'].fillna(0)
# transform() keeps the original row index. groupby.apply() prepends the group
# keys on pandas >= 2.0, which makes the column assignment fail.
data['TotalSpent'] = data.groupby('HasContractPhone')['TotalSpent'].transform(lambda x: x.replace(0, x.median()))
data['ClientPeriod'] = data.groupby('HasContractPhone')['ClientPeriod'].transform(lambda x: x.replace(0, x.median()))
# retbins=True returns the computed edges; the binned series themselves are discarded.
_, period_bins = pd.qcut(data['ClientPeriod'], 11, retbins=True)
_, spending_bins = pd.qcut(data['MonthlySpending'], 6, retbins=True)
_, spent_bins = pd.qcut(data['TotalSpent'], 6, retbins=True)
print('ClientPeriod bins:', period_bins.tolist())
print('MonthlySpending bins:', spending_bins.tolist())
print('TotalSpent bins:', spent_bins.tolist())

ClientPeriod bins: [1.0, 2.0, 5.0, 10.0, 17.0, 25.0, 34.0, 44.0, 53.0, 62.0, 70.0, 72.0]
MonthlySpending bins: [18.25, 21.1, 50.4, 70.35, 83.9, 96.1, 118.75]
TotalSpent bins: [18.8, 198.2, 679.0999999999999, 1400.55, 2754.2999999999997, 4917.8, 8684.8]
