In [39]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import linear_model
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
%matplotlib inline

class encode_categorical_feature:
    def __init__(self,feature):
        self.f_set = []
        self.feature = feature
    def fit(self,data):
        if self.feature in data.columns:
            self.f_set = sorted(list(data[self.feature].unique()))
            f_values   = data[self.feature].apply(lambda x: [self.f_set.index(x)])
            self.encoder = OneHotEncoder().fit(list(f_values))
    def transform(self,data):
        if not self.f_set:
            return data
        f_values  = data[self.feature].apply(lambda x: [self.f_set.index(x)])
        f_encoded = self.encoder.transform(list(f_values)).toarray()
        f_df      = pd.DataFrame(f_encoded, columns=[self.feature+'_'+str(x) for x in self.f_set])
        new_data  = pd.concat([data.reset_index(drop=True),f_df], axis=1)
        new_data.drop(self.feature, axis=1, inplace=True)
        return new_data

def preprocess_data(train_data, test_data):
    t = encode_categorical_feature('religion')
    t.fit(train_data)
    train_data = t.transform(train_data)
    test_data  = t.transform(test_data)
    
    features = [x for x in train_data.columns if x not in ['patientID', 'geo','segment','subgroup','combined_label', 'INTNR' ]]
    target = 'combined_label'
    # train_data
    train_data['combined_label'] = 100*train_data['geo'] + 10*train_data['segment'] + train_data['subgroup']
    train_data = train_data.fillna(0)
    X_train,y_train = train_data[features],train_data[target]
    # test_data
    test_data = test_data.fillna(0)
    X_test = test_data[features]
    # select features
    selection1 = ['DISTRICT', u'tribe', u'REGION_PROVINCE', u'babydoc', u'foodinsecurity', u'religion_Buddhist', u'india', u'hindu', u'religion_Hindu', u'religion_Russian/Easter', u'educ', u'Debut', u'literacy', u'christian', u'hivknow', u'ModCon', u'age', u'thrasher', u'usecondom', u'religion_Other Christia', u'religion_Muslim', u'muslim', u'lowlit', u'multpart', u'motorcycle', u'CHILDREN', u'LaborDeliv']
    selection2 = [u'christian', u'hindu', u'REGION_PROVINCE', u'DISTRICT', u'electricity', u'age', u'tribe', u'foodinsecurity', u'EVER_HAD_SEX', u'EVER_BEEN_PREGNANT', u'CHILDREN', u'india', u'married', u'multpart', u'educ', u'literacy', u'LaborDeliv', u'babydoc', u'Debut', u'ModCon', u'usecondom', u'hivknow', u'religion_Buddhist', u'religion_Hindu', u'religion_Russian/Easter']
    selection3 = [u'DISTRICT', u'tribe', u'REGION_PROVINCE', u'babydoc', u'india', u'educ', u'Debut', u'literacy', u'hivknow', u'ModCon', u'age', u'usecondom', u'multpart', u'CHILDREN', u'LaborDeliv', u'married']
    selected_features = selection1+selection2+selection3
    X_train = X_train[selected_features]
    X_test = X_test[selected_features]
#     return X_train,y_train,X_test
    return X_train.values,y_train.values,X_test.values

def gen_models():
    def gen_models_rf(parameters):
        models = []
        for n in parameters['n_estimators']:
            for m in parameters['max_depth']:
                models.append(RandomForestClassifier(n_estimators=n, max_depth=m, random_state=0))
        return models
    def gen_models_gbm(parameters):
        models = []
        for n in parameters['n_estimators']:
            for m in parameters['max_depth']:
                for l in parameters['learning_rate']:
                    models.append(GradientBoostingClassifier(n_estimators=n, max_depth=m, random_state=0))
        return models
    def gen_models_log(parameters):
        models = []
        for p in parameters['penalty']:
            for c in parameters['C']:
                models.append(linear_model.LogisticRegression(penalty=p, C=c))
        return models

    def gen_models_svm(parameters):
        models = []
        for c in parameters['C']:
            for k in parameters['kernel']:
                if 'gamma' in parameters:
                    for g in parameters['gamma']:
                        models.append(svm.SVC(C=c, kernel=k, gamma=g, probability=True))
                else:
                    models.append(svm.SVC(C=c, kernel=k, probability=True))

        return models

    param_gbm = { 'n_estimators':[50, 100,200],
                  'learning_rate':[0.1,1.0],
                  'max_depth':[1,2,3]}
    param_rf  = {'n_estimators':[100,200,500,800,1000,],
                  'max_depth':[1,2,4,6,8,10,13]}
    param_svc = [
                  {'C':[0.1, 1, 10, 100], 'kernel': ['linear']},
                  {'C':[0.1, 1, 10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
                ]
    param_log = {'penalty':('l1','l2'), 
                  'C':[0.1, 1, 10, 100]}
    models  = gen_models_gbm(param_gbm)
    models += gen_models_rf(param_rf)
    models += gen_models_svm(param_svc[0])
    models += gen_models_svm(param_svc[1])
    models += gen_models_log(param_log)
    selection_ind = [45, 38, 52, 31, 24, 16, 17, 30, 14, 15, 37]
    models = [models[i] for i in selection_ind]
    return models


class model_collection:
    def __init__(self, models):
        self.models = models[:]
    def fit(self, X_train, y_train):
        self.classes = np.unique(y_train)
        for i in range(len(self.models)):
            self.models[i].fit(X_train,y_train)
    def predict_proba(self, X_test):
        self.y_proba = []
        for m in self.models:
            self.y_proba.append(m.predict_proba(X_test))
        return self.y_proba
    def predict(self, X_test):
        self.y_pred = []
        for m in self.models:
            self.y_pred.append(m.predict(X_test))
        return self.y_pred
    

class combined_model:
    def __init__(self):
        models = gen_models()
        self.model_col = model_collection(models)
    def fit(self, X_train, y_train):
        self.model_col.fit(X_train, y_train)
    def predict(self, X_test):
        y_pred  = self.model_col.predict(X_test)
        y_pred  = np.array(y_pred).T
        y_proba = self.model_col.predict_proba(X_test)
        y_proba_max = np.array([[row.max() for row in yy] for yy in y_proba]).T
        
        
        # The sum of probabilities
        y_proba_sum = np.zeros_like(y_proba[0])
        for yy in y_proba:
            y_proba_sum += yy
        y_pred_sum = [self.model_col.classes[row.argmax()] for row in y_proba_sum]
        
        self.prediction = y_pred_sum
#         print self.model_col.models[2]
        return self.prediction
        
        
def azureml_main(train_data = None, test_data = None):
    features = [x for x in train_data.columns if x not in ['geo','segment','subgroup' ]]
    test_data = test_data[features]
    
    X_train,y_train,X_test = preprocess_data(train_data, test_data)
    
    m = combined_model()
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    
#     test_data['Geo_Pred']      = y_pred/100
#     test_data['Segment_Pred']  = y_pred/10%10
#     test_data['Subgroup_Pred'] = y_pred%10
    
#     return test_data[['patientID','Geo_Pred','Segment_Pred','Subgroup_Pred']]

#     output = pd.DataFrame()
#     output['patientID']=test_data['patientID']
#     output['Geo_Pred']      = y_pred/100
#     output['Segment_Pred']  = y_pred/10%10
#     output['Subgroup_Pred'] = y_pred%10
#     return output
    return y_pred




    
# data = pd.read_csv('../datasets/WomenHealth_Training.csv')
# from sklearn.cross_validation import KFold
# scores = []
# for train_ind, valid_ind in KFold(data.shape[0], n_folds=4): 
#     train_data = data.loc[train_ind,:]
#     test_data  = data.loc[valid_ind,:]
#     y_test = 100*test_data['geo'] + 10*test_data['segment'] + test_data['subgroup']
#     y_pred = azureml_main(train_data, test_data)
#     scores.append(sklearn.metrics.accuracy_score(y_pred, y_test))
#     print scores[-1]

In [6]:
import random
data = pd.read_csv('../datasets/WomenHealth_Training.csv')
nrows = data.shape[0]
train_valid_ind = random.sample(range(nrows),int(nrows*0.9))
train_ind = random.sample(train_valid_ind,int(len(train_valid_ind)*0.5))
valid_ind = [i for i in train_valid_ind if i not in train_ind]
test_ind  = [i for i in range(nrows) if i not in train_valid_ind]

train_data = data.ix[train_ind,:]
valid_data  = data.ix[valid_ind,:]
test_data  = data.ix[test_ind,:]

X_train, y_train, X_test = preprocess_data(train_data, test_data)
X_valid, y_valid, X_test = preprocess_data(valid_data, test_data)

models_all = gen_models()
model_level_1 = sklearn.base.clone(models_all[2])
model_level_1.fit(X_train,y_train)
y_valid_pred  = model_level_1.predict(X_valid)
y_valid_pred = np.array(y_valid_pred).T
y_valid_proba = model_level_1.predict_proba(X_valid)
y_valid_proba_max = np.array([row.max() for row in y_valid_proba]).T

In [15]:
y_test = 100*test_data['geo'] + 10*test_data['segment'] + test_data['subgroup']

In [23]:
y_test_pred  = model_level_1.predict(X_test)
y_test_pred  = np.array(y_test_pred).T
y_test_proba = model_level_1.predict_proba(X_test)
y_test_proba_max = np.array([row.max() for row in y_test_proba]).T

In [25]:
print sklearn.metrics.accuracy_score(y_valid,y_valid_pred)
print sklearn.metrics.accuracy_score(y_test,y_test_pred)

0.858645351283
0.82797731569


In [11]:
for p in np.linspace(0,0.95,20):
    ind = y_valid_proba_max>p
    print "%.2f   %.4f   %4d "%(p, sklearn.metrics.accuracy_score(y_valid_pred[ind],y_valid[ind]), sum(ind))

0.00   0.8586   2377 
0.05   0.8586   2377 
0.10   0.8586   2377 
0.15   0.8586   2377 
0.20   0.8590   2376 
0.25   0.8596   2371 
0.30   0.8625   2357 
0.35   0.8711   2319 
0.40   0.8825   2246 
0.45   0.8957   2158 
0.50   0.9170   2036 
0.55   0.9362   1913 
0.60   0.9540   1804 
0.65   0.9641   1700 
0.70   0.9725   1564 
0.75   0.9805   1435 
0.80   0.9839   1304 
0.85   0.9842   1137 
0.90   0.9859    925 
0.95   0.9950    597 


In [41]:
p_cut = 0.7
l2_valid_ind = y_valid_proba_max<p_cut
X_train_2 = X_valid[l2_valid_ind,:]
y_train_2 = y_valid[l2_valid_ind]

l2_test_ind = y_test_proba_max<p_cut
X_test_2 = X_test[l2_test_ind,:]

for model2 in models_all:
    model2.fit(X_train_2,y_train_2)
    y_test_pred_tmp = y_test_pred.copy()
#     print sklearn.metrics.accuracy_score(y_test,y_test_pred)
    y_test_pred_tmp[l2_test_ind]=model2.predict(X_test_2)
    print '%.6f    %.6f    %.6f'%(sklearn.metrics.accuracy_score(y_test,y_test_pred),
                                  sklearn.metrics.accuracy_score(y_test[l2_test_ind],y_test_pred_tmp[l2_test_ind]),
                                  sklearn.metrics.accuracy_score(y_test,y_test_pred_tmp))

# for m in models_all:
#     m.fit(X_train_2,y_train_2)
#     y_test_pred_tmp = y_test_pred[:]
#     y_test_pred_tmp[l2_test_ind]=m.predict(X_test_2)
#     print '%.4f    %.4f'%(sklearn.metrics.accuracy_score(y_test,y_test_pred),
#                           sklearn.metrics.accuracy_score(y_test,y_test_pred_tmp))

0.824197    0.621212    0.831758
0.824197    0.621212    0.831758
0.824197    0.606061    0.826087
0.824197    0.636364    0.837429
0.824197    0.616162    0.829868
0.824197    0.565657    0.810964
0.824197    0.565657    0.810964
0.824197    0.611111    0.827977
0.824197    0.535354    0.799622
0.824197    0.535354    0.799622
0.824197    0.606061    0.826087


In [44]:
data = pd.read_csv('../datasets/WomenHealth_Training.csv')
train_ind, valid_ind, y_train_ind, y_valid_ind = sklearn.cross_validation.train_test_split(range(data.shape[0]),range(data.shape[0]),test_size=0.25, random_state=42)
train_valid_data = data.loc[train_ind,:]
test_data  = data.loc[valid_ind,:]
X_train_valid,y_train_valid,X_test = preprocess_data(train_valid_data, test_data)
y_test = 100*test_data['geo'] + 10*test_data['segment'] + test_data['subgroup']

In [70]:
from sklearn.cross_validation import KFold
for p_cut in [0.6,0.7,0.8, 0.9,0.95]:
    model_1 = sklearn.base.clone(models_all[2])
    model_2 = sklearn.base.clone(models_all[2])
    level_2_X = []
    level_2_y = []
    for train_ind, valid_ind in KFold(X_train_valid.shape[0], n_folds=4): 
        X_train  = X_train_valid[train_ind,:]
        X_valid  = X_train_valid[valid_ind,:]
        y_train  = y_train_valid[train_ind]
        y_valid  = y_train_valid[valid_ind]
        model_1.fit(X_train, y_train)
        y_valid_proba = model_1.predict_proba(X_valid)
        y_valid_proba_max = np.array([row.max() for row in y_valid_proba]).T
        level_2_ind = y_valid_proba_max<p_cut
        level_2_X.append(X_valid[level_2_ind,:])
        level_2_y.append(y_valid[level_2_ind])
    level_2_X = np.concatenate(level_2_X, axis=0)
    level_2_y = np.concatenate(level_2_y, axis=0)

    model_1.fit(X_train_valid, y_train_valid)
    model_2.fit(level_2_X,level_2_y)
    y_test_pred = model_1.predict(X_test)
    print 'model 1 accuracy', p_cut, sklearn.metrics.accuracy_score(y_test, y_test_pred)

    y_test_proba = model_1.predict_proba(X_test)
    y_test_proba_max = np.array([row.max() for row in y_test_proba]).T
    level_2_ind = y_test_proba_max<p_cut
    for m in models_all:
        m.fit(level_2_X,level_2_y)
        y_test_pred[level_2_ind]=m.predict(X_test[level_2_ind,:])
        print sklearn.metrics.accuracy_score(y_test, y_test_pred)


model 1 accuracy 0.6 0.859954579864
0.856169568509
0.853898561696
0.85692657078
0.856169568509
0.860711582135
0.850870552612
0.850870552612
0.862225586677
0.851627554883
0.851627554883
0.859954579864
model 1 accuracy 0.7 0.859954579864
0.853141559425
0.854655563967
0.850113550341
0.85692657078
0.860711582135
0.84935654807
0.84935654807
0.850113550341
0.850113550341
0.850113550341
0.855412566238
model 1 accuracy 0.8 0.859954579864
0.856169568509
0.859954579864
0.855412566238
0.859197577593
0.859954579864
0.846328538986
0.846328538986
0.857683573051
0.853141559425
0.853141559425
0.853898561696
model 1 accuracy 0.9 0.859954579864
0.858440575322
0.859197577593
0.854655563967
0.856169568509
0.862225586677
0.855412566238
0.855412566238
0.857683573051
0.85692657078
0.85692657078
0.855412566238
model 1 accuracy 0.95 0.859954579864
0.854655563967
0.854655563967
0.856169568509
0.859197577593
0.854655563967
0.855412566238
0.855412566238
0.852384557154
0.858440575322
0.858440575322
0.852384557154
