In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import linear_model
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
%matplotlib inline

class encode_categorical_feature:
    def __init__(self,feature):
        self.f_set = []
        self.feature = feature
    def fit(self,data):
        if self.feature in data.columns:
            self.f_set = sorted(list(data[self.feature].unique()))
            f_values   = data[self.feature].apply(lambda x: [self.f_set.index(x) if x in self.f_set else float('nan')])
            self.encoder = OneHotEncoder().fit(list(f_values))
    def transform(self,data):
        if not self.f_set:
            return data
        f_values  = data[self.feature].apply(lambda x: [self.f_set.index(x)])
        f_encoded = self.encoder.transform(list(f_values)).toarray()
        f_df      = pd.DataFrame(f_encoded, columns=[self.feature+'_'+str(x) for x in self.f_set])
        new_data  = pd.concat([data.reset_index(drop=True),f_df], axis=1)
        new_data.drop(self.feature, axis=1, inplace=True)
        return new_data

def preprocess_data(train_data, test_data):
    t = encode_categorical_feature('religion')
    t.fit(train_data)
    train_data = t.transform(train_data)
    test_data  = t.transform(test_data)
    
#     t = encode_categorical_feature('DISTRICT')
#     t.fit(train_data)
#     train_data = t.transform(train_data)
#     test_data  = t.transform(test_data)
    
#     t = encode_categorical_feature('tribe')
#     t.fit(train_data)
#     train_data = t.transform(train_data)
#     test_data  = t.transform(test_data)
    
#     t = encode_categorical_feature('REGION_PROVINCE')
#     t.fit(train_data)
#     train_data = t.transform(train_data)
#     test_data  = t.transform(test_data)
    
    features = [x for x in train_data.columns if x not in ['patientID', 'geo','segment','subgroup','combined_label', 'INTNR', 'religion']]
    target = 'combined_label'
    # train_data
    train_data['combined_label'] = 100*train_data['geo'] + 10*train_data['segment'] + train_data['subgroup']
    train_data = train_data.fillna(0)
    X_train,y_train = train_data[features],train_data[target]
    # test_data
    test_data = test_data.fillna(0)
    X_test = test_data[features]
    # select features
    selection1 = [u'DISTRICT', u'tribe', u'REGION_PROVINCE', u'babydoc', u'foodinsecurity', u'religion_Buddhist', u'india', u'hindu', u'religion_Hindu', u'religion_Russian/Easter', u'educ', u'Debut', u'literacy', u'christian', u'hivknow', u'ModCon', u'age', u'thrasher', u'usecondom', u'religion_Other Christia', u'religion_Muslim', u'muslim', u'lowlit', u'multpart', u'motorcycle', u'CHILDREN', u'LaborDeliv']
    selection2 = [u'christian', u'hindu', u'REGION_PROVINCE', u'DISTRICT', u'electricity', u'age', u'tribe', u'foodinsecurity', u'EVER_HAD_SEX', u'EVER_BEEN_PREGNANT', u'CHILDREN', u'india', u'married', u'multpart', u'educ', u'literacy', u'LaborDeliv', u'babydoc', u'Debut', u'ModCon', u'usecondom', u'hivknow', u'religion_Buddhist', u'religion_Hindu', u'religion_Russian/Easter']
    selection3 = [u'DISTRICT', u'tribe', u'REGION_PROVINCE', u'babydoc', u'india', u'educ', u'Debut', u'literacy', u'hivknow', u'ModCon', u'age', u'usecondom', u'multpart', u'CHILDREN', u'LaborDeliv', u'married']
    selected_features = selection1+selection2+selection3
    X_train = X_train[selected_features]
    X_test = X_test[selected_features]
    return X_train,y_train,X_test
    print X_train.columns
    return X_train.values,y_train.values,X_test.values


In [102]:
data = pd.read_csv('../datasets/WomenHealth_Training.csv')
train_ind, valid_ind, y_train_ind, y_valid_ind = sklearn.cross_validation.train_test_split(range(data.shape[0]),range(data.shape[0]),test_size=0.25, random_state=42)
train_data = data.loc[train_ind,:]
test_data  = data.loc[valid_ind,:]
X_train,y_train,X_test = preprocess_data(train_data, test_data)
y_test = 100*test_data['geo'] + 10*test_data['segment'] + test_data['subgroup']
print X_train.shape

(3962, 68)


In [None]:
selection1 = [u'DISTRICT', u'tribe', u'REGION_PROVINCE', u'babydoc', u'foodinsecurity', u'religion_Buddhist', u'india', u'hindu', u'religion_Hindu', u'religion_Russian/Easter', u'educ', u'Debut', u'literacy', u'christian', u'hivknow', u'ModCon', u'age', u'thrasher', u'usecondom', u'religion_Other Christia', u'religion_Muslim', u'muslim', u'lowlit', u'multpart', u'motorcycle', u'CHILDREN', u'LaborDeliv']
selection2 = [u'christian', u'hindu', u'REGION_PROVINCE', u'DISTRICT', u'electricity', u'age', u'tribe', u'foodinsecurity', u'EVER_HAD_SEX', u'EVER_BEEN_PREGNANT', u'CHILDREN', u'india', u'married', u'multpart', u'educ', u'literacy', u'LaborDeliv', u'babydoc', u'Debut', u'ModCon', u'usecondom', u'hivknow', u'religion_Buddhist', u'religion_Hindu', u'religion_Russian/Easter']
selection3 = [u'DISTRICT', u'tribe', u'REGION_PROVINCE', u'babydoc', u'india', u'educ', u'Debut', u'literacy', u'hivknow', u'ModCon', u'age', u'usecondom', u'multpart', u'CHILDREN', u'LaborDeliv', u'married']
selected_features = selection1+selection2+selection3
print data.columns

In [103]:
rf = RandomForestClassifier(n_estimators=1000, max_depth=13, random_state=0)
rf.fit(X_train, y_train)
y_pred_proba = rf.predict_proba(X_test)
y_pred_proba_max= np.apply_along_axis(max, 1, y_pred_proba)
y_pred=rf.predict(X_test)
print sklearn.metrics.accuracy_score(y_pred, y_test)

0.859954579864


In [108]:
import time
from sklearn import grid_search
start_time = time.time()
param_grid = {'n_estimators':[100,300],
              'learning_rate':[0.1,1.0],
              'max_depth':[1,2,3,4],
              'random_state':[0]}
gbm0 = GradientBoostingClassifier()
gbm_grid = grid_search.GridSearchCV(gbm0, param_grid, cv=4)
gbm_grid.fit(X_train,y_train)
print gbm_grid.best_score_
end_time = time.time()
print end_time-start_time

0.851842503786
3611.40774107


## 1. A collection of models

In [None]:
def gen_models_rf(parameters):
    models = []
    for n in parameters['n_estimators']:
        for m in parameters['max_depth']:
            models.append(RandomForestClassifier(n_estimators=n, max_depth=m, random_state=0))
    return models
def gen_models_gbm(parameters):
    models = []
    for n in parameters['n_estimators']:
        for m in parameters['max_depth']:
            for l in parameters['learning_rate']:
                models.append(GradientBoostingClassifier(n_estimators=n, max_depth=m, random_state=0))
    return models
def gen_models_log(parameters):
    models = []
    for p in parameters['penalty']:
        for c in parameters['C']:
            models.append(linear_model.LogisticRegression(penalty=p, C=c))
    return models

def gen_models_svm(parameters):
    models = []
    for c in parameters['C']:
        for k in parameters['kernel']:
            if 'gamma' in parameters:
                for g in parameters['gamma']:
                    models.append(svm.SVC(C=c, kernel=k, gamma=g, probability=True))
            else:
                models.append(svm.SVC(C=c, kernel=k, probability=True))

    return models

# models =[ 
#           RandomForestClassifier(   n_estimators=100,  max_depth=10, random_state=0),
#           RandomForestClassifier(   n_estimators=200,  max_depth=8 , random_state=0),
#           RandomForestClassifier(   n_estimators=400,  max_depth=10, random_state=0),
#           RandomForestClassifier(   n_estimators=1000, max_depth=13, random_state=0),
#           RandomForestClassifier(   n_estimators=1200, max_depth=6 , random_state=0),
#         GradientBoostingClassifier( n_estimators=100,  max_depth=1, random_state=0, learning_rate=0.1),
#         GradientBoostingClassifier( n_estimators=100,  max_depth=2, random_state=0, learning_rate=0.1),
#         GradientBoostingClassifier( n_estimators=100,  max_depth=3, random_state=0, learning_rate=0.1),
#         GradientBoostingClassifier( n_estimators=300,  max_depth=2, random_state=0, learning_rate=0.1),
#         GradientBoostingClassifier( n_estimators=1000, max_depth=2, random_state=0, learning_rate=0.1)
#         ]

param_gbm = { 'n_estimators':[50, 100,200],
              'learning_rate':[0.1,1.0],
              'max_depth':[1,2,3]}
param_rf  = {'n_estimators':[100,200,500,800,1000],
              'max_depth':[1,2,4,6,8,10,13]}
param_svc = [
              {'C':[0.1, 1, 10, 100], 'kernel': ['linear']},
              {'C':[0.1, 1, 10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
            ]
param_log = {'penalty':('l1','l2'), 
              'C':[0.1, 1, 10, 100]}
m_idx = [0]*5
models  = gen_models_gbm(param_gbm)
m_idx[1] = len(models)
models += gen_models_rf(param_rf)
m_idx[2] = len(models)
models += gen_models_svm(param_svc[0])
models += gen_models_svm(param_svc[1])
m_idx[3] = len(models)
models += gen_models_log(param_log)
m_idx[4] = len(models)
# for i in range(m_idx[4]):
#     print i,models[i]
print m_idx
m_idx_rf_1000_13 = 52

In [None]:
class model_collection:
    def __init__(self, models):
        self.models = models[:]
    def fit(self, X_train, y_train):
        self.classes = np.unique(y_train)
        for i in range(len(self.models)):
            self.models[i].fit(X_train,y_train)
    def predict_proba(self, X_test):
        self.y_proba = []
        for m in self.models:
            self.y_proba.append(m.predict_proba(X_test))
        return self.y_proba
    def predict(self, X_test):
        self.y_pred = []
        for m in self.models:
            self.y_pred.append(m.predict(X_test))
        return self.y_pred

In [None]:
col = model_collection(models)
col.fit(X_train,y_train)
y_pred  = col.predict(X_test)
y_proba = col.predict_proba(X_test)

y_pred = np.array(y_pred).T
y_proba_max = np.array([[row.max() for row in yy] for yy in y_proba]).T
classes = col.classes

## 1. Majority vote

In [None]:
def vote_common(row):
    d={}
    for v in row:
        d[v] = d.get(v,0)+1
    vote = sorted(d,key=d.get)
    return vote[0]
y_pred_vote = np.apply_along_axis(vote_common, 1, y_pred)
sklearn.metrics.accuracy_score(y_pred_vote, y_test)

## 2. Output the prediction that has max probability among models

In [None]:
plt.hist(np.apply_along_axis(max, 1,y_proba_max),30)

In [109]:
pred_choice = np.apply_along_axis(np.argmax, 1,y_proba_max)
pred_by_choice = np.array([y_pred[i,pred_choice[i]] for i in range(len(y_test))])
sklearn.metrics.accuracy_score(pred_by_choice,y_test)

0.846328538985617

## 3. Sum of prediction probabilities

In [None]:
# The sum of probabilities
y_proba_sum = np.zeros_like(y_proba[0])
for yy in y_proba:
    y_proba_sum += yy
y_pred_sum = [classes[row.argmax()] for row in y_proba_sum]
sklearn.metrics.accuracy_score(y_pred_sum, y_test)

In [None]:
# def a model class for the sum of probabilites
class sum_of_models:
    def __init__(self, models):
        self.models = models[:]
    def fit(self, X_train, y_train):
        self.classes = np.unique(y_train)
        for i in range(len(self.models)):
            self.models[i].fit(X_train,y_train)
    def predict(self, X_test):
        y_proba = np.zeros((X_test.shape[0],len(self.classes)))
        for m in self.models:
            y_proba += m.predict_proba(X_test)
        myfun = lambda r: self.classes[r.argmax()]
        return np.apply_along_axis(myfun, 1, y_proba)
    
ms = sum_of_models(models)
ms.fit(X_train,y_train)
y_pred = ms.predict(X_test)
sklearn.metrics.accuracy_score(y_pred,y_test)

## 4. Selection predictions from each model according probability

### 4.1. Explory the prediciton probalities

In [None]:
plt.hist(y_proba_max[:,m_idx_rf_1000_13],30)

In [None]:
sklearn.metrics.accuracy_score(y_pred[:,m_idx_rf_1000_13],y_test)

In [None]:
for p in np.linspace(0,1,21):
    ind = y_proba_max[:,m_idx_rf_1000_13]>p
    print "%.6f "*2%(p, sklearn.metrics.accuracy_score(y_pred[ind,m_idx_rf_1000_13],y_test[ind])), sum(ind)

In [None]:
score = []
for p in np.linspace(0,1,21):
    score.append([sklearn.metrics.accuracy_score(y_pred[y_proba_max[:,k]>p,k],y_test[y_proba_max[:,k]>p]) for k in range(len(models))])
    print "%.3f "*10 % tuple([p]+score[-1][:9])
score=np.array(score)

### 4.2. Select prediction according to model accuracy


In [None]:
# 1. predefine a accuracy threshold, score_cut
# 2. for each model, find probability p such that the prediciton with probability>p has accuracy > score_cut
# 3. update prediction if it has not been updated
print 'score_cut, over_all_prediction_accuracy, prediction_accuracy_among_predicted, number_of_prediction_updated'
for score_cut in np.linspace(0.85,0.99,16):
    y_pred_increament = np.zeros_like(y_test)
    p_scale = np.linspace(0,1,21)
    p_cut = [p_scale[np.argmax(score[:,k]>score_cut)] for k in range(len(models))]

    for k in range(len(models)):
        ind_confi_k = y_proba_max[:,k]>p_cut[k]
        for ind in [i for i in range(len(y_test)) if ind_confi_k[i]]:
            if y_pred_increament[ind]==0:
                y_pred_increament[ind] = y_pred[ind,k]

    ind_predicted = y_pred_increament!=0
    print '%.2f  %.3f  %.3f  %4d'%(score_cut,
                                   sklearn.metrics.accuracy_score(y_pred_increament,y_test), 
                                   sklearn.metrics.accuracy_score(y_pred_increament[ind_predicted],y_test[ind_predicted]), 
                                   sum(ind_predicted))
# for ind in range(len(y_test)):
#     if y_pred_increament[ind]==0:
#         y_pred_increament[ind] = y_pred[ind,1]
# print sklearn.metrics.accuracy_score(y_pred_increament,y_test)


In [None]:
# 1. given probability p and select predicitons with probability>p 
# 2. update prediction if it has not been updated
print 'p_cut, over_all_prediction_accuracy, prediction_accuracy_among_predicted, number_of_prediction_updated'
for p_cut in np.linspace(0.2,0.99,70):
    y_pred_increament = np.zeros_like(y_test)
    for k in range(9):
        ind_confi_k = y_proba_max[:,k]>p_cut
        for ind in [i for i in range(len(y_test)) if ind_confi_k[i]]:
            if y_pred_increament[ind]==0:
                y_pred_increament[ind] = y_pred[ind,k]
    
    ind_predicted = y_pred_increament!=0
    print '%.2f  %.3f  %.3f  %4d'%(p_cut,
                                   sklearn.metrics.accuracy_score(y_pred_increament,y_test), 
                                   sklearn.metrics.accuracy_score(y_pred_increament[ind_predicted],y_test[ind_predicted]), 
                                   sum(ind_predicted))

**Result:** The over all accuracy is not improved. The reason could be that the selection method does not guanratee the accuracy of the newly added predictions.

### 4.3. Selection predictions with contralled accuracy
This method selects predicitons according to prabability thretholds with contralled accuracy.

#### 4.3.1. Select good-accuracy predictions and choose a model to fill the rest
The first level selection is good. The second is not.

In [None]:
# 1. given probability p and select predicitons with probability>p 
# 2. update prediction if it has not been updated
print 'p_cut, over_all_prediction_accuracy, prediction_accuracy_among_predicted, number_of_prediction_updated'
# for p_cut in np.linspace(0.6,0.99,40):
for p_cut in [0.78]:
    y_pred_increament = np.zeros_like(y_test)
    for k in range(len(models)):
        ind_confi_k = y_proba_max[:,k]>p_cut
        for ind in [i for i in range(len(y_test)) if ind_confi_k[i]]:
            if y_pred_increament[ind]==0:
                y_pred_increament[ind] = y_pred[ind,k]
    
    ind_predicted = y_pred_increament!=0
    print '%.2f  %.4f  %.4f  %4d'%(p_cut,
                                   sklearn.metrics.accuracy_score(y_pred_increament,y_test), 
                                   sklearn.metrics.accuracy_score(y_pred_increament[ind_predicted],y_test[ind_predicted]), 
                                   sum(ind_predicted))

for k in range(len(models)):
    y_pred_tmp = np.array(y_pred_increament)[:]
    ind_remaining = [i for i in range(len(y_test)) if y_pred_increament[i]==0]
    for ind in range(len(y_test)):
        if y_pred_tmp[ind]==0:
            y_pred_tmp[ind] = y_pred[ind,k]
    print '%2d %.4f %.4f'%(k, sklearn.metrics.accuracy_score(y_pred_tmp,y_test),sklearn.metrics.accuracy_score(y_pred_tmp[ind_remaining],y_test.values[ind_remaining]))


In [None]:
# 1. given probability p and select predicitons with probability>p 
# 2. update prediction if it has not been updated
# print 'p_cut, over_all_prediction_accuracy, prediction_accuracy_among_predicted, number_of_prediction_updated'
for p_cut in np.linspace(0.6,1,41):
# for p_cut in [0.8]:
    y_pred_increament = np.zeros_like(y_test)
    for k in range(len(models)):
        ind_confi_k = y_proba_max[:,k]>p_cut
        for ind in [i for i in range(len(y_test)) if ind_confi_k[i]]:
            if y_pred_increament[ind]==0:
                y_pred_increament[ind] = y_pred[ind,k]
    
    ind_predicted = y_pred_increament!=0
    increament_score = sklearn.metrics.accuracy_score(y_pred_increament[ind_predicted],y_test[ind_predicted])
#     print '%.2f  %.3f  %.3f  %4d'%(p_cut,
#                                    sklearn.metrics.accuracy_score(y_pred_increament,y_test), 
#                                    sklearn.metrics.accuracy_score(y_pred_increament[ind_predicted],y_test[ind_predicted]), 
#                                    sum(ind_predicted))
    
    ind_remaining = [i for i in range(len(y_test)) if y_pred_increament[i]==0]
    ramain_scores = []
    best_scores = []
    for k in range(len(models)):
        y_pred_tmp = y_pred_increament[:]
#         for ind in range(len(y_test)):
#             if y_pred_tmp[ind]==0:
#                 y_pred_tmp[ind] = y_pred[ind,k]
        y_pred_tmp[ind_remaining] = y_pred[ind_remaining,k]
        ramain_scores.append(sklearn.metrics.accuracy_score(y_pred_tmp[ind_remaining],y_test.values[ind_remaining]))
        best_scores.append(sklearn.metrics.accuracy_score(y_pred_tmp,y_test))
#     best_m_ind = max(ramain_scores)
    
    print '%.2f   %4d    %.4f   %.4f   %.4f'%(p_cut, len(ind_remaining), increament_score, max(ramain_scores), max(best_scores))
    
#     print '%2d %.4f %.4f'%(k, sklearn.metrics.accuracy_score(y_pred_tmp,y_test),sklearn.metrics.accuracy_score(y_pred_tmp[ind_remaining],y_test.values[ind_remaining]))


#### 4.3.2. Multi-level contralled selection

## 5. Select models before voting

In [None]:
# models_all = models
# y_pred_all = y_pred[:,:]
# y_proba_all = y_proba[:]
# y_proba_max_all = y_proba_max[:,:]

In [None]:
m_scores = [sklearn.metrics.accuracy_score(y_test, y_pred_all[:,k]) for k in range(len(models_all))]

In [None]:
plt.hist(m_scores,30)

In [None]:
print sum([mm>0.85 for mm in m_scores])
plt.plot(np.linspace(0,1,100), [sum([mm>p for mm in m_scores]) for p in np.linspace(0,1,100)])

In [106]:
selected_models_ind = [i for i in range(len(models_all)) if m_scores[i]>0.821]
# selected_models_ind = sorted(selected_models_ind, key=lambda i: m_scores[i], reverse=True)
models      = [models_all[i] for i in selected_models_ind]
y_pred      = y_pred_all[:,selected_models_ind]
y_proba_max = y_proba_max_all[:,selected_models_ind]
y_proba     = [y_proba_all[i] for i in selected_models_ind]
print np.array(m_scores)[selected_models_ind]
print selected_models_ind
print len(models)
models

[ 0.82967449  0.82967449  0.82361847  0.82361847  0.82437547  0.82437547
  0.83118849  0.83118849  0.82891749  0.82891749  0.84708554  0.8342165
  0.83648751  0.8342165   0.82286147  0.82286147]
[4, 5, 8, 9, 10, 11, 14, 15, 16, 17, 53, 54, 63, 66, 70, 71]
16


[GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
               max_depth=3, max_features=None, max_leaf_nodes=None,
               min_samples_leaf=1, min_samples_split=2, n_estimators=50,
               random_state=0, subsample=1.0, verbose=0, warm_start=False),
 GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
               max_depth=3, max_features=None, max_leaf_nodes=None,
               min_samples_leaf=1, min_samples_split=2, n_estimators=50,
               random_state=0, subsample=1.0, verbose=0, warm_start=False),
 GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
               max_depth=2, max_features=None, max_leaf_nodes=None,
               min_samples_leaf=1, min_samples_split=2, n_estimators=100,
               random_state=0, subsample=1.0, verbose=0, warm_start=False),
 GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
               max_depth=2, max_features=None, m

In [None]:
# 1. given probability p and select predicitons with probability>p 
# 2. update prediction if it has not been updated
# print 'p_cut, over_all_prediction_accuracy, prediction_accuracy_among_predicted, number_of_prediction_updated'
for p_cut in np.linspace(0.6,1,41):
# for p_cut in [0.8]:
    y_pred_increament = np.zeros_like(y_test)
    for k in range(len(models)):
        ind_confi_k = y_proba_max[:,k]>p_cut
        for ind in [i for i in range(len(y_test)) if ind_confi_k[i]]:
            if y_pred_increament[ind]==0:
                y_pred_increament[ind] = y_pred[ind,k]
    
    ind_predicted = y_pred_increament!=0
    increament_score = sklearn.metrics.accuracy_score(y_pred_increament[ind_predicted],y_test[ind_predicted])
#     print '%.2f  %.3f  %.3f  %4d'%(p_cut,
#                                    sklearn.metrics.accuracy_score(y_pred_increament,y_test), 
#                                    sklearn.metrics.accuracy_score(y_pred_increament[ind_predicted],y_test[ind_predicted]), 
#                                    sum(ind_predicted))
    
    ind_remaining = [i for i in range(len(y_test)) if y_pred_increament[i]==0]
    ramain_scores = []
    best_scores = []
    for k in range(len(models)):
        y_pred_tmp = y_pred_increament[:]
#         for ind in range(len(y_test)):
#             if y_pred_tmp[ind]==0:
#                 y_pred_tmp[ind] = y_pred[ind,k]
        y_pred_tmp[ind_remaining] = y_pred[ind_remaining,k]
        ramain_scores.append(sklearn.metrics.accuracy_score(y_pred_tmp[ind_remaining],y_test.values[ind_remaining]))
        best_scores.append(sklearn.metrics.accuracy_score(y_pred_tmp,y_test))
#     best_m_ind = max(ramain_scores)
    
    print '%.2f   %4d    %.4f   %.4f   %.4f'%(p_cut, len(ind_remaining), increament_score, max(ramain_scores), max(best_scores)), selected_models_ind[np.argmax(ramain_scores)]
    
#     print '%2d %.4f %.4f'%(k, sklearn.metrics.accuracy_score(y_pred_tmp,y_test),sklearn.metrics.accuracy_score(y_pred_tmp[ind_remaining],y_test.values[ind_remaining]))


In [None]:
def vote_common(row):
    d={}
    for v in row:
        d[v] = d.get(v,0)+1
    vote = sorted(d,key=d.get)
    return vote[0]
y_pred_vote = np.apply_along_axis(vote_common, 1, y_pred)
sklearn.metrics.accuracy_score(y_pred_vote, y_test)

In [None]:
pred_choice = np.apply_along_axis(np.argmax, 1,y_proba_max)
pred_by_choice = np.array([y_pred[i,pred_choice[i]] for i in range(len(y_test))])
sklearn.metrics.accuracy_score(pred_by_choice,y_test)

In [None]:
# The sum of probabilities
y_proba_sum = np.zeros_like(y_proba[0])
for yy in y_proba:
    y_proba_sum += yy
y_pred_sum = [classes[row.argmax()] for row in y_proba_sum]
sklearn.metrics.accuracy_score(y_pred_sum, y_test)

## 6. Submit versions

In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import linear_model
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
%matplotlib inline

class encode_categorical_feature:
    def __init__(self,feature):
        self.f_set = []
        self.feature = feature
    def fit(self,data):
        if self.feature in data.columns:
            self.f_set = sorted(list(data[self.feature].unique()))
            f_values   = data[self.feature].apply(lambda x: [self.f_set.index(x)])
            self.encoder = OneHotEncoder().fit(list(f_values))
    def transform(self,data):
        if not self.f_set:
            return data
        f_values  = data[self.feature].apply(lambda x: [self.f_set.index(x)])
        f_encoded = self.encoder.transform(list(f_values)).toarray()
        f_df      = pd.DataFrame(f_encoded, columns=[self.feature+'_'+str(x) for x in self.f_set])
        new_data  = pd.concat([data.reset_index(drop=True),f_df], axis=1)
        new_data.drop(self.feature, axis=1, inplace=True)
        return new_data

def preprocess_data(train_data, test_data):
    t = encode_categorical_feature('religion')
    t.fit(train_data)
    train_data = t.transform(train_data)
    test_data  = t.transform(test_data)
    
    features = [x for x in train_data.columns if x not in ['patientID', 'geo','segment','subgroup','combined_label', 'INTNR' ]]
    target = 'combined_label'
    # train_data
    train_data['combined_label'] = 100*train_data['geo'] + 10*train_data['segment'] + train_data['subgroup']
    train_data = train_data.fillna(0)
    X_train,y_train = train_data[features],train_data[target]
    # test_data
    test_data = test_data.fillna(0)
    X_test = test_data[features]
    # select features
    selection1 = ['DISTRICT', u'tribe', u'REGION_PROVINCE', u'babydoc', u'foodinsecurity', u'religion_Buddhist', u'india', u'hindu', u'religion_Hindu', u'religion_Russian/Easter', u'educ', u'Debut', u'literacy', u'christian', u'hivknow', u'ModCon', u'age', u'thrasher', u'usecondom', u'religion_Other Christia', u'religion_Muslim', u'muslim', u'lowlit', u'multpart', u'motorcycle', u'CHILDREN', u'LaborDeliv']
    selection2 = [u'christian', u'hindu', u'REGION_PROVINCE', u'DISTRICT', u'electricity', u'age', u'tribe', u'foodinsecurity', u'EVER_HAD_SEX', u'EVER_BEEN_PREGNANT', u'CHILDREN', u'india', u'married', u'multpart', u'educ', u'literacy', u'LaborDeliv', u'babydoc', u'Debut', u'ModCon', u'usecondom', u'hivknow', u'religion_Buddhist', u'religion_Hindu', u'religion_Russian/Easter']
    selection3 = [u'DISTRICT', u'tribe', u'REGION_PROVINCE', u'babydoc', u'india', u'educ', u'Debut', u'literacy', u'hivknow', u'ModCon', u'age', u'usecondom', u'multpart', u'CHILDREN', u'LaborDeliv', u'married']
    selected_features = selection1+selection2+selection3
    X_train = X_train[selected_features]
    X_test = X_test[selected_features]
#     return X_train,y_train,X_test
    return X_train.values,y_train.values,X_test.values

def gen_models():
    def gen_models_rf(parameters):
        models = []
        for n in parameters['n_estimators']:
            for m in parameters['max_depth']:
                models.append(RandomForestClassifier(n_estimators=n, max_depth=m, random_state=0))
        return models
    def gen_models_gbm(parameters):
        models = []
        for n in parameters['n_estimators']:
            for m in parameters['max_depth']:
                for l in parameters['learning_rate']:
                    models.append(GradientBoostingClassifier(n_estimators=n, max_depth=m, random_state=0))
        return models
    def gen_models_log(parameters):
        models = []
        for p in parameters['penalty']:
            for c in parameters['C']:
                models.append(linear_model.LogisticRegression(penalty=p, C=c))
        return models

    def gen_models_svm(parameters):
        models = []
        for c in parameters['C']:
            for k in parameters['kernel']:
                if 'gamma' in parameters:
                    for g in parameters['gamma']:
                        models.append(svm.SVC(C=c, kernel=k, gamma=g, probability=True))
                else:
                    models.append(svm.SVC(C=c, kernel=k, probability=True))

        return models

    param_gbm = { 'n_estimators':[50, 100,200],
                  'learning_rate':[0.1,1.0],
                  'max_depth':[1,2,3]}
    param_rf  = {'n_estimators':[100,200,500,800,1000,],
                  'max_depth':[1,2,4,6,8,10,13]}
    param_svc = [
                  {'C':[0.1, 1, 10, 100], 'kernel': ['linear']},
                  {'C':[0.1, 1, 10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
                ]
    param_log = {'penalty':('l1','l2'), 
                  'C':[0.1, 1, 10, 100]}
    models  = gen_models_gbm(param_gbm)
    models += gen_models_rf(param_rf)
    models += gen_models_svm(param_svc[0])
    models += gen_models_svm(param_svc[1])
    models += gen_models_log(param_log)
    selection_ind = [45, 38, 52, 31, 24, 16, 17, 30, 14, 15, 37]
    models = [models[i] for i in selection_ind]
    return models


class model_collection:
    def __init__(self, models):
        self.models = models[:]
    def fit(self, X_train, y_train):
        self.classes = np.unique(y_train)
        for i in range(len(self.models)):
            self.models[i].fit(X_train,y_train)
    def predict_proba(self, X_test):
        self.y_proba = []
        for m in self.models:
            self.y_proba.append(m.predict_proba(X_test))
        return self.y_proba
    def predict(self, X_test):
        self.y_pred = []
        for m in self.models:
            self.y_pred.append(m.predict(X_test))
        return self.y_pred
    

class combined_model:
    def __init__(self):
        models = gen_models()
        self.model_col = model_collection(models)
    def fit(self, X_train, y_train):
        self.model_col.fit(X_train, y_train)
    def predict(self, X_test):
        y_pred  = self.model_col.predict(X_test)
        y_pred  = np.array(y_pred).T
        y_proba = self.model_col.predict_proba(X_test)
        y_proba_max = np.array([[row.max() for row in yy] for yy in y_proba]).T
        
        # predictions with better confidency
        p_cut = 0.8
        y_pred_increament = np.zeros(X_test.shape[0])
        for k in range(len(self.model_col.models)):
            for ind in range(X_test.shape[0]):
                if y_pred_increament[ind]==0 and y_proba_max[ind,k]>p_cut:
                    y_pred_increament[ind] = y_pred[ind,k]

        # remaining preditions
        ind_remaining = [i for i in range(len(y_test)) if y_pred_increament[i]==0]
        best_m_for_remaining = 2
        y_pred_increament[ind_remaining] = y_pred[ind_remaining,best_m_for_remaining]
        self.prediction = y_pred_increament
#         print self.model_col.models[2]
        return self.prediction
        
        
def azureml_main(train_data = None, test_data = None):
    features = [x for x in train_data.columns if x not in ['geo','segment','subgroup' ]]
    test_data = test_data[features]
    
    X_train,y_train,X_test = preprocess_data(train_data, test_data)
    
    m = combined_model()
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    
#     test_data['Geo_Pred']      = y_pred/100
#     test_data['Segment_Pred']  = y_pred/10%10
#     test_data['Subgroup_Pred'] = y_pred%10
    
#     return test_data[['patientID','Geo_Pred','Segment_Pred','Subgroup_Pred']]

#     output = pd.DataFrame()
#     output['patientID']=test_data['patientID']
#     output['Geo_Pred']      = y_pred/100
#     output['Segment_Pred']  = y_pred/10%10
#     output['Subgroup_Pred'] = y_pred%10
#     return output
    return y_pred




    
data = pd.read_csv('../datasets/WomenHealth_Training.csv')
from sklearn.cross_validation import KFold
scores = []
for train_ind, valid_ind in KFold(data.shape[0], n_folds=4): 
    train_data = data.loc[train_ind,:]
    test_data  = data.loc[valid_ind,:]
    y_test = 100*test_data['geo'] + 10*test_data['segment'] + test_data['subgroup']
    y_pred = azureml_main(train_data, test_data)
    scores.append(sklearn.metrics.accuracy_score(y_pred, y_test))
    print scores[-1]

In [None]:
# use models[45] and p_cut=0.74
# 0.855412566238
# 0.869795609387
# 0.813777441332
# 0.85

### 6.2. sum of models

In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import linear_model
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
%matplotlib inline

class encode_categorical_feature:
    def __init__(self,feature):
        self.f_set = []
        self.feature = feature
    def fit(self,data):
        if self.feature in data.columns:
            self.f_set = sorted(list(data[self.feature].unique()))
            f_values   = data[self.feature].apply(lambda x: [self.f_set.index(x)])
            self.encoder = OneHotEncoder().fit(list(f_values))
    def transform(self,data):
        if not self.f_set:
            return data
        f_values  = data[self.feature].apply(lambda x: [self.f_set.index(x)])
        f_encoded = self.encoder.transform(list(f_values)).toarray()
        f_df      = pd.DataFrame(f_encoded, columns=[self.feature+'_'+str(x) for x in self.f_set])
        new_data  = pd.concat([data.reset_index(drop=True),f_df], axis=1)
        new_data.drop(self.feature, axis=1, inplace=True)
        return new_data

def preprocess_data(train_data, test_data):
    t = encode_categorical_feature('religion')
    t.fit(train_data)
    train_data = t.transform(train_data)
    test_data  = t.transform(test_data)
    
    features = [x for x in train_data.columns if x not in ['patientID', 'geo','segment','subgroup','combined_label', 'INTNR' ]]
    target = 'combined_label'
    # train_data
    train_data['combined_label'] = 100*train_data['geo'] + 10*train_data['segment'] + train_data['subgroup']
    train_data = train_data.fillna(0)
    X_train,y_train = train_data[features],train_data[target]
    # test_data
    test_data = test_data.fillna(0)
    X_test = test_data[features]
    # select features
    selection1 = ['DISTRICT', u'tribe', u'REGION_PROVINCE', u'babydoc', u'foodinsecurity', u'religion_Buddhist', u'india', u'hindu', u'religion_Hindu', u'religion_Russian/Easter', u'educ', u'Debut', u'literacy', u'christian', u'hivknow', u'ModCon', u'age', u'thrasher', u'usecondom', u'religion_Other Christia', u'religion_Muslim', u'muslim', u'lowlit', u'multpart', u'motorcycle', u'CHILDREN', u'LaborDeliv']
    selection2 = [u'christian', u'hindu', u'REGION_PROVINCE', u'DISTRICT', u'electricity', u'age', u'tribe', u'foodinsecurity', u'EVER_HAD_SEX', u'EVER_BEEN_PREGNANT', u'CHILDREN', u'india', u'married', u'multpart', u'educ', u'literacy', u'LaborDeliv', u'babydoc', u'Debut', u'ModCon', u'usecondom', u'hivknow', u'religion_Buddhist', u'religion_Hindu', u'religion_Russian/Easter']
    selection3 = [u'DISTRICT', u'tribe', u'REGION_PROVINCE', u'babydoc', u'india', u'educ', u'Debut', u'literacy', u'hivknow', u'ModCon', u'age', u'usecondom', u'multpart', u'CHILDREN', u'LaborDeliv', u'married']
    selected_features = selection1+selection2+selection3
    X_train = X_train[selected_features]
    X_test = X_test[selected_features]
#     return X_train,y_train,X_test
    return X_train.values,y_train.values,X_test.values

def gen_models():
    def gen_models_rf(parameters):
        models = []
        for n in parameters['n_estimators']:
            for m in parameters['max_depth']:
                models.append(RandomForestClassifier(n_estimators=n, max_depth=m, random_state=0))
        return models
    def gen_models_gbm(parameters):
        models = []
        for n in parameters['n_estimators']:
            for m in parameters['max_depth']:
                for l in parameters['learning_rate']:
                    models.append(GradientBoostingClassifier(n_estimators=n, max_depth=m, random_state=0))
        return models
    def gen_models_log(parameters):
        models = []
        for p in parameters['penalty']:
            for c in parameters['C']:
                models.append(linear_model.LogisticRegression(penalty=p, C=c))
        return models

    def gen_models_svm(parameters):
        models = []
        for c in parameters['C']:
            for k in parameters['kernel']:
                if 'gamma' in parameters:
                    for g in parameters['gamma']:
                        models.append(svm.SVC(C=c, kernel=k, gamma=g, probability=True))
                else:
                    models.append(svm.SVC(C=c, kernel=k, probability=True))

        return models

    param_gbm = { 'n_estimators':[50, 100,200],
                  'learning_rate':[0.1,1.0],
                  'max_depth':[1,2,3]}
    param_rf  = {'n_estimators':[100,200,500,800,1000,],
                  'max_depth':[1,2,4,6,8,10,13]}
    param_svc = [
                  {'C':[0.1, 1, 10, 100], 'kernel': ['linear']},
                  {'C':[0.1, 1, 10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
                ]
    param_log = {'penalty':('l1','l2'), 
                  'C':[0.1, 1, 10, 100]}
    models  = gen_models_gbm(param_gbm)
    models += gen_models_rf(param_rf)
    models += gen_models_svm(param_svc[0])
    models += gen_models_svm(param_svc[1])
    models += gen_models_log(param_log)
    selection_ind = [45, 38, 52, 31, 24, 16, 17, 30, 14, 15, 37]
    models = [models[i] for i in selection_ind]
    return models


class model_collection:
    def __init__(self, models):
        self.models = models[:]
    def fit(self, X_train, y_train):
        self.classes = np.unique(y_train)
        for i in range(len(self.models)):
            self.models[i].fit(X_train,y_train)
    def predict_proba(self, X_test):
        self.y_proba = []
        for m in self.models:
            self.y_proba.append(m.predict_proba(X_test))
        return self.y_proba
    def predict(self, X_test):
        self.y_pred = []
        for m in self.models:
            self.y_pred.append(m.predict(X_test))
        return self.y_pred
    

class combined_model:
    def __init__(self):
        models = gen_models()
        self.model_col = model_collection(models)
    def fit(self, X_train, y_train):
        self.model_col.fit(X_train, y_train)
    def predict(self, X_test):
        y_pred  = self.model_col.predict(X_test)
        y_pred  = np.array(y_pred).T
        y_proba = self.model_col.predict_proba(X_test)
        y_proba_max = np.array([[row.max() for row in yy] for yy in y_proba]).T
        
        
        # The sum of probabilities
        y_proba_sum = np.zeros_like(y_proba[0])
        for yy in y_proba:
            y_proba_sum += yy
        y_pred_sum = [self.model_col.classes[row.argmax()] for row in y_proba_sum]
        
        self.prediction = y_pred_sum
#         print self.model_col.models[2]
        return self.prediction
        
        
def azureml_main(train_data = None, test_data = None):
    features = [x for x in train_data.columns if x not in ['geo','segment','subgroup' ]]
    test_data = test_data[features]
    
    X_train,y_train,X_test = preprocess_data(train_data, test_data)
    
    m = combined_model()
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    
#     test_data['Geo_Pred']      = y_pred/100
#     test_data['Segment_Pred']  = y_pred/10%10
#     test_data['Subgroup_Pred'] = y_pred%10
    
#     return test_data[['patientID','Geo_Pred','Segment_Pred','Subgroup_Pred']]

#     output = pd.DataFrame()
#     output['patientID']=test_data['patientID']
#     output['Geo_Pred']      = y_pred/100
#     output['Segment_Pred']  = y_pred/10%10
#     output['Subgroup_Pred'] = y_pred%10
#     return output
    return y_pred




    
data = pd.read_csv('../datasets/WomenHealth_Training.csv')
from sklearn.cross_validation import KFold
scores = []
for train_ind, valid_ind in KFold(data.shape[0], n_folds=4): 
    train_data = data.loc[train_ind,:]
    test_data  = data.loc[valid_ind,:]
    y_test = 100*test_data['geo'] + 10*test_data['segment'] + test_data['subgroup']
    y_pred = azureml_main(train_data, test_data)
    scores.append(sklearn.metrics.accuracy_score(y_pred, y_test))
    print scores[-1]

## 7. save model and upload to Azure

In [None]:
class encode_categorical_feature:
    def __init__(self,feature):
        self.f_set = []
        self.feature = feature
    def fit(self,data):
        if self.feature in data.columns:
            self.f_set = sorted(list(data[self.feature].unique()))
            f_values   = data[self.feature].apply(lambda x: [self.f_set.index(x)])
            self.encoder = OneHotEncoder().fit(list(f_values))
    def transform(self,data):
        if not self.f_set:
            return data
        f_values  = data[self.feature].apply(lambda x: [self.f_set.index(x)])
        f_encoded = self.encoder.transform(list(f_values)).toarray()
        f_df      = pd.DataFrame(f_encoded, columns=[self.feature+'_'+str(x) for x in self.f_set])
        new_data  = pd.concat([data.reset_index(drop=True),f_df], axis=1)
        new_data.drop(self.feature, axis=1, inplace=True)
        return new_data


def preprocess_data_X(data):
    features = [x for x in data.columns if x not in 
                    ['patientID', 'geo','segment','subgroup','combined_label', 'INTNR' ]]
    X = data[features]
    # select features
    selection1 = [  'DISTRICT', u'tribe', u'REGION_PROVINCE', u'babydoc', u'foodinsecurity', 
                    u'religion_Buddhist', u'india', u'hindu', u'religion_Hindu', 
                    u'religion_Russian/Easter', u'educ', u'Debut', u'literacy', u'christian', 
                    u'hivknow', u'ModCon', u'age', u'thrasher', u'usecondom', 
                    u'religion_Other Christia', u'religion_Muslim', u'muslim', u'lowlit', 
                    u'multpart', u'motorcycle', u'CHILDREN', u'LaborDeliv']
    selection2 = [  u'christian', u'hindu', u'REGION_PROVINCE', u'DISTRICT', u'electricity', 
                    u'age', u'tribe', u'foodinsecurity', u'EVER_HAD_SEX', u'EVER_BEEN_PREGNANT', 
                    u'CHILDREN', u'india', u'married', u'multpart', u'educ', u'literacy', 
                    u'LaborDeliv', u'babydoc', u'Debut', u'ModCon', u'usecondom', u'hivknow', 
                    u'religion_Buddhist', u'religion_Hindu', u'religion_Russian/Easter']
    selection3 = [  u'DISTRICT', u'tribe', u'REGION_PROVINCE', u'babydoc', u'india', u'educ', 
                    u'Debut', u'literacy', u'hivknow', u'ModCon', u'age', u'usecondom', 
                    u'multpart', u'CHILDREN', u'LaborDeliv', u'married']
    selected_features = selection1+selection2+selection3
    X = X[selected_features]
    return X.values

    

def preprocess_data_y(data):
    target = 'combined_label'
    data['combined_label'] = 100*data['geo'] + 10*data['segment'] + data['subgroup']
    return data['combined_label'].values

In [None]:
import os
import pickle
data = pd.read_csv('../datasets/WomenHealth_Training.csv')
train_data, test_data = data, data

t = encode_categorical_feature('religion')
t.fit(train_data)
train_data = t.transform(train_data)
test_data = t.transform(test_data)
X_train = preprocess_data_X(train_data)
y_train = preprocess_data_X(train_data)
X_test  = preprocess_data_X(test_data)
y_test  = preprocess_data_X(test_data)

m = RandomForestClassifier(n_estimators=1000, max_depth=13, random_state=0)
m.fit(X_train,y_train)
print sklearn.metrics.accuracy_score(y_test, m.predict(X_test))
print os.getcwd()
pickle.dump(m, open('rf_model.pkl', 'wb'))