In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import linear_model
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
%matplotlib inline

class encode_categorical_feature:
    """One-hot encode a single categorical column of a DataFrame.

    ``fit`` records the sorted distinct values of ``feature``; ``transform``
    replaces that column with one 0.0/1.0 column per recorded value, named
    ``<feature>_<value>``.  If the column is absent at fit time the encoder
    stays empty and ``transform`` returns its input untouched.
    """

    def __init__(self, feature):
        self.f_set = []          # sorted category values learned by fit()
        self.feature = feature   # name of the column to encode

    def fit(self, data):
        """Learn the category values of ``self.feature`` from ``data``.

        Does nothing when ``data`` has no such column.
        """
        if self.feature in data.columns:
            self.f_set = sorted(list(data[self.feature].unique()))

    def transform(self, data):
        """Return ``data`` with the feature column replaced by indicator columns.

        The returned frame has a fresh 0..n-1 index (the input index is
        dropped so the indicator columns line up row by row).  A value that
        was not seen during ``fit`` yields an all-zero indicator row instead
        of raising.  When the encoder is empty, ``data`` itself is returned.
        """
        if not self.f_set:
            return data
        # One dict lookup per row instead of a linear list.index() scan.
        position = dict((value, i) for i, value in enumerate(self.f_set))
        codes = np.array([position.get(value, -1) for value in data[self.feature]],
                         dtype=int)
        f_encoded = np.zeros((len(codes), len(self.f_set)))
        known = codes >= 0   # rows whose value was seen at fit time
        f_encoded[np.nonzero(known)[0], codes[known]] = 1.0
        f_df = pd.DataFrame(
            f_encoded,
            columns=[self.feature + '_' + str(x) for x in self.f_set])
        new_data = pd.concat([data.reset_index(drop=True), f_df], axis=1)
        return new_data.drop(self.feature, axis=1)

def preprocess_data(train_data, test_data):
    """Turn the raw train/test frames into model-ready numpy arrays.

    The 'religion' column is one-hot encoded (encoder fitted on the training
    frame), the target ``combined_label`` is built as
    ``100*geo + 10*segment + subgroup``, missing values are replaced by 0 and
    the columns named in the three selection lists are extracted.  The lists
    are concatenated as they are, so a name that appears in several lists
    produces several identical columns in the output.

    Returns
    -------
    (X_train, y_train, X_test) as numpy arrays.
    """
    religion_encoder = encode_categorical_feature('religion')
    religion_encoder.fit(train_data)
    train_data = religion_encoder.transform(train_data)
    test_data = religion_encoder.transform(test_data)

    # Identifier and label columns are never used as predictors.
    non_predictors = ['patientID', 'geo', 'segment', 'subgroup', 'combined_label', 'INTNR']
    features = [col for col in train_data.columns if col not in non_predictors]
    target = 'combined_label'

    # Training side: build the label first, then fill the gaps.
    train_data[target] = (100 * train_data['geo']
                          + 10 * train_data['segment']
                          + train_data['subgroup'])
    train_data = train_data.fillna(0)
    X_train = train_data[features]
    y_train = train_data[target]

    # Test side: same predictor columns as the training frame.
    test_data = test_data.fillna(0)
    X_test = test_data[features]

    # Three feature selections, stacked one after another.
    selection1 = [
        'DISTRICT', u'tribe', u'REGION_PROVINCE', u'babydoc', u'foodinsecurity',
        u'religion_Buddhist', u'india', u'hindu', u'religion_Hindu',
        u'religion_Russian/Easter', u'educ', u'Debut', u'literacy', u'christian',
        u'hivknow', u'ModCon', u'age', u'thrasher', u'usecondom',
        u'religion_Other Christia', u'religion_Muslim', u'muslim', u'lowlit',
        u'multpart', u'motorcycle', u'CHILDREN', u'LaborDeliv',
    ]
    selection2 = [
        u'christian', u'hindu', u'REGION_PROVINCE', u'DISTRICT', u'electricity',
        u'age', u'tribe', u'foodinsecurity', u'EVER_HAD_SEX', u'EVER_BEEN_PREGNANT',
        u'CHILDREN', u'india', u'married', u'multpart', u'educ', u'literacy',
        u'LaborDeliv', u'babydoc', u'Debut', u'ModCon', u'usecondom', u'hivknow',
        u'religion_Buddhist', u'religion_Hindu', u'religion_Russian/Easter',
    ]
    selection3 = [
        u'DISTRICT', u'tribe', u'REGION_PROVINCE', u'babydoc', u'india', u'educ',
        u'Debut', u'literacy', u'hivknow', u'ModCon', u'age', u'usecondom',
        u'multpart', u'CHILDREN', u'LaborDeliv', u'married',
    ]
    selected_features = selection1 + selection2 + selection3

    return (X_train[selected_features].values,
            y_train.values,
            X_test[selected_features].values)

def gen_models(selection_ind=(45,)):
    """Build the grid of candidate classifiers and return the selected ones.

    The full candidate list is laid out in this fixed order:

    * 0-17   GradientBoostingClassifier (n_estimators x max_depth x learning_rate)
    * 18-52  RandomForestClassifier     (n_estimators x max_depth)
    * 53-56  SVC, linear kernel         (C)
    * 57-64  SVC, RBF kernel            (C x gamma)
    * 65-72  LogisticRegression         (penalty x C)

    Parameters
    ----------
    selection_ind : sequence of int, optional
        Positions in the candidate list to return, in that order.  The
        default ``(45,)`` picks the random forest with ``n_estimators=800``
        and ``max_depth=13``.

    Returns
    -------
    list of unfitted estimators.
    """
    def gen_models_rf(parameters):
        # Random forests: one per (n_estimators, max_depth) pair.
        return [RandomForestClassifier(n_estimators=n, max_depth=m, random_state=0)
                for n in parameters['n_estimators']
                for m in parameters['max_depth']]

    def gen_models_gbm(parameters):
        # Gradient boosting: the learning rate is part of the grid and must
        # be passed on, otherwise every model is built twice with the default.
        return [GradientBoostingClassifier(n_estimators=n, max_depth=m,
                                           learning_rate=rate, random_state=0)
                for n in parameters['n_estimators']
                for m in parameters['max_depth']
                for rate in parameters['learning_rate']]

    def gen_models_log(parameters):
        # Logistic regressions: one per (penalty, C) pair.
        return [linear_model.LogisticRegression(penalty=p, C=c)
                for p in parameters['penalty']
                for c in parameters['C']]

    def gen_models_svm(parameters):
        # SVCs: gamma is only part of the grid when the parameter set has it.
        models = []
        for c in parameters['C']:
            for k in parameters['kernel']:
                if 'gamma' in parameters:
                    models.extend(svm.SVC(C=c, kernel=k, gamma=g, probability=True)
                                  for g in parameters['gamma'])
                else:
                    models.append(svm.SVC(C=c, kernel=k, probability=True))
        return models

    param_gbm = {'n_estimators': [50, 100, 200],
                 'learning_rate': [0.1, 1.0],
                 'max_depth': [1, 2, 3]}
    param_rf = {'n_estimators': [100, 200, 500, 800, 1000],
                'max_depth': [1, 2, 4, 6, 8, 10, 13]}
    param_svc = [
        {'C': [0.1, 1, 10, 100], 'kernel': ['linear']},
        {'C': [0.1, 1, 10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
    ]
    param_log = {'penalty': ('l1', 'l2'),
                 'C': [0.1, 1, 10, 100]}

    models = gen_models_gbm(param_gbm)
    models += gen_models_rf(param_rf)
    models += gen_models_svm(param_svc[0])
    models += gen_models_svm(param_svc[1])
    models += gen_models_log(param_log)

    # Other indices tried earlier: 38, 52, 31, 24, 16, 17, 30, 14, 15, 37
    return [models[i] for i in selection_ind]


class model_collection:
    """Thin wrapper that fits and queries a group of estimators together."""

    def __init__(self, models):
        # Shallow copy: the container is private, the estimators are shared.
        self.models = models[:]

    def fit(self, X_train, y_train):
        """Fit every estimator on the same data and remember the label set."""
        self.classes = np.unique(y_train)
        for estimator in self.models:
            estimator.fit(X_train, y_train)

    def predict_proba(self, X_test):
        """Return one ``predict_proba`` result per estimator, in model order."""
        return [estimator.predict_proba(X_test) for estimator in self.models]

    def predict(self, X_test):
        """Return one ``predict`` result per estimator, in model order."""
        return [estimator.predict(X_test) for estimator in self.models]
    

class combined_model:
    """Cascade of classifiers: the first confident model decides each sample.

    ``predict`` walks through the models in order; a sample takes the
    prediction of the first model whose top class probability exceeds
    ``CONFIDENCE_CUT``.  Samples that no model is confident about fall back
    to the prediction of the first model.
    """

    # Minimum top-class probability for a prediction to count as confident.
    # Kept on the class so instances created without __init__ also see it.
    CONFIDENCE_CUT = 0.8

    def __init__(self):
        self.models = gen_models()
        self.num_models = len(self.models)

    def fit(self, X_train, y_train):
        """Fit every model on the same training data."""
        self.classes = np.unique(y_train)
        for model in self.models:
            model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return one label per row of ``X_test`` as a float array."""
        n_samples = X_test.shape[0]
        # Both matrices are (n_samples, n_models).
        y_pred = np.array([m.predict(X_test) for m in self.models]).T
        y_proba_max = np.array([np.asarray(m.predict_proba(X_test)).max(axis=1)
                                for m in self.models]).T

        y_pred_increment = np.zeros(n_samples)
        # Explicit mask of decided samples; comparing the output with 0 would
        # treat a confidently predicted label 0 as "still undecided".
        decided = np.zeros(n_samples, dtype=bool)
        for k in range(y_pred.shape[1]):
            take = ~decided & (y_proba_max[:, k] > self.CONFIDENCE_CUT)
            y_pred_increment[take] = y_pred[take, k]
            decided |= take

        # Remaining samples: use the first model regardless of confidence.
        best_m_for_remaining = 0
        y_pred_increment[~decided] = y_pred[~decided, best_m_for_remaining]
        return y_pred_increment
        
        
def azureml_main(train_data=None, test_data=None, model_path=None):
    """Train (or load) the combined model and predict labels for ``test_data``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows, including the 'geo', 'segment' and 'subgroup' columns.
    test_data : pandas.DataFrame
        Rows to predict; the three label columns are ignored if present.
    model_path : str, optional
        Path of a pickled, already fitted model to use instead of training
        (for example "./Script Bundle/model_7.2_incremental_selection.pkl").
        When omitted, a new ``combined_model`` is fitted on ``train_data``.

    Returns
    -------
    Array of predicted combined labels (``100*geo + 10*segment + subgroup``),
    one per test row.
    """
    label_columns = ['geo', 'segment', 'subgroup']
    features = [x for x in train_data.columns if x not in label_columns]
    test_data = test_data[features]

    X_train, y_train, X_test = preprocess_data(train_data, test_data)

    if model_path is not None:
        import pickle
        # NOTE(security): unpickling runs arbitrary code from the file, so
        # only load model files that come from a trusted source.
        with open(model_path, 'rb') as model_file:
            m = pickle.load(model_file)
    else:
        m = combined_model()
        m.fit(X_train, y_train)

    return m.predict(X_test)


In [15]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import linear_model
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder
import pickle
# import matplotlib.pyplot as plt
# %matplotlib inline

class encode_categorical_feature:
    """One-hot encode a single categorical column of a DataFrame.

    ``fit`` records the sorted distinct values of ``feature``; ``transform``
    replaces that column with one 0.0/1.0 column per recorded value, named
    ``<feature>_<value>``.  If the column is absent at fit time the encoder
    stays empty and ``transform`` returns its input untouched.
    """

    def __init__(self, feature):
        self.f_set = []          # sorted category values learned by fit()
        self.feature = feature   # name of the column to encode

    def fit(self, data):
        """Learn the category values of ``self.feature`` from ``data``.

        Does nothing when ``data`` has no such column.
        """
        if self.feature in data.columns:
            self.f_set = sorted(list(data[self.feature].unique()))

    def transform(self, data):
        """Return ``data`` with the feature column replaced by indicator columns.

        The returned frame has a fresh 0..n-1 index (the input index is
        dropped so the indicator columns line up row by row).  A value that
        was not seen during ``fit`` yields an all-zero indicator row instead
        of raising.  When the encoder is empty, ``data`` itself is returned.
        """
        if not self.f_set:
            return data
        # One dict lookup per row instead of a linear list.index() scan.
        position = dict((value, i) for i, value in enumerate(self.f_set))
        codes = np.array([position.get(value, -1) for value in data[self.feature]],
                         dtype=int)
        f_encoded = np.zeros((len(codes), len(self.f_set)))
        known = codes >= 0   # rows whose value was seen at fit time
        f_encoded[np.nonzero(known)[0], codes[known]] = 1.0
        f_df = pd.DataFrame(
            f_encoded,
            columns=[self.feature + '_' + str(x) for x in self.f_set])
        new_data = pd.concat([data.reset_index(drop=True), f_df], axis=1)
        return new_data.drop(self.feature, axis=1)


def preprocess_data_X(data):
    """Extract the selected predictor columns of ``data`` as a numpy array.

    Missing values are replaced by 0 directly in ``data`` (the caller's frame
    is modified).  The three selection lists are concatenated as they are, so
    a name that occurs in several lists yields several identical columns.
    """
    data.fillna(0, inplace=True)

    # Identifier and label columns are never used as predictors.
    non_predictors = ['patientID', 'geo', 'segment', 'subgroup', 'combined_label', 'INTNR']
    features = [col for col in data.columns if col not in non_predictors]

    selection1 = [
        'DISTRICT', u'tribe', u'REGION_PROVINCE', u'babydoc', u'foodinsecurity',
        u'religion_Buddhist', u'india', u'hindu', u'religion_Hindu',
        u'religion_Russian/Easter', u'educ', u'Debut', u'literacy', u'christian',
        u'hivknow', u'ModCon', u'age', u'thrasher', u'usecondom',
        u'religion_Other Christia', u'religion_Muslim', u'muslim', u'lowlit',
        u'multpart', u'motorcycle', u'CHILDREN', u'LaborDeliv',
    ]
    selection2 = [
        u'christian', u'hindu', u'REGION_PROVINCE', u'DISTRICT', u'electricity',
        u'age', u'tribe', u'foodinsecurity', u'EVER_HAD_SEX', u'EVER_BEEN_PREGNANT',
        u'CHILDREN', u'india', u'married', u'multpart', u'educ', u'literacy',
        u'LaborDeliv', u'babydoc', u'Debut', u'ModCon', u'usecondom', u'hivknow',
        u'religion_Buddhist', u'religion_Hindu', u'religion_Russian/Easter',
    ]
    selection3 = [
        u'DISTRICT', u'tribe', u'REGION_PROVINCE', u'babydoc', u'india', u'educ',
        u'Debut', u'literacy', u'hivknow', u'ModCon', u'age', u'usecondom',
        u'multpart', u'CHILDREN', u'LaborDeliv', u'married',
    ]
    selected_features = selection1 + selection2 + selection3

    predictors = data[features]
    return predictors[selected_features].values

    

def preprocess_data_y(data):
    """Build the combined target and return it as a numpy array.

    The label packs the three targets into one number,
    ``100*geo + 10*segment + subgroup``, and is also stored on ``data`` as the
    'combined_label' column (the caller's frame is modified).
    """
    target = 'combined_label'
    geo_part = 100 * data['geo']
    segment_part = 10 * data['segment']
    data[target] = geo_part + segment_part + data['subgroup']
    return data[target].values

def gen_models(selection_ind=(45,)):
    """Build the grid of candidate classifiers and return the selected ones.

    The full candidate list is laid out in this fixed order:

    * 0-17   GradientBoostingClassifier (n_estimators x max_depth x learning_rate)
    * 18-52  RandomForestClassifier     (n_estimators x max_depth)
    * 53-56  SVC, linear kernel         (C)
    * 57-64  SVC, RBF kernel            (C x gamma)
    * 65-72  LogisticRegression         (penalty x C)

    Parameters
    ----------
    selection_ind : sequence of int, optional
        Positions in the candidate list to return, in that order.  The
        default ``(45,)`` picks the random forest with ``n_estimators=800``
        and ``max_depth=13``.

    Returns
    -------
    list of unfitted estimators.
    """
    def gen_models_rf(parameters):
        # Random forests: one per (n_estimators, max_depth) pair.
        return [RandomForestClassifier(n_estimators=n, max_depth=m, random_state=0)
                for n in parameters['n_estimators']
                for m in parameters['max_depth']]

    def gen_models_gbm(parameters):
        # Gradient boosting: the learning rate is part of the grid and must
        # be passed on, otherwise every model is built twice with the default.
        return [GradientBoostingClassifier(n_estimators=n, max_depth=m,
                                           learning_rate=rate, random_state=0)
                for n in parameters['n_estimators']
                for m in parameters['max_depth']
                for rate in parameters['learning_rate']]

    def gen_models_log(parameters):
        # Logistic regressions: one per (penalty, C) pair.
        return [linear_model.LogisticRegression(penalty=p, C=c)
                for p in parameters['penalty']
                for c in parameters['C']]

    def gen_models_svm(parameters):
        # SVCs: gamma is only part of the grid when the parameter set has it.
        models = []
        for c in parameters['C']:
            for k in parameters['kernel']:
                if 'gamma' in parameters:
                    models.extend(svm.SVC(C=c, kernel=k, gamma=g, probability=True)
                                  for g in parameters['gamma'])
                else:
                    models.append(svm.SVC(C=c, kernel=k, probability=True))
        return models

    param_gbm = {'n_estimators': [50, 100, 200],
                 'learning_rate': [0.1, 1.0],
                 'max_depth': [1, 2, 3]}
    param_rf = {'n_estimators': [100, 200, 500, 800, 1000],
                'max_depth': [1, 2, 4, 6, 8, 10, 13]}
    param_svc = [
        {'C': [0.1, 1, 10, 100], 'kernel': ['linear']},
        {'C': [0.1, 1, 10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
    ]
    param_log = {'penalty': ('l1', 'l2'),
                 'C': [0.1, 1, 10, 100]}

    models = gen_models_gbm(param_gbm)
    models += gen_models_rf(param_rf)
    models += gen_models_svm(param_svc[0])
    models += gen_models_svm(param_svc[1])
    models += gen_models_log(param_log)

    # Other indices tried earlier: 38, 52, 31, 24, 16, 17, 30, 14, 15, 37
    return [models[i] for i in selection_ind]


class model_collection:
    """Thin wrapper that fits and queries a group of estimators together.

    The most recent results of ``predict_proba`` and ``predict`` are kept on
    the instance as ``y_proba`` and ``y_pred``.
    """

    def __init__(self, models):
        # Shallow copy: the container is private, the estimators are shared.
        self.models = models[:]

    def fit(self, X_train, y_train):
        """Fit every estimator on the same data and remember the label set."""
        self.classes = np.unique(y_train)
        for estimator in self.models:
            estimator.fit(X_train, y_train)

    def predict_proba(self, X_test):
        """Collect one ``predict_proba`` result per estimator, in model order."""
        self.y_proba = []
        self.y_proba.extend(estimator.predict_proba(X_test) for estimator in self.models)
        return self.y_proba

    def predict(self, X_test):
        """Collect one ``predict`` result per estimator, in model order."""
        self.y_pred = []
        self.y_pred.extend(estimator.predict(X_test) for estimator in self.models)
        return self.y_pred
    

class combined_model:
    """Cascade of classifiers: the first confident model decides each sample.

    ``predict`` walks through the models in order; a sample takes the
    prediction of the first model whose top class probability exceeds
    ``CONFIDENCE_CUT``.  Samples that no model is confident about fall back
    to the prediction of the first model.
    """

    # Minimum top-class probability for a prediction to count as confident.
    # Kept on the class so instances created without __init__ also see it.
    CONFIDENCE_CUT = 0.8

    def __init__(self):
        self.models = gen_models()

    def fit(self, X_train, y_train):
        """Fit every model on the same training data."""
        self.classes = np.unique(y_train)
        for model in self.models:
            model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return one label per row of ``X_test`` as a float array."""
        n_samples = X_test.shape[0]
        # Both matrices are (n_samples, n_models).
        y_pred = np.array([m.predict(X_test) for m in self.models]).T
        y_proba_max = np.array([np.asarray(m.predict_proba(X_test)).max(axis=1)
                                for m in self.models]).T

        y_pred_increment = np.zeros(n_samples)
        # Explicit mask of decided samples; comparing the output with 0 would
        # treat a confidently predicted label 0 as "still undecided".
        decided = np.zeros(n_samples, dtype=bool)
        for k in range(y_pred.shape[1]):
            take = ~decided & (y_proba_max[:, k] > self.CONFIDENCE_CUT)
            y_pred_increment[take] = y_pred[take, k]
            decided |= take

        # Remaining samples: use the first model regardless of confidence.
        best_m_for_remaining = 0
        y_pred_increment[~decided] = y_pred[~decided, best_m_for_remaining]
        return y_pred_increment
        
        
def azureml_main(train_data=None, test_data=None, model_path=None):
    """Train (or load) the combined model and return per-patient predictions.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows, including the 'geo', 'segment' and 'subgroup' columns.
        It is always used to fit the 'religion' encoder, and to fit the model
        unless ``model_path`` is given.
    test_data : pandas.DataFrame
        Rows to predict; must contain 'patientID'.  The three label columns
        are ignored if present.
    model_path : str, optional
        Path of a pickled, already fitted model to use instead of training
        (for example "./Script Bundle/model_7.2_incremental_selection.pkl").

    Returns
    -------
    pandas.DataFrame with the columns 'patientID', 'Geo_Pred', 'Segment_Pred'
    and 'Subgroup_Pred' (the three digits of the predicted combined label).
    """
    label_columns = ['geo', 'segment', 'subgroup']
    features = [x for x in train_data.columns if x not in label_columns]
    test_data = test_data[features]

    t = encode_categorical_feature('religion')
    t.fit(train_data)
    test_data = t.transform(test_data)
    X_test = preprocess_data_X(test_data)

    if model_path is not None:
        # NOTE(security): unpickling runs arbitrary code from the file, so
        # only load model files that come from a trusted source.
        with open(model_path, 'rb') as model_file:
            m = pickle.load(model_file)
    else:
        train_data = t.transform(train_data)
        # preprocess_data_X fills missing values in place, so it has to run
        # before the labels are derived from the same frame.
        X_train = preprocess_data_X(train_data)
        y_train = preprocess_data_y(train_data)

        m = combined_model()
        m.fit(X_train, y_train)
    y_pred = np.asarray(m.predict(X_test))

    # Split the combined label back into its digits.  Integer arithmetic on
    # the array works on Python 2 and 3 alike (a bare map() is lazy on 3).
    output = test_data[['patientID']].copy()
    output['Geo_Pred'] = (y_pred // 100).astype(int)
    output['Segment_Pred'] = (y_pred // 10 % 10).astype(int)
    output['Subgroup_Pred'] = (y_pred % 10).astype(int)
    return output



## Verify performance

In [None]:
# 4-fold cross-validation of the whole pipeline on the training file.
import sklearn.metrics

try:  # scikit-learn >= 0.18
    from sklearn.model_selection import KFold

    def _fold_indices(n_rows, n_folds):
        return KFold(n_splits=n_folds).split(np.arange(n_rows))
except ImportError:  # older scikit-learn releases
    from sklearn.cross_validation import KFold

    def _fold_indices(n_rows, n_folds):
        return KFold(n_rows, n_folds=n_folds)

data = pd.read_csv('../datasets/WomenHealth_Training.csv')
scores = []
for train_ind, valid_ind in _fold_indices(data.shape[0], 4):
    # The fold indices are row positions, hence iloc.
    train_data = data.iloc[train_ind, :]
    test_data = data.iloc[valid_ind, :]
    y_test = (100*test_data['geo'] + 10*test_data['segment'] + test_data['subgroup']).values
    pred = azureml_main(train_data, test_data)
    if isinstance(pred, pd.DataFrame):
        # azureml_main returned the per-digit table: rebuild the combined label.
        y_pred = (100*pred['Geo_Pred'] + 10*pred['Segment_Pred'] + pred['Subgroup_Pred']).values
    else:
        y_pred = pred
    scores.append(sklearn.metrics.accuracy_score(y_test, y_pred))
    print(scores[-1])

In [None]:
# output
0.856169568509
0.8720666162
0.799394398183
0.856060606061

## Train model, save to a pickle file and upload to MS Azure ML

In [None]:
import pickle

import sklearn.metrics

data = pd.read_csv('../datasets/WomenHealth_Training.csv')
train_data, test_data = data.copy(), data.copy()
X_train,y_train,X_test = preprocess_data(train_data, test_data)
y_test = 100*test_data['geo'] + 10*test_data['segment'] + test_data['subgroup']

m = combined_model()
m.fit(X_train, y_train)
# The model is scored on the rows it was fitted on, so this is training
# accuracy, not an estimate of performance on unseen data.
print(sklearn.metrics.accuracy_score(y_test, m.predict(X_test)))

# To save the fitted model and check that it survives a round trip:
# with open('model_7.2_incremental_selection.pkl', 'wb') as model_file:
#     pickle.dump(m, model_file)
# with open('model_7.2_incremental_selection.pkl', 'rb') as model_file:
#     m2 = pickle.load(model_file)
# print(sklearn.metrics.accuracy_score(y_test, m2.predict(X_test)))

In [16]:
# Smoke test of the full pipeline: the training file is passed as both the
# training and the test frame, so the predictions are for rows that are also
# in the training data.
data = pd.read_csv('../datasets/WomenHealth_Training.csv')
train_data, test_data = data.copy(), data.copy()

# Last expression of the cell: the returned value is shown as the cell output.
azureml_main(train_data,test_data)

Unnamed: 0,patientID,Geo_Pred,Segment_Pred,Subgroup_Pred
0,4835,9,3,1
1,6719,4,1,1
2,5450,2,3,1
3,1207,2,3,1
4,7290,9,3,1
5,620,3,1,1
6,7760,2,2,1
7,5437,2,3,1
8,983,9,3,1
9,7491,8,3,1


In [14]:
# Scratch check: int() truncates each float.  The recorded output is a plain
# list, i.e. this ran on Python 2; on Python 3 map() returns a lazy iterator.
map(int,[1.2,3.2])

[1, 3]