In [142]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder

class encode_categorical_feature:
    """One-hot encode a single categorical column of a DataFrame.

    ``fit`` records the sorted distinct values of the column; ``transform``
    replaces the column with one 0/1 indicator column per recorded value,
    named ``<feature>_<value>``.
    """

    def __init__(self, feature):
        """Remember which column to encode; no categories are known yet."""
        self.f_set = []
        self.feature = feature

    def fit(self, data):
        """Learn the category list from ``data``.

        If ``data`` has no column named ``self.feature`` nothing is learned
        and a later ``transform`` returns its input unchanged.
        """
        if self.feature in data.columns:
            self.f_set = sorted(list(data[self.feature].unique()))

    def transform(self, data):
        """Return a new DataFrame with the feature column one-hot encoded.

        The result has a fresh 0..n-1 index, the original columns minus the
        encoded one, followed by one float indicator column per fitted
        category. A value that was not seen during ``fit`` produces a row of
        zeros across all indicator columns instead of raising ``ValueError``.
        When no categories were fitted, ``data`` itself is returned.
        """
        if not self.f_set:
            return data

        # Constant-time lookup of each category's column position.
        position = dict((value, idx) for idx, value in enumerate(self.f_set))

        f_encoded = np.zeros((data.shape[0], len(self.f_set)), dtype=np.float64)
        for row, value in enumerate(data[self.feature]):
            col = position.get(value)
            if col is not None:  # unseen category -> leave the row all zeros
                f_encoded[row, col] = 1.0

        f_df = pd.DataFrame(f_encoded, columns=[self.feature+'_'+str(x) for x in self.f_set])
        # Reset the index so the positional indicator rows line up with the data rows.
        new_data = pd.concat([data.reset_index(drop=True), f_df], axis=1)
        return new_data.drop(self.feature, axis=1)

def preprocess_data(train_data, test_data):
    """Turn the raw train/test frames into numpy arrays for the models.

    The 'religion' column is one-hot encoded with categories learned from the
    training frame. The training target is the combined integer label
    100*geo + 10*segment + subgroup. Missing values are replaced by 0.

    Returns (X_train, y_train, X_test) as numpy arrays; the feature columns
    are every training column except the identifiers and label columns.
    """
    religion_encoder = encode_categorical_feature('religion')
    religion_encoder.fit(train_data)
    train_data = religion_encoder.transform(train_data)
    test_data = religion_encoder.transform(test_data)

    target = 'combined_label'
    excluded = ('patientID', 'geo', 'segment', 'subgroup', 'combined_label', 'INTNR')
    features = [col for col in train_data.columns if col not in excluded]

    # Training side: build the label first, then fill the gaps.
    train_data[target] = (100 * train_data['geo']
                          + 10 * train_data['segment']
                          + train_data['subgroup'])
    train_filled = train_data.fillna(0)

    # Test side: same gap filling, same feature columns.
    test_filled = test_data.fillna(0)

    return (train_filled[features].values,
            train_filled[target].values,
            test_filled[features].values)

class geo_dependent_model:
    """Two-stage classifier over combined integer labels.

    Stage one predicts the geo, taken as the label floor-divided by 100.
    Stage two uses a separate model per geo (``seg_models[geo - 1]``), trained
    only on that geo's rows, to predict the full combined label.
    """

    def __init__(self):
        """Create the geo model and the nine per-geo models (geo 1..9)."""
        self.geo_model= GradientBoostingClassifier( n_estimators=100,  max_depth=1, random_state=0, learning_rate=0.1)
        self.seg_models =[ 
                    GradientBoostingClassifier( n_estimators=100,  max_depth=2,  random_state=0, learning_rate=0.1),
                        RandomForestClassifier( n_estimators=1000, max_depth=13, random_state=0),
                    GradientBoostingClassifier( n_estimators=100,  max_depth=1,  random_state=0, learning_rate=0.1),
                    GradientBoostingClassifier( n_estimators=1000, max_depth=2,  random_state=0, learning_rate=0.1),
                        RandomForestClassifier( n_estimators=400,  max_depth=10, random_state=0),
                    GradientBoostingClassifier( n_estimators=100,  max_depth=1,  random_state=0, learning_rate=0.1),
                        RandomForestClassifier( n_estimators=1200, max_depth=6 , random_state=0),
                    GradientBoostingClassifier( n_estimators=100,  max_depth=1,  random_state=0, learning_rate=0.1),
                    GradientBoostingClassifier( n_estimators=100,  max_depth=1,  random_state=0, learning_rate=0.1)
                ]

    def fit(self, X_train, y_train):
        """Fit the geo model on all rows and each per-geo model on its own rows.

        X_train is indexed as a 2-D array and y_train as a 1-D array of
        combined labels. Geos with no training rows are skipped.
        """
        y_train = np.asarray(y_train)
        # Floor division keeps the geo an exact integer value even for float
        # labels or when `/` means true division.
        geo_train = y_train // 100
        self.geo_model.fit(X_train, geo_train)
        for geo, seg_model in enumerate(self.seg_models, 1):
            geo_ind = np.flatnonzero(geo_train == geo)
            if geo_ind.size == 0:
                # Nothing to learn for this geo; fitting on zero rows would fail.
                continue
            seg_model.fit(X_train[geo_ind, :], y_train[geo_ind])

    def predict(self, X_test):
        """Predict a combined label for every row of X_test.

        Rows are routed to the per-geo model matching their predicted geo.
        Rows whose predicted geo has no per-geo model keep the label 0.
        """
        y_pred = np.zeros(X_test.shape[0], dtype=int)
        geo_pred = np.asarray(self.geo_model.predict(X_test))
        for geo, seg_model in enumerate(self.seg_models, 1):
            geo_ind = np.flatnonzero(geo_pred == geo)
            if geo_ind.size == 0:
                # No test rows routed here; do not call predict on an empty array.
                continue
            y_pred[geo_ind] = seg_model.predict(X_test[geo_ind, :])
        return y_pred

    def score(self, X_test, y_test):
        """Return the accuracy of ``predict(X_test)`` against ``y_test``."""
        # Imported here so the submodule is loaded regardless of what else
        # happened to import it.
        from sklearn.metrics import accuracy_score
        y_pred = self.predict(X_test)
        return accuracy_score(y_test, y_pred)


def azureml_main(train_data=None, test_data=None):
    """Train on ``train_data`` and predict geo/segment/subgroup for ``test_data``.

    The label columns ('geo', 'segment', 'subgroup') are dropped from the test
    frame before preprocessing. The model predicts the combined label
    100*geo + 10*segment + subgroup, which is split back into its three digits.

    Returns a DataFrame with columns 'patientID', 'Geo_Pred', 'Segment_Pred'
    and 'Subgroup_Pred', indexed like the 'patientID' column of the test frame.
    """
    features = [x for x in train_data.columns if x not in ['geo', 'segment', 'subgroup']]
    test_data = test_data[features]

    X_train, y_train, X_test = preprocess_data(train_data, test_data)

    m = geo_dependent_model()
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)

    output = pd.DataFrame()
    output['patientID'] = test_data['patientID']
    # Floor division: the digits must stay integers even where `/` is true division.
    output['Geo_Pred'] = y_pred // 100
    output['Segment_Pred'] = y_pred // 10 % 10
    output['Subgroup_Pred'] = y_pred % 10
    return output



In [143]:
# Hold-out evaluation: train on 75% of the rows, score on the remaining 25%.
data = pd.read_csv('../datasets/WomenHealth_Training.csv')

# Split row positions rather than the frame itself; the same range is passed
# twice, so the last two results mirror the first two.
train_ind, valid_ind, y_train_ind, y_valid_ind = sklearn.cross_validation.train_test_split(
    range(data.shape[0]),
    range(data.shape[0]),
    test_size=0.25,
    random_state=42)
train_data = data.loc[train_ind, :]
test_data = data.loc[valid_ind, :]

pred = azureml_main(train_data, test_data)

# Compare true and predicted labels in combined form (100*geo + 10*segment + subgroup).
y_test = 100 * test_data['geo'] + 10 * test_data['segment'] + test_data['subgroup']
y_pred = 100 * pred['Geo_Pred'] + 10 * pred['Segment_Pred'] + pred['Subgroup_Pred']
print(sklearn.metrics.accuracy_score(y_pred, y_test))

0.859197577593


In [106]:
# Training-set check: the model is trained on every row and scored on the same
# rows, so the printed accuracy is not a held-out estimate.
data = pd.read_csv('../datasets/WomenHealth_Training.csv')

# NOTE: this split is computed but not used by the evaluation below.
train_ind, valid_ind, y_train_ind, y_valid_ind = sklearn.cross_validation.train_test_split(range(data.shape[0]),range(data.shape[0]),test_size=0.25, random_state=42)
# `.loc` instead of the removed `.ix` indexer, matching the other evaluation cells.
train_data = data.loc[train_ind,:]
test_data  = data.loc[valid_ind,:]

pred = azureml_main(data, data)

# Compare true and predicted labels in combined form (100*geo + 10*segment + subgroup).
y_test = 100*data['geo'] + 10*data['segment'] + data['subgroup']
y_pred = 100*pred['Geo_Pred'] + 10*pred['Segment_Pred'] + pred['Subgroup_Pred']
print(sklearn.metrics.accuracy_score(y_pred,y_test))

0.961385576377


In [138]:
# Single-row check: train on every row, then predict only the row labelled 5251
# (which is also part of the training rows).
data = pd.read_csv('../datasets/WomenHealth_Training.csv')

train_ind = range(data.shape[0])
valid_ind = [5251]
train_data = data.loc[train_ind, :]
test_data = data.loc[valid_ind, :]

pred = azureml_main(train_data, test_data)

# Compare true and predicted labels in combined form (100*geo + 10*segment + subgroup).
y_test = 100 * test_data['geo'] + 10 * test_data['segment'] + test_data['subgroup']
y_pred = 100 * pred['Geo_Pred'] + 10 * pred['Segment_Pred'] + pred['Subgroup_Pred']
print(sklearn.metrics.accuracy_score(y_pred, y_test))

1.0
