# Ensemble Model here
Combine all the sub-models with the Bagging method

In [29]:
import numpy as np
import pandas as pd
import scipy
import json
import seaborn as sns
from sklearn.base import TransformerMixin
from sklearn import preprocessing
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, learning_curve, StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.pipeline import make_pipeline, Pipeline
import joblib
import matplotlib.pyplot as plt

### Configure the basic values

In [30]:
# Directory prefix prepended to each model file name when the models are loaded,
# and the JSON file that describes those models.
model_path = 'Choesn_model/'
configure_file_path = 'Choesn_model/Configuration.json'

# CSV read to fit the vectorizers, and CSV read as the test data.
TRAINFILE, TESTFILE = 'keyword.csv', 'key_word_test.csv'

# String that the configuration's 'preprocess' field is compared against.
boolean = 'True'

## Define the MODEL object

**!!! ATTENTION !!!**  
**DO NOT** modify the code in the following cell

In [31]:
class Model(object):
    '''
    Pair a loaded estimator with the metadata describing it.

    model                   object: Estimator that predict / predict_proba delegate to
    name                    Identifier stored on the instance and printed by show()
    test_set                Stored as given
    topic                   Stored as given
    is_preprocess           Stored as given
    level                   Stored as given
    '''

    def __init__(self, model, name, test_set, topic, is_preprocess, level):
        self.model, self.name = model, name
        self.test_set, self.topic = test_set, topic
        self.is_preprocess, self.level = is_preprocess, level

    # for debug use
    def show(self):
        '''Print every metadata field (not the estimator) on a single line.'''
        fields = (
            self.name,
            self.test_set,
            self.topic,
            self.is_preprocess,
            self.level,
        )
        # ' \t ' matches print()'s default single-space separator placed
        # around an explicit tab argument between each pair of fields.
        print(' \t '.join(str(field) for field in fields))

    def predict(self, x):
        '''Return the wrapped estimator's predict(x).'''
        estimator = self.model
        return estimator.predict(x)

    def predict_proba(self, x):
        '''Return the wrapped estimator's predict_proba(x).'''
        estimator = self.model
        return estimator.predict_proba(x)



## Load the model details from the JSON configuration file

In [32]:
def load_configuration_to_model(file_path):
    '''
    Load the JSON configuration file and build one Model object per entry.

    file_path               str: Path of the JSON configuration file. Each
                                 entry must provide the keys 'model_name',
                                 'test_set', 'topic', 'preprocess' and 'level'

    Return: (Tuple Object) with two sub-lists
            sub-list_1: Models for layer one (for relevant and irrelevant)
            sub-list_2: Models for layer two (for topic decision)

    Raises: KeyError if an entry lacks one of the keys above; whatever
            open / json.load / joblib.load raise for unreadable files
    '''

    # Open the path the caller passed in; the module-level default path was
    # previously read here, which silently ignored the argument.
    with open(file_path, 'r') as json_fp:
        configuration = json.load(json_fp)

    layer_1 = []
    layer_2 = []
    for model_figure in configuration:

        # Model files live under model_path and are named by 'model_name'.
        # NOTE: joblib.load unpickles the file -- only load trusted model files.
        name = model_figure['model_name']
        model_file = joblib.load(model_path + name)

        test_set = model_figure['test_set']
        topic = model_figure['topic']
        # 'preprocess' is compared with the module-level string `boolean`,
        # so the flag is True only on an exact match with that string.
        is_preprocess = boolean == model_figure['preprocess']
        level = int(model_figure['level'])

        # New model object to save those arguments
        model = Model(model_file, name, test_set, topic, is_preprocess, level)

        # Level 1 goes to the first layer; every other level to the second
        if level == 1:
            layer_1.append(model)
        else:
            layer_2.append(model)

    return layer_1, layer_2


## Prepare the testing data and the preprocessing vectorizers

In [49]:
def get_vector(column_name, special=False):
    '''
    Fit a TF-IDF vectorizer on one column of the training CSV (TRAINFILE).

    column_name             str: The training column the vectorizer is fitted on
    special                 str or False: Column whose '_' characters are
                                 removed before fitting; the default False
                                 skips that step

    Return: (Vectorizer Object)
            TfidfVectorizer fitted on the requested column
    '''

    def _drop_underscores(text):
        return text.replace('_', '')

    training_frame = pd.read_csv(TRAINFILE)

    # Only the literal default False disables the underscore clean-up
    if special is not False:
        cleaned = training_frame[special].apply(_drop_underscores)
        training_frame[special] = cleaned

    # prepare the transform vector
    return TfidfVectorizer().fit(training_frame[column_name])

def preprocess(df, column_name_list):
    '''
    Prepare every input the ensemble system needs: the RAW data plus the
    vectorized version of each requested column.

    df                      DataFrame: The data to transform
    column_name_list        list: Columns of df to vectorize

    Return: (Dict object)
            Insertion order is 'raw', then each entry of column_name_list,
            then 'key_word_100_1'

            e.g.
            Input: column_name_list: ['key_word_100', 'article_words']
            Output: test_data_dict['raw']            --> original data
                    test_data_dict['key_word_100']   --> key_word_100 transformed
                                                         by the article_words vectorizer
                    test_data_dict['article_words']  --> article_words transformed
                                                         by the article_words vectorizer
                    test_data_dict['key_word_100_1'] --> key_word_100 transformed
                                                         by the key_word_100 vectorizer
    '''

    # One vectorizer, fitted on the underscore-stripped 'article_words'
    # training column, transforms every requested column.
    article_vectorizer = get_vector('article_words', special='article_words')

    # first add original data
    test_data_dict = {'raw': df}
    test_data_dict.update(
        (str(column), article_vectorizer.transform(df[column]))
        for column in column_name_list
    )

    # the special entry is added by hand with its own vectorizer
    keyword_vectorizer = get_vector('key_word_100')
    keyword_matrix = keyword_vectorizer.transform(df['key_word_100'])
    test_data_dict['key_word_100_1'] = keyword_matrix

    return test_data_dict


In [51]:
# Load the test set and build the raw + vectorized inputs for the ensemble.
df = pd.read_csv(TESTFILE)
dict_data = preprocess(df, ['article_words', 'key_word_100'])

# Print the shape of each matrix transformed by the article_words vectorizer.
for _shape_key in ('article_words', 'key_word_100'):
    print(dict_data[_shape_key].shape)

(500, 35817)
(500, 35817)


### The following is for ensemble evaluation

In [None]:
def evaluate(res_df):
    '''
    Print and collect the per-topic accuracy of the ensemble predictions.

    Input: (DataFrame Object) Result of the prediction; must contain a
           'label' column (the reference topic) and a 'predict' column
           (the predicted topic)
    Output: (List Object) One accuracy per distinct value of 'label', in
            order of each topic's first appearance; empty list for an
            empty frame
    '''

    report = []
    # unique() keeps first-appearance order, so the report order is
    # reproducible (iterating a set of strings is not).
    for topic in res_df['label'].unique():
        # Slice the frame that was passed in, not a module-level one
        test_df = res_df[res_df['label'] == topic]
        # Share of rows whose prediction equals the label, i.e. the accuracy
        accuarcy = (test_df['label'] == test_df['predict']).mean()
        print(topic,'accuarcy is:\t\t', accuarcy)
        report.append(accuarcy)

    return report
