# Environment

In [1]:
# from google.colab import drive
# drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# %cd /content/drive/MyDrive/Colab Notebooks/ML 2021Fall/HW03/src

/content/drive/MyDrive/Colab Notebooks/ML 2021Fall/HW03/src


In [3]:
TEST = True
part1_data_path = "../data/all_data/"
clause_counts = ['c300', 'c500', 'c1000', 'c1500', 'c1800']
example_counts = ['d100', 'd1000', 'd5000']

# Load Data

In [4]:
import os
import re
from tqdm.notebook import tqdm
import pandas as pd

def load_dataset(dataset_path):
    """ Load a dataset

    Parameters
    ----------
    dataset_path : str
        The file location of the dataset
    
    Returns
    --------
    sample_label_set : list
        A two element list, whose first element is a dataframe of the input patterns, and its second element is a dataframe of the labels of the first element. 
    """
    dataset = pd.read_csv(dataset_path, header=None)
    sample_label_set = [dataset.iloc[:, :-1], dataset.iloc[:, -1:]]
    return sample_label_set

def load_datasets(dataset_dir):
    """Load all the datasets

    Parameters
    ----------
    dataset_dir : str
        The file location of the datasets

    Returns
    -------
    datasets : dict
        A three-level dictionary containing the datasets. The three-level are: dataset-type, number of clause, number of samples. 
    """
    datasets = dict()
    for file in tqdm(os.listdir(part1_data_path)):
        keys = re.split('[_.]', file)
        if keys[0] not in datasets:
            datasets[keys[0]] = dict()
        if keys[1] not in datasets[keys[0]]:
            datasets[keys[0]][keys[1]] = dict()
        if keys[2] not in datasets[keys[0]][keys[1]]:
            datasets[keys[0]][keys[1]][keys[2]] = load_dataset(dataset_dir + file)
    return datasets

In [5]:
datasets = load_datasets(part1_data_path)

  0%|          | 0/45 [00:00<?, ?it/s]

In [6]:
if TEST:
    test_dataset = datasets['train']['c300']['d100']

# ML

## DecisionTreeClassifier

In [7]:
from sklearn import tree
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, PredefinedSplit

In [8]:
def get_dataset(datasets, clause_count, example_count):
    """Preprocess the dataset

    Parameters
    ----------
    datasets : dict
        The three-level dictionary containing all the datasets
    clause_count : str
        The second-level key of the dataset
    example_count : str
        The three-level key of the dataset
    
    Returns
    -------
    X : DataFrame
        Concatenate of X_train and X_val
    y : DataFrame
        Labels of X
    X_test : DataFrame
        Input patterns of the testset
    y_test : DataFrame
        Label of the testset
    pds : PredefinedSplit
        Specify the train/valid split on X_tran and X_val
    """
    X_train, y_train = datasets['train'][clause_count][example_count][0], datasets['train'][clause_count][example_count][1]
    X_val, y_val = datasets['valid'][clause_count][example_count][0], datasets['valid'][clause_count][example_count][1]
    X_test, y_test = datasets['test'][clause_count][example_count][0], datasets['test'][clause_count][example_count][1]
    X = pd.concat([X_train, X_val])
    y = pd.concat([y_train, y_val])
    split_index = [-1]*X_train.shape[0]
    split_index.extend([0]*X_val.shape[0])
    pds = PredefinedSplit(test_fold = split_index)
    return X, y, X_test, y_test, pds

In [None]:
# def tune_DecisionTreeClassifier(X, y, pds):
#     #param
#     scoring_method = {'accuracy':make_scorer(accuracy_score), 'f1':make_scorer(f1_score)}
#     decisionTree_param = [{'criterion':['gini', 'entropy'], 'random_state':[42]}]
#     #initial fit
#     grid = GridSearchCV(tree.DecisionTreeClassifier(), param_grid=decisionTree_param, n_jobs=-1, scoring=scoring_method, refit='accuracy',cv=pds)
#     grid.fit(X, y)
#     #post prune
#     clf = grid.best_estimator_
#     path = clf.cost_complexity_pruning_path(X_train, y_train)
#     ccp_alphas, impurities = path.ccp_alphas, path.impurities

#     decisionTree_param = clf.get_params()
#     for key in decisionTree_param:
#         decisionTree_param[key] = [decisionTree_param[key]]
#     decisionTree_param['ccp_alpha'] = ccp_alphas

#     grid = GridSearchCV(tree.DecisionTreeClassifier(), param_grid=decisionTree_param, n_jobs=-1, scoring=scoring_method, refit='accuracy',cv=pds)
#     grid.fit(X, y)

#     return grid.best_estimator_

In [15]:
def tune_DecisionTreeClassifier(X, y, pds):
    """Fine tune the decision tree using grid search

    Parameters
    ----------
    X : DataFrame
        data pattern
    y : DataFrame
        lables
    pds : PredefinedSplit
        train/valid split
    
    Returns
    -------
    grid.cv_results_ :

    grid.best_estimator_ :

    """
    #param
    scoring_method = {'accuracy':make_scorer(accuracy_score), 'f1':make_scorer(f1_score)}
    decisionTree_param = [{'criterion':['gini', 'entropy'], 'random_state':[42], 'min_samples_split':[2, 4, 8, 16, 32, 64]}]
    #initial fit
    grid = GridSearchCV(tree.DecisionTreeClassifier(), param_grid=decisionTree_param, n_jobs=-1, scoring=scoring_method, refit='accuracy',cv=pds)
    grid.fit(X, y)

    return grid.cv_results_, grid.best_estimator_

In [21]:
def evaluate(model, X_test, y_test):
    """evaluate the model

    Parameters
    ----------
    model : 
        the model being evaluated
    X_test : DataFrame
        test input pattern
    y_test : DataFrame
        test labels

    Returns
    -------
    ac : float
        accuracy score
    f1 : float
        f1 score
    """
    y_pred = model.predict(X_test)
    ac = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return ac, f1

In [20]:
def train_all(tunning_fn):
    result = dict()
    for clause in clause_counts:
        if clause not in result:
            result[clause] = dict()
        for example in example_counts:
            if example not in result[clause]:
                result[clause][example] = dict()
            #get_dataset
            X, y, X_test, y_test, pds = get_dataset(datasets, clause, example)
            #tune model
            cv_results_, tuned_model = tunning_fn(X, y, pds)
            tuned_model.fit(X, y)
            #evaluate model
            ac, f1 = evaluate(tuned_model, X_test, y_test)
            result[clause][example] = {'best_model':tuned_model, 'ac':ac, 'f1':f1}
    return result


In [22]:
print('training decision trees')
result = train_all(tune_DecisionTreeClassifier)

training decision trees


In [27]:
def flat_DecisionTreeClassifier(result): 
    """reform result dictionary
   
    Parameters
    ----------
    result : dict
        a 2d dictionary from the train_all function

    Returns
    -------
    flat : dict
        a 1d dictionary
    """
    flat = dict()
    for k, v in result.items():
        for k2, v2 in v.items():
            params = v2['best_model'].get_params()
            flat[f'{k}_{k2}'] = {'criterion':params['criterion'], 'min_samples_split':params['min_samples_split'], 'ac':v2['ac'], 'f1':v2['f1']}
    return flat

In [28]:
flat = flat_DecisionTreeClassifier(result)
df = pd.DataFrame(flat)
df = df.transpose()
display(df)

Unnamed: 0,criterion,min_samples_split,ac,f1
c300_d100,gini,4,0.615,0.628019
c300_d1000,entropy,64,0.6675,0.670955
c300_d5000,gini,64,0.7734,0.780596
c500_d100,entropy,8,0.645,0.650246
c500_d1000,entropy,32,0.693,0.692385
c500_d5000,entropy,32,0.7814,0.785054
c1000_d100,gini,4,0.685,0.698565
c1000_d1000,entropy,16,0.804,0.806897
c1000_d5000,gini,64,0.8602,0.857782
c1500_d100,entropy,32,0.865,0.857143


## Bagging Classifier

In [29]:
from sklearn.ensemble import BaggingClassifier

In [32]:
def tune_BaggingClassifier(X, y, pds):
    """Fine tune the bagging classifier using grid search

    Parameters
    ----------
    X : DataFrame
        data pattern
    y : DataFrame
        lables
    pds : PredefinedSplit
        train/valid split
    
    Returns
    -------
    grid.cv_results_ :

    grid.best_estimator_ :

    """
    #param
    scoring_method = {'accuracy':make_scorer(accuracy_score), 'f1':make_scorer(f1_score)}
    bagging_param = [{'max_samples':[1.0, .5, .25], 'random_state':[42], 'max_features':[1.0, .5, .25]}]
    #initial fit
    grid = GridSearchCV(BaggingClassifier(), param_grid=bagging_param, n_jobs=-1, scoring=scoring_method, refit='accuracy',cv=pds)
    grid.fit(X, y)

    return grid.cv_results_, grid.best_estimator_

In [33]:
# X, y, X_test, y_test, pds = get_dataset(datasets, 'c300', 'd100')
# cv_results_, best_estimator = tune_BaggingClassifier(X, y, pds)

In [None]:
# df = pd.DataFrame(cv_results_)
# df = df[['param_max_features', 'param_max_samples', 'mean_test_accuracy','rank_test_accuracy', 'mean_test_f1', 'rank_test_f1']]
# df.sort_values('rank_test_accuracy')

In [None]:
print('training bagging')
result = train_all(tune_BaggingClassifier)

In [35]:
def flat_BaggingClassifier(result): 
    flat = dict()
    for k, v in result.items():
        for k2, v2 in v.items():
            params = v2['best_model'].get_params()
            flat[f'{k}_{k2}'] = {'max_samples':params['max_samples'], 'max_features':params['max_features'], 'ac':v2['ac'], 'f1':v2['f1']}
    return flat

In [36]:
flat = flat_BaggingClassifier(result)
df = pd.DataFrame(flat)
df = df.transpose()
display(df)

Unnamed: 0,max_samples,max_features,ac,f1
c300_d100,0.5,1.0,0.605,0.548571
c300_d1000,1.0,1.0,0.779,0.766385
c300_d5000,1.0,1.0,0.8553,0.852962
c500_d100,0.5,1.0,0.705,0.681081
c500_d1000,1.0,1.0,0.8185,0.811429
c500_d5000,1.0,1.0,0.8752,0.870405
c1000_d100,1.0,0.5,0.84,0.822222
c1000_d1000,1.0,0.25,0.913,0.910769
c1000_d5000,1.0,0.5,0.9556,0.95497
c1500_d100,1.0,0.5,0.955,0.95288


## RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
def tune_RandomForestClassifier(X, y, pds):
    #param
    scoring_method = {'accuracy':make_scorer(accuracy_score), 'f1':make_scorer(f1_score)}
    randomForest_param = [{'criterion':['gini', 'entropy'], 'random_state':[42], 'max_features':['sqrt', 'log2'], 'min_samples_split':[2, 8, 32, 128]}]
    #initial fit
    grid = GridSearchCV(RandomForestClassifier(), param_grid=randomForest_param, n_jobs=-1, scoring=scoring_method, refit='accuracy',cv=pds)
    grid.fit(X, y)

    return grid.cv_results_, grid.best_estimator_

In [None]:
print("training random forest")
result = train_all(tune_RandomForestClassifier)

In [None]:
def flat_RandomForestClassifier(result): 
    flat = dict()
    for k, v in result.items():
        for k2, v2 in v.items():
            params = v2['best_model'].get_params()
            flat[f'{k}_{k2}'] = {'criterion':params['criterion'], 'min_samples_split':params['min_samples_split'], 'max_features':params['max_features'], 'ac':v2['ac'], 'f1':v2['f1']}
    return flat

In [None]:
flat = flat_RandomForestClassifier(result)
df = pd.DataFrame(flat)
df = df.transpose()
display(df)

Unnamed: 0,criterion,min_samples_split,max_features,ac,f1
c300_d100,gini,8,sqrt,0.795,0.79397
c300_d1000,gini,32,sqrt,0.8675,0.867434
c300_d5000,entropy,128,log2,0.9077,0.908605
c500_d100,entropy,8,log2,0.84,0.841584
c500_d1000,gini,32,sqrt,0.9275,0.927464
c500_d5000,entropy,32,log2,0.9575,0.957958
c1000_d100,gini,8,log2,0.985,0.984925
c1000_d1000,entropy,8,log2,0.9905,0.990495
c1000_d5000,entropy,8,log2,0.9976,0.9976
c1500_d100,gini,2,sqrt,1.0,1.0


## GradientBoostingClassifier.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
def tune_GradientBoostingClassifier(X, y, pds):
    #param
    scoring_method = {'accuracy':make_scorer(accuracy_score), 'f1':make_scorer(f1_score)}
    boosting_param = [{'random_state':[42], 'loss':['deviance', 'exponential'], 'n_iter_no_change':[3]}]
    #initial fit
    grid = GridSearchCV(GradientBoostingClassifier(), param_grid=boosting_param, n_jobs=-1, scoring=scoring_method, refit='accuracy',cv=pds)
    grid.fit(X, y)

    return grid.cv_results_, grid.best_estimator_

In [None]:
print("training gradient boosting")
result = train_all(tune_GradientBoostingClassifier)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
def flat_GradientBoostingClassifier(result): 
    flat = dict()
    for k, v in result.items():
        for k2, v2 in v.items():
            params = v2['best_model'].get_params()
            flat[f'{k}_{k2}'] = {'loss':params['loss'], 'ac':v2['ac'], 'f1':v2['f1']}
    return flat

In [None]:
flat = flat_GradientBoostingClassifier(result)
df = pd.DataFrame(flat)
df = df.transpose()
display(df)

Unnamed: 0,loss,ac,f1
c300_d100,exponential,0.7,0.705882
c300_d1000,exponential,0.9685,0.969313
c300_d5000,deviance,0.9829,0.983184
c500_d100,exponential,0.785,0.792271
c500_d1000,deviance,0.9795,0.979733
c500_d5000,deviance,0.9873,0.987439
c1000_d100,deviance,0.945,0.945274
c1000_d1000,deviance,0.9925,0.992534
c1000_d5000,deviance,0.9972,0.997205
c1500_d100,deviance,1.0,1.0
