# CIS 419/519 
#**Homework 4 : Adaboost and the Challenge**

In [2]:
import pandas as pd
import numpy as np

# Adaboost-SAMME

In [3]:
import numpy as np
import math
from sklearn import tree

class BoostedDT:

    def __init__(self, numBoostingIters=100, maxTreeDepth=3):
        '''
        Constructor

        Class Fields 
        clfs : List object containing individual DecisionTree classifiers, in order of creation during boosting
        betas : List of beta values, in order of creation during boosting
        '''

        self.clfs = []  # keep the class fields, and be sure to keep them updated during boosting
        self.betas = []
        self.numBoostingIters = numBoostingIters
        self.maxTreeDepth = maxTreeDepth
        self.K = None
        self.classes = None
        
        #TODO



    def fit(self, X, y, random_state=None):
        '''
        Trains the model. 
        Be sure to initialize all individual Decision trees with the provided random_state value if provided.
        
        Arguments:
            X is an n-by-d Pandas Data Frame
            y is an n-by-1 Pandas Data Frame
            random_seed is an optional integer value
        '''
        #TODO
        X = X.to_numpy()
        y = y.to_numpy()
        n,d = X.shape
        y = y.reshape(-1,1)
        
        self.classes = np.unique(y)
        self.K = len(self.classes)
        
        weights = np.full(n,1/n).reshape(-1,1)
        
        for iter_num in range(self.numBoostingIters):
            h = tree.DecisionTreeClassifier(max_depth = self.maxTreeDepth,
                                              random_state = random_state)
            h.fit(X,y,sample_weight = weights.flatten())
            self.clfs.append(h)
            y_pred = h.predict(X).reshape(-1,1)
            epsilon = np.sum((y_pred!=y)*weights)
            beta = np.log((self.K-1)*(1-epsilon)/epsilon)/2
            self.betas.append(beta)
            weights[y_pred==y] *= np.exp(-beta)
            weights[y_pred!=y] *= np.exp(beta)
            weights /= sum(weights)

    def predict(self, X):
        '''
        Used the model to predict values for each instance in X
        Arguments:
            X is an n-by-d Pandas Data Frame
        Returns:
            an n-by-1 Pandas Data Frame of the predictions
        '''
        #TODO
        X = X.to_numpy()
        n,d = X.shape
        proba = np.zeros((n,self.K))
        for iter_num in range(self.numBoostingIters):
            proba += self.clfs[iter_num].predict_proba(X)
        max_proba = np.argmax(proba,axis=1).reshape(-1)
        pred_array = np.tile(self.classes,(n,1))
        y_pred = np.choose(max_proba,pred_array.T).reshape(-1,1)
        return pd.DataFrame(y_pred)

# Test BoostedDT

In [4]:
import numpy as np
from sklearn import datasets
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def test_boostedDT():

  # load the data set
  sklearn_dataset = datasets.load_breast_cancer()
  # convert to pandas df
  df = pd.DataFrame(sklearn_dataset.data,columns=sklearn_dataset.feature_names)
  df['CLASS'] = pd.Series(sklearn_dataset.target)
  df.head()

  # split randomly into training/testing
  train, test = train_test_split(df, test_size=0.5, random_state=42)
  # Split into X,y matrices
  X_train = train.drop(['CLASS'], axis=1)
  y_train = train['CLASS']
  X_test = test.drop(['CLASS'], axis=1)
  y_test = test['CLASS']


  # train the decision tree
  modelDT = DecisionTreeClassifier()
  modelDT.fit(X_train, y_train)

  # train the boosted DT
  modelBoostedDT = BoostedDT(numBoostingIters=100, maxTreeDepth=2)
  modelBoostedDT.fit(X_train, y_train)

  # train sklearn's implementation of Adaboost
  modelSKBoostedDT = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=100)
  modelSKBoostedDT.fit(X_train, y_train)

  # output predictions on the test data
  ypred_DT = modelDT.predict(X_test)
  ypred_BoostedDT = modelBoostedDT.predict(X_test)
  ypred_SKBoostedDT = modelSKBoostedDT.predict(X_test)

  # compute the training accuracy of the model
  accuracy_DT = accuracy_score(y_test, ypred_DT)
  accuracy_BoostedDT = accuracy_score(y_test, ypred_BoostedDT)
  accuracy_SKBoostedDT = accuracy_score(y_test, ypred_SKBoostedDT)

  print("Decision Tree Accuracy = "+str(accuracy_DT))
  print("My Boosted Decision Tree Accuracy = "+str(accuracy_BoostedDT))
  print("Sklearn's Boosted Decision Tree Accuracy = "+str(accuracy_SKBoostedDT))
  print()
  print("Note that due to randomization, your boostedDT might not always have the ")
  print("exact same accuracy as Sklearn's boostedDT.  But, on repeated runs, they ")
  print("should be roughly equivalent and should usually exceed the standard DT.")

test_boostedDT()

Decision Tree Accuracy = 0.9263157894736842
My Boosted Decision Tree Accuracy = 0.9578947368421052
Sklearn's Boosted Decision Tree Accuracy = 0.9543859649122807

Note that due to randomization, your boostedDT might not always have the 
exact same accuracy as Sklearn's boostedDT.  But, on repeated runs, they 
should be roughly equivalent and should usually exceed the standard DT.


# Challenging Problem: Chocolate

# Preprocessing

In [5]:
import pandas as pd
import numpy as np

def dropMissingColumn(inputDf,missing_thresh):
    outputDf = inputDf.copy()
    missDf = getMissingRatio(outputDf)
    missIdx = missDf['MissingPercent'] >= missing_thresh
    missFeat = missDf.loc[missIdx,'Feature']
    outputDf = outputDf.drop(missFeat,axis=1)
    return outputDf

def fillMissingColumnSpecify(numericalDf,categoryDf):
    numericalDf = numericalDf.fillna(numericalDf.mean())
    categoryDf = categoryDf.fillna(categoryDf.mode().iloc[0]).astype('O')
    categoryDf = pd.get_dummies(categoryDf)
    outputDf = pd.concat([numericalDf,categoryDf],axis=1)
    return outputDf

def getMissingRatio(inputDf):
    outSeries = inputDf.isna().mean()                 # calculate missing ratios
    outDf = pd.DataFrame({'Feature':outSeries.index,\
                          'MissingPercent':outSeries.values})
    return outDf

def sortData(X,y,id_feature):
    X = X.drop_duplicates(subset=id_feature)
    y = y.drop_duplicates(subset=id_feature)
    X, y = X.sort_values(by=[id_feature]), y.sort_values(by=[id_feature])
    X_ind = X[id_feature].to_numpy()
    y_ind = y[id_feature].to_numpy()
    ind = np.intersect1d(X_ind,y_ind)
    X = X.loc[X[id_feature].isin(ind)]
    y = y.loc[y[id_feature].isin(ind)]
    df_index = pd.Series(np.arange(X.shape[0]))
    X = X.set_index(df_index)
    y = y.set_index(df_index)
    return X, y

def filterUnlabel(X_train,X_unlabel):
    train_columns = X_train.columns.values
    unlabel_columns = X_unlabel.columns.values
    lack_columns = np.setdiff1d(train_columns, unlabel_columns)
    redundancy_columns = np.setdiff1d(unlabel_columns, train_columns)
    X_unlabel = X_unlabel.assign(**dict.fromkeys(lack_columns,0))
    X_unlabel.drop(redundancy_columns,axis=1)
    return X_unlabel

In [6]:
import numpy as np
import pandas as pd
import math
from sklearn import tree
from sklearn import datasets
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

def train_chocolate():
    X = pd.read_csv('ChocolatePipes_trainData.csv')
    y = pd.read_csv('ChocolatePipes_trainLabels.csv')
    X, y = sortData(X, y, 'id')
    useless_feature = ['id', 'Date of entry', 'Country funded by', 
                       'oompa loomper', 'Region code', 'District code',
                       'Chocolate consumers in town', 
                       'Does factory offer tours', 'Recorded by',
                       'Oompa loompa management', 'Payment scheme',
                       'management_group']
    X = X.drop(useless_feature,axis=1)
    
    X = dropMissingColumn(X, 0.5)
    categorical_feature = ['chocolate_quality', 'chocolate_quantity',
                           'pipe_type', 'chocolate_source',
                           'chocolate_source_class', 'Cocoa farm',
                           'Official or Unofficial pipe', 
                           'Type of pump','management']
    Xc = X[categorical_feature]
    Xn = X.drop(categorical_feature,axis=1)
    
    X = fillMissingColumnSpecify(Xn, Xc)
    y = y.drop(['id'],axis=1)
    
    X_grade = pd.read_csv('ChocolatePipes_gradingTestData.csv')
    X_grade_id = X_grade['id']
    X_grade = X_grade.drop(useless_feature,axis=1)
    Xc_grade = X_grade[categorical_feature]
    Xn_grade = X_grade.drop(categorical_feature,axis=1)
    X_grade = fillMissingColumnSpecify(Xn_grade, Xc_grade)
    X_grade = filterUnlabel(X, X_grade)
    
    X_leader = pd.read_csv('ChocolatePipes_leaderboardTestData.csv')
    X_leader_id = X_leader['id']
    X_leader = X_leader.drop(useless_feature,axis=1)
    Xc_leader = X_leader[categorical_feature]
    Xn_leader = X_leader.drop(categorical_feature,axis=1)
    X_leader = fillMissingColumnSpecify(Xn_leader, Xc_leader)
    X_leader = filterUnlabel(X, X_leader)
    
    # BoostedDT
    df = pd.concat([X,y],axis=1)
    train, test = train_test_split(df, test_size=0.5, random_state=42)
    X_train = train.drop(['label'], axis=1)
    y_train = train['label']
    X_test = test.drop(['label'], axis=1)
    y_test = test['label']
    
# =============================================================================
    # tuning the best numBoostingIters and maxTreeDepth
    max_train_accuracy = 0
    max_test_accuracy = 0
    max_train_iter = 0
    max_test_iter = 0
    for iter_num in range(25):
        modelBoostedDT = BoostedDT(numBoostingIters=iter_num, maxTreeDepth=4)
        modelBoostedDT.fit(X_train,y_train)
        train_accuracy = (modelBoostedDT.predict(X_train).values.reshape(-1)
                    ==y_train.values).sum()/len(y_train)
        test_accuracy = (modelBoostedDT.predict(X_test).values.reshape(-1)
                    ==y_test.values).sum()/len(y_test)
        print(f'iteration: {iter_num}')
        print(f'train: {train_accuracy}')
        print(f'test: {test_accuracy}\n')
        if train_accuracy > max_train_accuracy:
            max_train_accuracy = train_accuracy
            max_train_iter = iter_num
        if test_accuracy > max_test_accuracy:
            max_test_accuracy = test_accuracy
            max_test_iter = iter_num
    print(f'max train performance: {max_train_accuracy}; iteration: {max_train_iter}')
    print(f'max test performance: {max_test_accuracy}; iteration: {max_test_iter}')
    modelBoostedDT = BoostedDT(numBoostingIters=max_test_iter, maxTreeDepth=4)
# =============================================================================


# =============================================================================
    # Boosted Decision Tree
    modelBoostedDT = BoostedDT(numBoostingIters=28, maxTreeDepth=18)
    modelBoostedDT.fit(X_train,y_train)
    train_accuracy_boostedDT = (modelBoostedDT.predict(X_train).values.reshape(-1)
                     ==y_train.values).sum()/len(y_train)
    test_accuracy_boostedDT = (modelBoostedDT.predict(X_test).values.reshape(-1)
                     ==y_test.values).sum()/len(y_test)
    print(f'train_accuracy_boostedDT: {train_accuracy_boostedDT}')
    print(f'test_accuracy_boostedDT: {test_accuracy_boostedDT}')
    
    
    modelBoostedDT.fit(X,y)
    
    y_grade_boost = modelBoostedDT.predict(X_grade)
    output_grade_boost = pd.concat([X_grade_id, y_grade_boost], axis=1)
    output_grade_boost.columns = ['id', 'label']
    output_grade_boost.to_csv('predictions-grading-BoostedDT.csv',index=False)
    
    y_leader_boost = modelBoostedDT.predict(X_leader)
    output_leader_boost = pd.concat([X_leader_id, y_leader_boost], axis=1)
    output_leader_boost.columns = ['id', 'label']
    output_leader_boost.to_csv('predictions-leaderboard-BoostedDT.csv',index=False)
# =============================================================================


# =============================================================================
    # Preprocessing for SVM and Logistic Regression
    X = X.to_numpy()
    y = y.to_numpy().flatten()
    
    X_train = X_train.to_numpy()
    y_train = y_train.to_numpy().flatten()
    X_test = X_test.to_numpy()
    y_test = y_test.to_numpy().flatten()
    
    X_grade = X_grade.to_numpy()
    X_leader = X_leader.to_numpy()
    
    standardizer = StandardScaler()
    Xstandardized = pd.DataFrame(standardizer.fit_transform(X))
    Xstandardized_train = pd.DataFrame(standardizer.fit_transform(X_train))
    Xstandardized_test = pd.DataFrame(standardizer.fit_transform(X_test))
    Xstandardized_grade = pd.DataFrame(standardizer.fit_transform(X_grade))
    Xstandardized_leader = pd.DataFrame(standardizer.fit_transform(X_leader))
# =============================================================================
    
# =============================================================================
#     # SVM
    svm_clf = SVC(gamma='auto')
    svm_clf.fit(Xstandardized_train, y_train)
    train_accuracy_svm = (svm_clf.predict(Xstandardized_train)
                            ==y_train).sum()/len(y_train)
    test_accuracy_svm = (svm_clf.predict(Xstandardized_test)
                            ==y_test).sum()/len(y_test)
    print(f'train_accuracy_svm: {train_accuracy_svm}')
    print(f'test_accuracy_svm: {test_accuracy_svm}')
    
    svm_clf.fit(Xstandardized, y)
    
    y_grade_svm = pd.DataFrame(svm_clf.predict(Xstandardized_grade))
    output_grade_svm = pd.concat([X_grade_id, y_grade_svm], axis=1)
    output_grade_svm.columns = ['id', 'label']
    output_grade_svm.to_csv('predictions-grading-SVC.csv',index=False)
    
    y_leader_svm = pd.DataFrame(svm_clf.predict(Xstandardized_leader))
    output_leader_svm = pd.concat([X_leader_id, y_leader_svm], axis=1)
    output_leader_svm.columns = ['id', 'label']
    output_leader_svm.to_csv('predictions-leaderboard-SVC.csv',index=False)
# =============================================================================
    
# =============================================================================
    # Logistic Regression
    logistic_clf = LogisticRegression(random_state=42,max_iter=120)
    logistic_clf.fit(Xstandardized_train, y_train)
    train_accuracy_logistic = (logistic_clf.predict(Xstandardized_train)
                                    ==y_train).sum()/len(y_train)
    test_accuracy_logistic = (logistic_clf.predict(Xstandardized_test)
                                    ==y_test).sum()/len(y_test)
    print(f'train_accuracy_logistic: {train_accuracy_logistic}')
    print(f'test_accuracy_logistic: {test_accuracy_logistic}')
    
    logistic_clf.fit(Xstandardized, y)
    
    y_grade_logistic = pd.DataFrame(logistic_clf.predict(Xstandardized_grade))
    output_grade_logistic = pd.concat([X_grade_id, y_grade_logistic], axis=1)
    output_grade_logistic.columns = ['id', 'label']
    output_grade_logistic.to_csv('predictions-grading-best.csv',index=False)
    
    y_leader_logistic = pd.DataFrame(logistic_clf.predict(Xstandardized_leader))
    output_leader_logistic = pd.concat([X_leader_id, y_leader_logistic], axis=1)
    output_leader_logistic.columns = ['id', 'label']
    output_leader_logistic.to_csv('predictions-leaderboard-best.csv',index=False)
# =============================================================================

In [8]:
train_chocolate()