In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

from sklearn.linear_model import LogisticRegression # Works
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # Works
from sklearn.neighbors import KNeighborsClassifier # Works
from sklearn.naive_bayes import GaussianNB # Works
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import make_scorer, f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

from imblearn.metrics import geometric_mean_score, make_index_balanced_accuracy, specificity_score

from sklearn.preprocessing import LabelEncoder
from numpy import average
from sklearn.multioutput import MultiOutputClassifier
from sklearn.exceptions import ConvergenceWarning
import warnings
from imblearn.over_sampling import SMOTE, RandomOverSampler, KMeansSMOTE
from imblearn.under_sampling import ClusterCentroids, CondensedNearestNeighbour, EditedNearestNeighbours, RandomUnderSampler
from matplotlib.pyplot import figure


le = LabelEncoder()

SEED = 42
DATA_STRUCTURE = json.load(open('data_structure.json'))
DATA_KEYS = list(DATA_STRUCTURE.keys())
CLASSIFIERS = {
  "GaussianNB":GaussianNB,
  "LinearDiscriminantAnalysis":LinearDiscriminantAnalysis,
  "KNeighborsClassifier":KNeighborsClassifier,
  "DecisionTreeClassifier":DecisionTreeClassifier,
  "LogisticRegression": LogisticRegression,
  # "SVC":SVC
  # Random forrest 
}
OUTPUT_COLS = ['dataset', 'target', 'target_name', 'avg_precision', 'precision_folds', 'avg_recall', 'recall_folds', 'avg_f1', 'f1_folds', 'avg_geometric_mean', 'geometric_mean_folds','avg_specificity','specificity_folds']

In [3]:
complexityDF = pd.read_csv(f'./out/complexityDF.csv', index_col=0)

complexityDFColumns = ['dataset','target1','target2', 'target3']
complexityDF.columns = complexityDFColumns
meltedComplexityDF = pd.melt(complexityDF, value_vars=['target1','target2', 'target3'], var_name='Target', id_vars=['dataset'])
meltedComplexityDF= meltedComplexityDF.sort_values(by=['value'], ignore_index=True, ascending=False)
chunks = np.array_split(meltedComplexityDF, 3)
finalComplexityDF = pd.DataFrame({})
for i, chunk in enumerate(chunks):
  chunk['chunk'] = [i]*len(chunk)
  labels = []
  for i in range(len(chunk['Target'].values)):
    labels.append(chunk['dataset'].values[i][0:3] + '_' +  chunk['Target'].values[i][-2:])
  # print('------------------------------------------------')
  finalComplexityDF = pd.concat((finalComplexityDF, chunk))
print(finalComplexityDF)


      dataset   Target     value  chunk
0    mushroom  target2  0.786322      0
1       telco  target3  0.637090      0
2        imdb  target2  0.567568      0
3      census  target2  0.509846      0
4        bank  target3  0.502913      0
5       paris  target3  0.498800      0
6       paris  target2  0.496300      0
7     smoking  target2  0.484625      0
8      census  target3  0.415984      0
9      flight  target3  0.363326      0
10      telco  target2  0.356348      1
11    smoking  target1  0.347998      1
12   mushroom  target3  0.346563      1
13    smoking  target3  0.328425      1
14      telco  target1  0.286163      1
15       bank  target2  0.270227      1
16     flight  target1  0.270035      1
17     census  target1  0.264239      1
18       imdb  target3  0.243511      1
19      paris  target1  0.223200      1
20  intention  target2  0.207192      2
21     flight  target2  0.194529      2
22       imdb  target1  0.187851      2
23  intention  target3  0.185995      2


In [7]:
def shuffle(df, t1, t2, t3):

  # print(t1.size)
  # print(t2.size)
  # print(t3)

  df['t1'] = t1.tolist()
  df['t2'] = t2.tolist()
  df['t3'] = t3.tolist()

  # use Seed
  shuffledDf = df.sample(frac=1, random_state=SEED).reset_index(drop=True)
  shuffledT1 = shuffledDf['t1']
  shuffledT2 = shuffledDf['t2']
  shuffledT3 = shuffledDf['t3']

  shuffledDf = shuffledDf.drop(['t1','t2','t3'], 1)
  return shuffledDf, shuffledT1, shuffledT2, shuffledT3

def split(df, t1, t2, t3, k):
  targets = np.vstack((t1, t2,t3)).T
  chunks = np.array_split(df, k)
  classes = np.array_split(targets, k)
  return chunks, classes

def combine(df,columns):
  for col in columns:
    df[col] = columns[col].tolist()
  return df
def initFoldResultObject():
  return {
    'target1' : {
      'precScores': [],
      'recallScores': [],
      'f1Scores': [],
      'geometricMeanScores': [],
      'specificityScores': []
    },
    'target3' : {
      'precScores': [],
      'recallScores': [],
      'f1Scores': [],
      'geometricMeanScores': [],
      'specificityScores': []
    },
    'target2' : {
      'precScores': [],
      'recallScores': [],
      'f1Scores': [],
      'geometricMeanScores': [],
      'specificityScores': []
    }
}
def get_data(name):
  orderedcomplexity = finalComplexityDF[finalComplexityDF['dataset']==name].sort_values(by=['value'], ascending=False)['Target'].values
  if name == 'imdb':
    movie_data = pd.read_csv('./Datasets/movie_metadata.csv')
    movie_data.drop_duplicates(inplace=True)
    movie_data.loc[:,'genres'] = movie_data.loc[:,'genres'].apply(lambda x: x.split('|'))

    genres = []
    for x in movie_data.genres:
      for g in x:
        if g not in genres:
          genres.append(g)

    for g in genres:
      movie_data.loc[:,g] = movie_data.loc[:,'genres'].apply(lambda x: int(g in x))

    movie_data.drop(["director_name","actor_2_name","movie_title","genres","actor_1_name","actor_3_name","plot_keywords","movie_imdb_link","cast_total_facebook_likes"],axis=1, inplace = True)
    movie_data.dropna(inplace=True)

    movie_data["imdb_score"] = movie_data["imdb_score"].apply(float)
    movie_data.loc[movie_data['imdb_score'].between(8,10), 'imdb_score'] = 100.0
    movie_data.loc[movie_data['imdb_score'].between(5,7.99), 'imdb_score'] = 50.0
    movie_data.loc[movie_data['imdb_score'].between(0,4.992), 'imdb_score'] = 30.0
    movie_data["imdb_score"] = movie_data["imdb_score"].apply(str)
    movie_data.loc[movie_data['imdb_score'] == "100.0", 'imdb_score'] = "GOOD"
    movie_data.loc[movie_data['imdb_score'] == "50.0", 'imdb_score'] = "AVERAGE"
    movie_data.loc[movie_data['imdb_score'] == "30.0", 'imdb_score'] = "BAD"

    ratings = movie_data["content_rating"].unique()
    for rate in ratings:
        if rate == "M":
            movie_data.loc[movie_data['content_rating'] == rate, 'content_rating'] = "PG"
        elif rate == "GP":
            movie_data.loc[movie_data['content_rating'] == rate, 'content_rating'] = "PG"
        elif rate == "Unrated":
            movie_data.loc[movie_data['content_rating'] == rate, 'content_rating'] = "Not Rated"
        elif rate == "Passed":
            movie_data.loc[movie_data['content_rating'] == rate, 'content_rating'] = "Approved"
        elif rate == "X":
            movie_data.loc[movie_data['content_rating'] == rate, 'content_rating'] = "NC-17"

    movie_data.loc[movie_data['gross'].between(0,15000000.0), 'gross'] = 0.0
    movie_data.loc[movie_data['gross'].between(1500000.01,762000000.0), 'gross'] = 1.0
    movie_data = pd.get_dummies(movie_data,columns=['color','language','country'],drop_first=True)

    target1 = le.fit_transform(movie_data[DATA_STRUCTURE[name][orderedcomplexity[0]]])
    target2 = le.fit_transform(movie_data[DATA_STRUCTURE[name][orderedcomplexity[1]]])
    target3 = le.fit_transform(movie_data[DATA_STRUCTURE[name][orderedcomplexity[2]]])
    
    X_final = movie_data.drop(["imdb_score", "content_rating", "gross"],axis=1)

  elif name == 'mushroom':
    mushroom_data = pd.read_csv('./Datasets/mushroom.csv')
    mushroom_data = mushroom_data[mushroom_data['stalk-root']!='?']
    mushroom_data = pd.get_dummies(mushroom_data,columns=['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
          'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
          'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
          'stalk-surface-below-ring', 'stalk-color-above-ring',
          'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
          'ring-type', 'spore-print-color'],drop_first=True)
    target1 = le.fit_transform(mushroom_data[DATA_STRUCTURE[name][orderedcomplexity[0]]])
    target2 = le.fit_transform(mushroom_data[DATA_STRUCTURE[name][orderedcomplexity[1]]])
    target3 = le.fit_transform(mushroom_data[DATA_STRUCTURE[name][orderedcomplexity[2]]])
    X_final = mushroom_data.drop(["Class", "population", "habitat"],axis=1)
  
  elif name == 'census':
    census_data = pd.read_csv('./Datasets/census.csv',names=['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','income']) 
    census_data=census_data.drop_duplicates()
    for col in census_data.columns:
      census_data = census_data[census_data[col]!=' ?']
    census_data.loc[census_data.income==' <=50K.','income'] = ' <=50K' 
    census_data.loc[census_data.income==' >50K.','income'] = ' >50K'
    census_data = pd.get_dummies(census_data,columns=['education','occupation','relationship','race','sex','native-country'],drop_first=True)
    target1 = le.fit_transform(census_data[DATA_STRUCTURE[name][orderedcomplexity[0]]])
    target2 = le.fit_transform(census_data[DATA_STRUCTURE[name][orderedcomplexity[1]]])
    target3 = le.fit_transform(census_data[DATA_STRUCTURE[name][orderedcomplexity[2]]])
    X_final = census_data.drop(["income", "marital-status", "workclass"],axis=1)
  
  elif name == 'bank':
    bank_data = pd.read_csv('./Datasets/bank-additional.csv')
    for col in bank_data.columns:
      bank_data = bank_data[bank_data[col]!='unknown']
    bank_data = pd.get_dummies(bank_data,columns=['job','education','default','contact','month','day_of_week','poutcome','marital'],drop_first=True)
    target1 = le.fit_transform(bank_data[DATA_STRUCTURE[name][orderedcomplexity[0]]])
    target2 = le.fit_transform(bank_data[DATA_STRUCTURE[name][orderedcomplexity[1]]])
    target3 = le.fit_transform(bank_data[DATA_STRUCTURE[name][orderedcomplexity[2]]])
    X_final = bank_data.drop(["y", "loan", "housing"],axis=1)

  elif name == 'intention':
    intention_data = pd.read_csv('./Datasets/online_shoppers_intention.csv')
    intention_data = intention_data.drop_duplicates()
    intention_data = intention_data[intention_data['VisitorType']!='Other']
    intention_data = pd.get_dummies(intention_data,columns=['Month','Weekend'],drop_first=True)        
    target1 = le.fit_transform(intention_data[DATA_STRUCTURE[name][orderedcomplexity[0]]])
    target2 = le.fit_transform(intention_data[DATA_STRUCTURE[name][orderedcomplexity[1]]])
    target3 = le.fit_transform(intention_data[DATA_STRUCTURE[name][orderedcomplexity[2]]])
    X_final = intention_data.drop(["Revenue", "VisitorType", "SpecialDay"],axis=1)

  elif name == 'anuran':
    anuran_data = pd.read_csv('./Datasets/Frogs_MFCCs.csv')
    anuran_data.drop(columns='RecordID',inplace=True)
    target1 = le.fit_transform(anuran_data[DATA_STRUCTURE[name][orderedcomplexity[0]]])
    target2 = le.fit_transform(anuran_data[DATA_STRUCTURE[name][orderedcomplexity[1]]])
    target3 = le.fit_transform(anuran_data[DATA_STRUCTURE[name][orderedcomplexity[2]]])
    X_final = anuran_data.drop(["Family", "Genus", "Species"],axis=1)
  
  elif name == 'telco':
    telco_data = pd.read_csv('./Datasets/telco.csv')
    telco_data.drop(columns=['customerID'],inplace=True)
    telco_data = telco_data.drop_duplicates()
    telco_data = telco_data[telco_data.TotalCharges!=' ']
    telco_data = pd.get_dummies(telco_data,columns=['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','PaperlessBilling'],drop_first=True)
    target1 = le.fit_transform(telco_data[DATA_STRUCTURE[name][orderedcomplexity[0]]])
    target2 = le.fit_transform(telco_data[DATA_STRUCTURE[name][orderedcomplexity[1]]])
    target3 = le.fit_transform(telco_data[DATA_STRUCTURE[name][orderedcomplexity[2]]])
    X_final = telco_data.drop(["Churn", "Contract", "PaymentMethod"],axis=1)

  elif name == 'paris':
    paris_data = pd.read_csv('./Datasets/ParisHousingClass.csv')
    paris_data.drop(columns='made',inplace=True)
    target1 = le.fit_transform(paris_data[DATA_STRUCTURE[name][orderedcomplexity[0]]])
    target2 = le.fit_transform(paris_data[DATA_STRUCTURE[name][orderedcomplexity[1]]])
    target3 = le.fit_transform(paris_data[DATA_STRUCTURE[name][orderedcomplexity[2]]])
    X_final = paris_data.drop(["category", "isNewBuilt", "hasStorageRoom"],axis=1)
  
  elif name == 'smoking':
    smoking_data = pd.read_csv('./Datasets/smoking.csv')
    smoking_data.drop(columns=['ID','oral'],inplace=True)
    smoking_data = smoking_data.drop_duplicates()
    smoking_data = pd.get_dummies(smoking_data,columns=['gender'],drop_first=True)
    target1 = le.fit_transform(smoking_data[DATA_STRUCTURE[name][orderedcomplexity[0]]])
    target2 = le.fit_transform(smoking_data[DATA_STRUCTURE[name][orderedcomplexity[1]]])
    target3 = le.fit_transform(smoking_data[DATA_STRUCTURE[name][orderedcomplexity[2]]])
    X_final = smoking_data.drop(["smoking", "tartar", "dental caries"],axis=1)
  
  elif name == 'flight':
    flight_data = pd.read_csv('./Datasets/flight.csv')
    flight_data.dropna(inplace=True)
    flight_data = pd.get_dummies(flight_data,columns=['Gender','Type of Travel'],drop_first=True)
    target1 = le.fit_transform(flight_data[DATA_STRUCTURE[name][orderedcomplexity[0]]])
    target2 = le.fit_transform(flight_data[DATA_STRUCTURE[name][orderedcomplexity[1]]])
    target3 = le.fit_transform(flight_data[DATA_STRUCTURE[name][orderedcomplexity[2]]])
    X_final = flight_data.drop(["satisfaction", "Customer Type", "Class"],axis=1)
  
  else:
    raise ValueError('Incorrect dataset')
  
  return X_final, target1, target2, target3

# Original

In [None]:
from numpy import average
from sklearn.multioutput import MultiOutputClassifier
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings('ignore') 

# Because Scikit-learn's multi output classifier doesn't support metrics for multi label classification, we implemented our own.
def crossValidate(classifier, df, t1, t2, t3, k, dataSetName):
  df, t1, t2, t3 = shuffle(df, t1, t2, t3)
  # Split the data and classes into k subsets
  data, classes = split(df, t1, t2, t3, k)
  foldResults = {
    'target1' : {
      'precScores': [],
      'recallScores': [],
      'f1Scores': [],
      'geometricMeanScores': [],
      'specificityScores': []
      
    },
    'target3' : {
      'precScores': [],
      'recallScores': [],
      'f1Scores': [],
      'geometricMeanScores': [],
      'specificityScores': []
    },
    'target2' : {
      'precScores': [],
      'recallScores': [],
      'f1Scores': [],
      'geometricMeanScores': [],
      'specificityScores': []
    }
  }

  for i in range(k):
    # use the data at position i as the test data
    testData = data[i]
    testClasses = classes[i]

    # use the data at all positions other than i as the train data
    trainData = pd.concat((data[:i] + data[i+1:]))
    trainclasses = np.concatenate((classes[:i] + classes[i+1:]))
    
    # Create a multioutput classifier from the provided classifier
    clf = MultiOutputClassifier(classifier())



    # Fit the model to Training data
    clf.fit(trainData, trainclasses)

    # Use the model on the test data
    testResults = clf.predict(X=testData)

    # Since the results for multi label classification come as a list of size (n, 3), decompose the list into the results for each target
    t1Results = testResults[:, 0]
    t2Results = testResults[:, 1]
    t3Results = testResults[:, 2]

    # Do the same for the actual classes
    t1Classes = testClasses[:, 0]
    t2Classes = testClasses[:, 1]
    t3Classes = testClasses[:, 2]

    actualVsPredicted = {
      'target1':(t1Classes, t1Results),
      'target2':(t2Classes, t2Results),
      'target3':(t3Classes, t3Results)
    }
    for target in actualVsPredicted:
      (actual, predicted) = actualVsPredicted[target]
      # Calculate the scores for each metric
      prec_macro = precision_score(actual, predicted, average='macro', zero_division=0)
      recall_macro = recall_score(actual, predicted, average="macro", zero_division=0)
      f1_macro = f1_score(actual, predicted, average='macro')
      geometric_mean = geometric_mean_score(actual, predicted)
      specificity = specificity_score(actual, predicted, average='macro')

      # Add the scores for the current train/test split to the list of other scores for the respective target
      foldResults[target]['precScores'].append(prec_macro)
      foldResults[target]['recallScores'].append(recall_macro)
      foldResults[target]['f1Scores'].append(f1_macro)
      foldResults[target]['geometricMeanScores'].append(geometric_mean)
      foldResults[target]['specificityScores'].append(specificity)
  

  df = pd.DataFrame(columns=OUTPUT_COLS)
  for target in foldResults:
    stats = foldResults[target]
    row = [dataSetName, target,DATA_STRUCTURE[dataSetName][target]]
    for stat in stats:
      row.append(average(stats[stat]))
      row.append(stats[stat])
    pdRow = pd.DataFrame(row).T
    pdRow.columns = OUTPUT_COLS
    df = pd.concat([df,pdRow], axis=0, ignore_index=True)
  return df

aggregateMetrics = {}

for name in CLASSIFIERS:
  print(name+ ': Starting')
  outputDf = pd.DataFrame()
  for i in range(len(DATA_STRUCTURE)):
    if i < 80:
      print('   '+ DATA_KEYS[i] + ': starting')
      df, target1, target2, target3 = get_data(DATA_KEYS[i] )
      outputDf = pd.concat([outputDf, crossValidate(CLASSIFIERS[name],df, target1, target2, target3, 5, DATA_KEYS[i])], ignore_index=True)
    # print(outputDf)
  print(name+ ': done')
  print('The average of each metric after cross validation per target across all data sets')
  result = outputDf.groupby('target')['avg_precision', 'avg_recall', 'avg_f1', 'avg_geometric_mean','avg_specificity'].mean()
  print(result)
  aggregateMetrics[name] = result
  outputDf.to_csv(path_or_buf=f'./out/sorted-descending-complexity/original/{name}.csv')
  

# Addressing Imbalance for Target1

In [8]:
warnings.filterwarnings('ignore') 

plt.figure(figsize=(20, 50), dpi=80)
def crossValidateSmote(classifier, df, t1, t2, t3, k, dataSetName):
  
  df, t1, t2, t3 = shuffle(df, t1, t2, t3)
  data, classes = split(df, t1, t2, t3, k)
  foldResults = initFoldResultObject()

  for i in range(k):
    # use the data at position i as the test data
    testData = data[i]
    testClasses = classes[i]

    # use the data at all positions other than i as the train data
    trainData = pd.concat((data[:i] + data[i+1:]))
    trainClasses = np.concatenate((classes[:i] + classes[i+1:]))

    # Add target2 and target3 as features to the dataframe for Oversampling of target1
    testDataCombined = combine(trainData, {'t2': trainClasses[:, 1], 't3':trainClasses[:, 2]})

    # Apply SMOTE to the training data
    sm = SMOTE(random_state=SEED, sampling_strategy='not majority')
    reSampled_df, resampled_target1 = sm.fit_resample(testDataCombined, trainClasses[:, 0])

    # Build the training outpus classes using the resampled target1 values (generated as classes), and the target2 and target3 values (generated as features) from smote 
    resampledTrainClasses = np.vstack((resampled_target1, reSampled_df['t2'], reSampled_df['t3'])).T
    reSampled_df = reSampled_df.drop(['t2','t3'], 1)
  
    # Create and fit a multioutput classifier from the provided classifier
    clf = MultiOutputClassifier(classifier())
    clf.fit(reSampled_df, resampledTrainClasses)

    # Use the model on the test data
    testResults = clf.predict(X=testData)

    actualVsPredicted = {
      'target1':(testClasses[:, 0],  testResults[:, 0]),
      'target2':(testClasses[:, 1], testResults[:, 1]),
      'target3':(testClasses[:, 2], testResults[:, 2])
    }

    for target in actualVsPredicted:
      (actual, predicted) = actualVsPredicted[target]
      # Add the scores for the current train/test split to the list of other scores for the respective target
      foldResults[target]['precScores'].append(precision_score(actual, predicted, average='macro', zero_division=0))
      foldResults[target]['recallScores'].append(recall_score(actual, predicted, average="macro", zero_division=0))
      foldResults[target]['f1Scores'].append(f1_score(actual, predicted, average='macro'))
      foldResults[target]['geometricMeanScores'].append(geometric_mean_score(actual, predicted))
      foldResults[target]['specificityScores'].append(specificity_score(actual, predicted, average='macro'))

  df = pd.DataFrame(columns=OUTPUT_COLS)
  for target in foldResults:
    stats = foldResults[target]
    row = [dataSetName,target, DATA_STRUCTURE[dataSetName][target]]
    for stat in stats:
      row.append(average(stats[stat]))
      row.append(stats[stat])
    pdRow = pd.DataFrame(row).T
    pdRow.columns = OUTPUT_COLS
    df = pd.concat([df,pdRow], axis=0, ignore_index=True)
  return df

aggregateMetricsAdjusted = {}
for name in CLASSIFIERS:
  print(name+ ': Starting')
  outputDf = pd.DataFrame()
  for i in range(len(DATA_STRUCTURE)):
    print('   '+ DATA_KEYS[i] + ': starting')
    df, target1, target2, target3 = get_data(DATA_KEYS[i])
    outputDf = pd.concat([outputDf, crossValidateSmote(CLASSIFIERS[name],df, target1, target2, target3, 5, DATA_KEYS[i])], ignore_index=True)
    # print(outputDf)
    
  print(name+ ': done')
  print('The average of each metric after cross validation per target across all data sets')
  result = outputDf.groupby('target')['avg_precision', 'avg_recall', 'avg_f1', 'avg_geometric_mean', 'avg_specificity'].mean()
  aggregateMetricsAdjusted[name] = result
  outputDf.to_csv(path_or_buf=f'./out/sorted-decreasing-complexity/balanced-target1/{name}.csv')

GaussianNB: Starting
   imdb: starting
   mushroom: starting
   census: starting
   bank: starting
   intention: starting
   anuran: starting
   telco: starting
   paris: starting
   smoking: starting
   flight: starting
GaussianNB: done
The average of each metric after cross validation per target across all data sets
LinearDiscriminantAnalysis: Starting
   imdb: starting
   mushroom: starting
   census: starting
   bank: starting
   intention: starting
   anuran: starting
   telco: starting
   paris: starting
   smoking: starting
   flight: starting
LinearDiscriminantAnalysis: done
The average of each metric after cross validation per target across all data sets
KNeighborsClassifier: Starting
   imdb: starting
   mushroom: starting
   census: starting
   bank: starting
   intention: starting
   anuran: starting
   telco: starting
   paris: starting
   smoking: starting
   flight: starting
KNeighborsClassifier: done
The average of each metric after cross validation per target across al

<Figure size 1600x4000 with 0 Axes>

# Addressing Imbalance for Target1 and target2

In [11]:
warnings.filterwarnings('ignore') 

plt.figure(figsize=(20, 50), dpi=80)
def crossValidateSmote(classifier, df, t1, t2, t3, k, dataSetName):
  
  df, t1, t2, t3 = shuffle(df, t1, t2, t3)
  data, classes = split(df, t1, t2, t3, k)
  foldResults = initFoldResultObject()

  for i in range(k):
    # use the data at position i as the test data
    testData = data[i]
    testClasses = classes[i]

    # use the data at all positions other than i as the train data
    trainData = pd.concat((data[:i] + data[i+1:]))
    trainClasses = np.concatenate((classes[:i] + classes[i+1:]))

    # Add target2 and target3 as features to the dataframe for Oversampling of target1
    testDataCombined = combine(trainData, {'t2': trainClasses[:, 1], 't3':trainClasses[:, 2]})

    # Apply SMOTE to the training data
    sm = SMOTE(random_state=SEED, sampling_strategy='not majority')


    reSampled_df_t1, reSampled_t1_target1 = sm.fit_resample(testDataCombined, trainClasses[:, 0])
    reSampled_df_t1['t1'] = reSampled_t1_target1
    reSampled_df_t1_classes = np.vstack((reSampled_t1_target1, reSampled_df_t1['t2'], reSampled_df_t1['t3'])).T


    #remove target 2 from the features before oversampling for target 2
    reSampled_df_t1_target2 = reSampled_df_t1['t2']
    reSampled_df_t1 = reSampled_df_t1.drop(['t2'], 1)

    reSampled_df_t1_t2, reSampled_t1_t2_target2 = sm.fit_resample(reSampled_df_t1, reSampled_df_t1_target2)
    reSampled_df_t1_t2_classes = np.vstack((reSampled_df_t1_t2['t1'], reSampled_t1_t2_target2, reSampled_df_t1_t2['t3'])).T
    reSampled_df_t1_t2 = reSampled_df_t1_t2.drop(['t1','t3'], 1)
  
    # Create and fit a multioutput classifier from the provided classifier
    clf = MultiOutputClassifier(classifier())
    clf.fit(reSampled_df_t1_t2, reSampled_df_t1_t2_classes)

    # Use the model on the test data
    testResults = clf.predict(X=testData)

    actualVsPredicted = {
      'target1':(testClasses[:, 0],  testResults[:, 0]),
      'target2':(testClasses[:, 1], testResults[:, 1]),
      'target3':(testClasses[:, 2], testResults[:, 2])
    }

    for target in actualVsPredicted:
      (actual, predicted) = actualVsPredicted[target]
      # Add the scores for the current train/test split to the list of other scores for the respective target
      foldResults[target]['precScores'].append(precision_score(actual, predicted, average='macro', zero_division=0))
      foldResults[target]['recallScores'].append(recall_score(actual, predicted, average="macro", zero_division=0))
      foldResults[target]['f1Scores'].append(f1_score(actual, predicted, average='macro'))
      foldResults[target]['geometricMeanScores'].append(geometric_mean_score(actual, predicted))
      foldResults[target]['specificityScores'].append(specificity_score(actual, predicted, average='macro'))

  df = pd.DataFrame(columns=OUTPUT_COLS)
  for target in foldResults:
    stats = foldResults[target]
    row = [dataSetName,target, DATA_STRUCTURE[dataSetName][target]]
    for stat in stats:
      row.append(average(stats[stat]))
      row.append(stats[stat])
    pdRow = pd.DataFrame(row).T
    pdRow.columns = OUTPUT_COLS
    df = pd.concat([df,pdRow], axis=0, ignore_index=True)
  return df

aggregateMetricsAdjusted = {}
for name in CLASSIFIERS:
  print(name+ ': Starting')
  outputDf = pd.DataFrame()
  for i in range(len(DATA_STRUCTURE)):
    print('   '+ DATA_KEYS[i] + ': starting')
    df, target1, target2, target3 = get_data(DATA_KEYS[i])
    outputDf = pd.concat([outputDf, crossValidateSmote(CLASSIFIERS[name],df, target1, target2, target3, 5, DATA_KEYS[i])], ignore_index=True)
    # print(outputDf)
    
  print(name+ ': done')
  print('The average of each metric after cross validation per target across all data sets')
  result = outputDf.groupby('target')['avg_precision', 'avg_recall', 'avg_f1', 'avg_geometric_mean', 'avg_specificity'].mean()
  aggregateMetricsAdjusted[name] = result
  outputDf.to_csv(path_or_buf=f'./out/sorted-decreasing-complexity/balanced-target1-target2/{name}.csv')

GaussianNB: Starting
   imdb: starting
   mushroom: starting
   census: starting
   bank: starting
   intention: starting
   anuran: starting
   telco: starting
   paris: starting
   smoking: starting
   flight: starting
GaussianNB: done
The average of each metric after cross validation per target across all data sets
LinearDiscriminantAnalysis: Starting
   imdb: starting
   mushroom: starting
   census: starting
   bank: starting
   intention: starting
   anuran: starting
   telco: starting
   paris: starting
   smoking: starting
   flight: starting
LinearDiscriminantAnalysis: done
The average of each metric after cross validation per target across all data sets
KNeighborsClassifier: Starting
   imdb: starting
   mushroom: starting
   census: starting
   bank: starting
   intention: starting
   anuran: starting
   telco: starting
   paris: starting
   smoking: starting
   flight: starting
KNeighborsClassifier: done
The average of each metric after cross validation per target across al

<Figure size 1600x4000 with 0 Axes>

# Addressing Imbalance for Target1, Target3, and Target3

In [10]:
warnings.filterwarnings('ignore') 

plt.figure(figsize=(20, 50), dpi=80)
def crossValidateSmote(classifier, df, t1, t2, t3, k, dataSetName):
  
  df, t1, t2, t3 = shuffle(df, t1, t2, t3)
  data, classes = split(df, t1, t2, t3, k)
  foldResults = initFoldResultObject()

  for i in range(k):
    # use the data at position i as the test data
    testData = data[i]
    testClasses = classes[i]

    # use the data at all positions other than i as the train data
    trainData = pd.concat((data[:i] + data[i+1:]))
    trainClasses = np.concatenate((classes[:i] + classes[i+1:]))

    # Add target2 and target3 as features to the dataframe for Oversampling of target1
    testDataCombined = combine(trainData, {'t2': trainClasses[:, 1], 't3':trainClasses[:, 2]})

    # Apply SMOTE to the training data
    sm = SMOTE(random_state=SEED, sampling_strategy='not majority')


    reSampled_df_t1, reSampled_t1_target1 = sm.fit_resample(testDataCombined, trainClasses[:, 0])
    reSampled_df_t1['t1'] = reSampled_t1_target1
    reSampled_df_t1_classes = np.vstack((reSampled_t1_target1, reSampled_df_t1['t2'], reSampled_df_t1['t3'])).T


    #remove target 2 from the features before oversampling for target 2
    reSampled_df_t1_target2 = reSampled_df_t1['t2']
    reSampled_df_t1 = reSampled_df_t1.drop(['t2'], 1)
    reSampled_df_t1_t2, reSampled_t1_t2_target2 = sm.fit_resample(reSampled_df_t1, reSampled_df_t1_target2)
    reSampled_df_t1_t2_classes = np.vstack((reSampled_df_t1_t2['t1'], reSampled_t1_t2_target2, reSampled_df_t1_t2['t3'])).T

    # Add the resampled target values to the attribute dataset
    reSampled_df_t1_t2['t2'] = reSampled_t1_t2_target2

    #remove target 3 from the features before oversampling for target 3
    reSampled_df_t1_t2_target3 = reSampled_df_t1_t2['t3']
    reSampled_df_t1_t2 = reSampled_df_t1_t2.drop(['t3'], 1)
    reSampled_df_t1_t2_t3, reSampled_t1_t2_t3_target3 = sm.fit_resample(reSampled_df_t1_t2, reSampled_df_t1_t2_target3)
    reSampled_df_t1_t2_t3_classes = np.vstack((reSampled_df_t1_t2_t3['t1'], reSampled_df_t1_t2_t3['t2'], reSampled_t1_t2_t3_target3)).T
    
    reSampled_df_t1_t2_t3 = reSampled_df_t1_t2_t3.drop(['t1','t2'], 1)

    # Create and fit a multioutput classifier from the provided classifier
    clf = MultiOutputClassifier(classifier())
    clf.fit(reSampled_df_t1_t2_t3, reSampled_df_t1_t2_t3_classes)

    # Use the model on the test data
    testResults = clf.predict(X=testData)

    actualVsPredicted = {
      'target1':(testClasses[:, 0],  testResults[:, 0]),
      'target2':(testClasses[:, 1], testResults[:, 1]),
      'target3':(testClasses[:, 2], testResults[:, 2])
    }

    for target in actualVsPredicted:
      (actual, predicted) = actualVsPredicted[target]
      # Add the scores for the current train/test split to the list of other scores for the respective target
      foldResults[target]['precScores'].append(precision_score(actual, predicted, average='macro', zero_division=0))
      foldResults[target]['recallScores'].append(recall_score(actual, predicted, average="macro", zero_division=0))
      foldResults[target]['f1Scores'].append(f1_score(actual, predicted, average='macro'))
      foldResults[target]['geometricMeanScores'].append(geometric_mean_score(actual, predicted))
      foldResults[target]['specificityScores'].append(specificity_score(actual, predicted, average='macro'))

  df = pd.DataFrame(columns=OUTPUT_COLS)
  for target in foldResults:
    stats = foldResults[target]
    row = [dataSetName,target, DATA_STRUCTURE[dataSetName][target]]
    for stat in stats:
      row.append(average(stats[stat]))
      row.append(stats[stat])
    pdRow = pd.DataFrame(row).T
    pdRow.columns = OUTPUT_COLS
    df = pd.concat([df,pdRow], axis=0, ignore_index=True)
  return df

aggregateMetricsAdjusted = {}
for name in CLASSIFIERS:
  print(name+ ': Starting')
  outputDf = pd.DataFrame()
  for i in range(len(DATA_STRUCTURE)):
    print('   '+ DATA_KEYS[i] + ': starting')
    df, target1, target2, target3 = get_data(DATA_KEYS[i])
    outputDf = pd.concat([outputDf, crossValidateSmote(CLASSIFIERS[name],df, target1, target2, target3, 5, DATA_KEYS[i])], ignore_index=True)
    # print(outputDf)
    
  print(name+ ': done')
  print('The average of each metric after cross validation per target across all data sets')
  result = outputDf.groupby('target')['avg_precision', 'avg_recall', 'avg_f1', 'avg_geometric_mean', 'avg_specificity'].mean()
  aggregateMetricsAdjusted[name] = result
  outputDf.to_csv(path_or_buf=f'./out/sorted-decreasing-complexity/balanced-target1-target2-target3/{name}.csv')

GaussianNB: Starting
   imdb: starting
   mushroom: starting
   census: starting
   bank: starting
   intention: starting
   anuran: starting
   telco: starting
   paris: starting
   smoking: starting
   flight: starting
GaussianNB: done
The average of each metric after cross validation per target across all data sets
LinearDiscriminantAnalysis: Starting
   imdb: starting
   mushroom: starting
   census: starting
   bank: starting
   intention: starting
   anuran: starting
   telco: starting
   paris: starting
   smoking: starting
   flight: starting
LinearDiscriminantAnalysis: done
The average of each metric after cross validation per target across all data sets
KNeighborsClassifier: Starting
   imdb: starting
   mushroom: starting
   census: starting
   bank: starting
   intention: starting
   anuran: starting
   telco: starting
   paris: starting
   smoking: starting
   flight: starting
KNeighborsClassifier: done
The average of each metric after cross validation per target across al

<Figure size 1600x4000 with 0 Axes>