In [6]:
import warnings


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import make_scorer, f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

from imblearn.metrics import geometric_mean_score, make_index_balanced_accuracy, specificity_score

from sklearn.preprocessing import LabelEncoder
from numpy import average
from sklearn.multioutput import MultiOutputClassifier

le = LabelEncoder()

SEED = 42
DATA_STRUCTURE = json.load(open('data_structure.json'))
DATA_KEYS = list(DATA_STRUCTURE.keys())
CLASSIFIERS = {
  "DecisionTreeClassifier":DecisionTreeClassifier,
}
OUTPUT_COLS = ['dataset', 'target', 'target_name', 'avg_precision', 'precision_folds', 'avg_recall', 'recall_folds', 'avg_f1', 'f1_folds', 'avg_geometric_mean', 'geometric_mean_folds','avg_specificity','specificity_folds']


In [2]:
def get_data(name):
  if name == 'imdb':
    movie_data = pd.read_csv('./Datasets/movie_metadata.csv')
    movie_data.drop_duplicates(inplace=True)
    movie_data.loc[:,'genres'] = movie_data.loc[:,'genres'].apply(lambda x: x.split('|'))

    genres = []
    for x in movie_data.genres:
      for g in x:
        if g not in genres:
          genres.append(g)

    for g in genres:
      movie_data.loc[:,g] = movie_data.loc[:,'genres'].apply(lambda x: int(g in x))

    movie_data.drop(["director_name","actor_2_name","movie_title","genres","actor_1_name","actor_3_name","plot_keywords","movie_imdb_link","cast_total_facebook_likes"],axis=1, inplace = True)
    movie_data.dropna(inplace=True)

    movie_data["imdb_score"] = movie_data["imdb_score"].apply(float)
    movie_data.loc[movie_data['imdb_score'].between(8,10), 'imdb_score'] = 100.0
    movie_data.loc[movie_data['imdb_score'].between(5,7.99), 'imdb_score'] = 50.0
    movie_data.loc[movie_data['imdb_score'].between(0,4.992), 'imdb_score'] = 30.0
    movie_data["imdb_score"] = movie_data["imdb_score"].apply(str)
    movie_data.loc[movie_data['imdb_score'] == "100.0", 'imdb_score'] = "GOOD"
    movie_data.loc[movie_data['imdb_score'] == "50.0", 'imdb_score'] = "AVERAGE"
    movie_data.loc[movie_data['imdb_score'] == "30.0", 'imdb_score'] = "BAD"

    ratings = movie_data["content_rating"].unique()
    for rate in ratings:
        if rate == "M":
            movie_data.loc[movie_data['content_rating'] == rate, 'content_rating'] = "PG"
        elif rate == "GP":
            movie_data.loc[movie_data['content_rating'] == rate, 'content_rating'] = "PG"
        elif rate == "Unrated":
            movie_data.loc[movie_data['content_rating'] == rate, 'content_rating'] = "Not Rated"
        elif rate == "Passed":
            movie_data.loc[movie_data['content_rating'] == rate, 'content_rating'] = "Approved"
        elif rate == "X":
            movie_data.loc[movie_data['content_rating'] == rate, 'content_rating'] = "NC-17"

    movie_data.loc[movie_data['gross'].between(0,15000000.0), 'gross'] = 0.0
    movie_data.loc[movie_data['gross'].between(1500000.01,762000000.0), 'gross'] = 1.0
    movie_data = pd.get_dummies(movie_data,columns=['color','language','country'],drop_first=True)

    target1 = le.fit_transform(movie_data["imdb_score"])
    target2 = le.fit_transform(movie_data["content_rating"])
    target3 = le.fit_transform(movie_data["gross"])
    
    X_final = movie_data.drop(["imdb_score", "content_rating", "gross"],axis=1)

  elif name == 'mushroom':
    mushroom_data = pd.read_csv('./Datasets/mushroom.csv')
    mushroom_data = mushroom_data[mushroom_data['stalk-root']!='?']
    mushroom_data = pd.get_dummies(mushroom_data,columns=['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
          'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
          'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
          'stalk-surface-below-ring', 'stalk-color-above-ring',
          'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
          'ring-type', 'spore-print-color'],drop_first=True)
    target1 = le.fit_transform(mushroom_data["Class"])
    target2 = le.fit_transform(mushroom_data["population"])
    target3 = le.fit_transform(mushroom_data["habitat"])
    X_final = mushroom_data.drop(["Class", "population", "habitat"],axis=1)
  
  elif name == 'census':
    census_data = pd.read_csv('./Datasets/census.csv',names=['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','income']) 
    census_data=census_data.drop_duplicates()
    for col in census_data.columns:
      census_data = census_data[census_data[col]!=' ?']
    census_data.loc[census_data.income==' <=50K.','income'] = ' <=50K' 
    census_data.loc[census_data.income==' >50K.','income'] = ' >50K'
    census_data = pd.get_dummies(census_data,columns=['education','occupation','relationship','race','sex','native-country'],drop_first=True)
    target1 = le.fit_transform(census_data["income"])
    target2 = le.fit_transform(census_data["marital-status"])
    target3 = le.fit_transform(census_data["workclass"])
    X_final = census_data.drop(["income", "marital-status", "workclass"],axis=1)
  
  elif name == 'bank':
    bank_data = pd.read_csv('./Datasets/bank-additional.csv')
    for col in bank_data.columns:
      bank_data = bank_data[bank_data[col]!='unknown']
    bank_data = pd.get_dummies(bank_data,columns=['job','education','default','contact','month','day_of_week','poutcome','marital'],drop_first=True)
    target1 = le.fit_transform(bank_data["y"])
    target2 = le.fit_transform(bank_data["loan"])
    target3 = le.fit_transform(bank_data["housing"])
    X_final = bank_data.drop(["y", "loan", "housing"],axis=1)

  elif name == 'intention':
    intention_data = pd.read_csv('./Datasets/online_shoppers_intention.csv')
    intention_data = intention_data.drop_duplicates()
    intention_data = intention_data[intention_data['VisitorType']!='Other']
    intention_data = pd.get_dummies(intention_data,columns=['Month','Weekend'],drop_first=True)        
    target1 = le.fit_transform(intention_data["Revenue"])
    target2 = le.fit_transform(intention_data["VisitorType"])
    target3 = le.fit_transform(intention_data["SpecialDay"])
    X_final = intention_data.drop(["Revenue", "VisitorType", "SpecialDay"],axis=1)

  elif name == 'anuran':
    anuran_data = pd.read_csv('./Datasets/Frogs_MFCCs.csv')
    anuran_data.drop(columns='RecordID',inplace=True)
    target1 = le.fit_transform(anuran_data["Family"])
    target2 = le.fit_transform(anuran_data["Genus"])
    target3 = le.fit_transform(anuran_data["Species"])
    X_final = anuran_data.drop(["Family", "Genus", "Species"],axis=1)
  
  elif name == 'telco':
    telco_data = pd.read_csv('./Datasets/telco.csv')
    telco_data.drop(columns=['customerID'],inplace=True)
    telco_data = telco_data.drop_duplicates()
    telco_data = telco_data[telco_data.TotalCharges!=' ']
    telco_data = pd.get_dummies(telco_data,columns=['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','PaperlessBilling'],drop_first=True)
    target1 = le.fit_transform(telco_data["Churn"])
    target2 = le.fit_transform(telco_data["Contract"])
    target3 = le.fit_transform(telco_data["PaymentMethod"])
    X_final = telco_data.drop(["Churn", "Contract", "PaymentMethod"],axis=1)

  elif name == 'paris':
    paris_data = pd.read_csv('./Datasets/ParisHousingClass.csv')
    paris_data.drop(columns='made',inplace=True)
    target1 = le.fit_transform(paris_data["category"])
    target2 = le.fit_transform(paris_data["isNewBuilt"])
    target3 = le.fit_transform(paris_data["hasStorageRoom"])
    X_final = paris_data.drop(["category", "isNewBuilt", "hasStorageRoom"],axis=1)
  
  elif name == 'smoking':
    smoking_data = pd.read_csv('./Datasets/smoking.csv')
    smoking_data.drop(columns=['ID','oral'],inplace=True)
    smoking_data = smoking_data.drop_duplicates()
    smoking_data = pd.get_dummies(smoking_data,columns=['gender'],drop_first=True)
    target1 = le.fit_transform(smoking_data["smoking"])
    target2 = le.fit_transform(smoking_data["tartar"])
    target3 = le.fit_transform(smoking_data["dental caries"])
    X_final = smoking_data.drop(["smoking", "tartar", "dental caries"],axis=1)
  
  elif name == 'flight':
    flight_data = pd.read_csv('./Datasets/flight.csv')
    flight_data.dropna(inplace=True)
    flight_data = pd.get_dummies(flight_data,columns=['Gender','Type of Travel'],drop_first=True)
    target1 = le.fit_transform(flight_data["satisfaction"])
    target2 = le.fit_transform(flight_data["Customer Type"])
    target3 = le.fit_transform(flight_data["Class"])
    X_final = flight_data.drop(["satisfaction", "Customer Type", "Class"],axis=1)
  
  else:
    raise ValueError('Incorrect dataset')
  
  return X_final, target1, target2, target3

In [38]:
import warnings
warnings.filterwarnings('ignore') 


def split(df, t, k):
  chunks = np.array_split(df, k)
  classes = np.array_split(t, k)
  return chunks, classes

metrics = {
  'prec_macro': [],
  'recall_macro': [],
  'f1_macro': [],
  'geometric_mean': [],
  'specificity': []
}

for name in CLASSIFIERS:
  # print(name+ ': Starting')
  outputDf = pd.DataFrame()
  for i in range(len(DATA_STRUCTURE)):
    # print('   '+ DATA_KEYS[i] + ': starting')
    df, target1, target2, target3 = get_data(DATA_KEYS[i])
    data, classes = split(df, target1, 5)

    for i in range(5):
      testData = data[i]
      testClasses = classes[i]
      
      # use the data at all positions other than i as the train data
      trainData = pd.concat((data[:i] + data[i+1:]))
      trainclasses = np.concatenate((classes[:i] + classes[i+1:]))
      clf = DecisionTreeClassifier()
      clf.fit(trainData, trainclasses)
      predicted = clf.predict(X=testData)

      prec_macro = precision_score(testClasses, predicted, average='macro', zero_division=0)
      recall_macro = recall_score(testClasses, predicted, average="macro", zero_division=0)
      f1_macro = f1_score(testClasses, predicted, average='macro', zero_division=0)
      geometric_mean = geometric_mean_score(testClasses, predicted)
      specificity = specificity_score(testClasses, predicted, average='macro')

      metrics['prec_macro'].append(prec_macro)
      metrics['recall_macro'].append(recall_macro)
      metrics['f1_macro'].append(f1_macro)
      metrics['geometric_mean'].append(geometric_mean)
      metrics['specificity'].append(specificity)

for metric in metrics:
  print(metric, sum(metrics[metric])/len(metrics[metric]))


prec_macro 0.7044932628716871
recall_macro 0.703390884122784
f1_macro 0.7006479368940736
geometric_mean 0.6547973751059588
specificity 0.7639526174154306


In [40]:
import warnings
warnings.filterwarnings('ignore') 
from imblearn.over_sampling import SMOTE, RandomOverSampler, KMeansSMOTE


def split(df, t, k):
  chunks = np.array_split(df, k)
  classes = np.array_split(t, k)
  return chunks, classes

metrics = {
  'prec_macro': [],
  'recall_macro': [],
  'f1_macro': [],
  'geometric_mean': [],
  'specificity': []
}

for name in CLASSIFIERS:
  print(name+ ': Starting')
  outputDf = pd.DataFrame()
  for i in range(len(DATA_STRUCTURE)):
    print('   '+ DATA_KEYS[i] + ': starting')
    df, target1, target2, target3 = get_data(DATA_KEYS[i])
    data, classes = split(df, target1, 5)

    for i in range(5):
      testData = data[i]
      testClasses = classes[i]
      
      # use the data at all positions other than i as the train data
      trainData = pd.concat((data[:i] + data[i+1:]))
      trainclasses = np.concatenate((classes[:i] + classes[i+1:]))

      sm = SMOTE(random_state=SEED, sampling_strategy='not majority')

      reSampled_df, resampled_target1 = sm.fit_resample(trainData, trainclasses)


      clf = DecisionTreeClassifier()
      clf.fit(trainData, trainclasses)
      predicted = clf.predict(X=testData)

      prec_macro = precision_score(testClasses, predicted, average='macro', zero_division=0)
      recall_macro = recall_score(testClasses, predicted, average="macro", zero_division=0)
      f1_macro = f1_score(testClasses, predicted, average='macro', zero_division=0)
      geometric_mean = geometric_mean_score(testClasses, predicted)
      specificity = specificity_score(testClasses, predicted, average='macro')

      metrics['prec_macro'].append(prec_macro)
      metrics['recall_macro'].append(recall_macro)
      metrics['f1_macro'].append(f1_macro)
      metrics['geometric_mean'].append(geometric_mean)
      metrics['specificity'].append(specificity)

for metric in metrics:
  print(metric, sum(metrics[metric])/len(metrics[metric]))


DecisionTreeClassifier: Starting
   imdb: starting
   mushroom: starting
   census: starting
   bank: starting
   intention: starting
   anuran: starting
   telco: starting
   paris: starting
   smoking: starting
   flight: starting
prec_macro 0.70582543180367
recall_macro 0.7007379061438858
f1_macro 0.699920898935508
geometric_mean 0.6521563098940139
specificity 0.7616400462748742
