In [2]:
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier 
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.tree import _tree
import numpy as np
import pandas as pd
import csv
import math
import copy
import scipy.stats as st
#from argument import *

In [8]:
# Map string values to integer
def create_dictionary(feature, df):
    number = 0
    map_dictionary = {}
    for category in df[feature].unique():
        map_dictionary[category] = number
        number += 1
    #print(map_dictionary)
    return map_dictionary

In [33]:
# Cirrhosis code
df = pd.read_csv('Data/cirrhosis.csv')

categories = ["Status", "Drug", "Sex", "Ascites", 
              "Hepatomegaly", "Spiders", "Edema"]

for category in categories:
    df[category] = df[category].map(create_dictionary(category, df))


missing_percentages = df.isna().mean() * 100
#print(missing_percentages)

# Get the columns with missing percentages greater than 5% (Cholesterol, Copper, Alk_Phos, SGOT and Tryglicerides)
columns_to_drop = missing_percentages[missing_percentages > 5].index

# Drop the columns from the dataframe
df = df.drop(columns_to_drop, axis=1)

# Drop remaining rows with NA
df = df.dropna()

# Numeric file
df.to_csv("Data/cirrhosis_numeric.csv")

# Final df info
print(df['Stage'].value_counts())
print("Shape", df.shape)

Stage
3.0    153
4.0    141
2.0     86
1.0     19
Name: count, dtype: int64
Shape (399, 15)


In [23]:
# Cars code
df = pd.read_csv('original data/cars.csv')

# Map string values to code
# df['buying'] = df['buying'].map({"low":0,"med":1,"high":2,"vhigh":3})
# df['maint'] = df['maint'].map({"low":0,"med":1,"high":2,"vhigh":3})
# df['lug_boot'] = df['lug_boot'].map({"small":0,"med":1,"big":2})
# df['safety'] = df['safety'].map({"low":0,"med":1,"high":2})
# df['doors'].replace('5more', 5, inplace=True)
# df['persons'].replace('more', 5, inplace=True)
# df['persons'].replace('2', 2, inplace=True)
# df['persons'].replace('4', 5, inplace=True)
# df['doors'].replace('2', 2, inplace=True)
# df['doors'].replace('3', 3, inplace=True)
# df['doors'].replace('4', 4, inplace=True)

df['class'] = df['class'].map({"unacc":0,"acc":1,"good":2,"vgood":3})
# Group positive classes together
df['class'].replace(2, 1, inplace=True)
df['class'].replace(3, 1, inplace=True)

feature_names=["buying", "maint", "doors", "persons", "lug_boot", "safety"]
class_names=["unacc", "acc", "good", "vgood"]

# missing_vals = df.isna().sum()
# view the sum of missing values
# print(missing_vals)
# df.describe()

print("Total positive class:", df['class'].sum()/df.shape[0] * 100)
print("Total negative class:", (df.shape[0] - df['class'].sum()) /df.shape[0] * 100)

# Export to csv to import in the framework
df.to_csv("original data/cars_cleaned.csv")
df.shape

Total positive class: 29.976851851851855
Total negative class: 70.02314814814815


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['class'].replace(2, 1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['class'].replace(3, 1, inplace=True)


(1728, 7)

In [14]:
# Myocardial code

df = pd.read_csv('original data/myocardial.csv')

# Based on this, column 120, ZSN, was chosen as target since it is better balanced
# value_percentages = df.iloc[:, 112:].apply(pd.value_counts, normalize=True)
# print(value_percentages)

df = df.drop(df.columns[0], axis=1) # Drop ID
df = df.drop(df.columns[111:119], axis=1) # Drop predictors not used
df = df.drop(df.columns[112:], axis=1) # Drop predictors not used

missing_percentages = df.isna().mean() * 100

# Get the columns with missing percentages greater than 5%
columns_to_drop = missing_percentages[missing_percentages > 5].index

# Drop the columns from the dataframe
df = df.drop(columns_to_drop, axis=1)

#list(df.columns)
df = df.dropna()

feature_names = list(df.columns)[:-1]
class_names = ["no", "yes"]


print("Total positive class:", df['ZSN'].sum()/df.shape[0] * 100)
print("Total negative class:", (df.shape[0] - df['ZSN'].sum()) /df.shape[0] * 100)

# # Export to csv to import in the framework
df.to_csv("original data/myocardial_cleaned.csv")
df.shape

Total positive class: 23.955431754874652
Total negative class: 76.04456824512535


(1436, 60)

In [13]:
df.describe()

Unnamed: 0,AGE,SEX,INF_ANAM,FK_STENOK,IBS_POST,GB,SIM_GIPERT,ZSN_A,nr_11,nr_01,...,NA_R_1_n,NOT_NA_1_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n,ZSN
count,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,...,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0
mean,61.348886,0.639276,0.500696,1.181755,1.126741,1.379526,0.034123,0.164345,0.026462,0.002786,...,0.447075,0.336351,0.283426,0.130223,0.695682,0.736072,0.769499,0.020195,0.206128,0.239554
std,11.285593,0.480378,0.798517,1.045169,0.80284,1.084627,0.181607,0.594525,0.160562,0.052723,...,0.734884,0.641504,0.450818,0.336666,0.460278,0.440914,0.4213,0.140716,0.404665,0.42696
min,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,62.0,1.0,0.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
75%,69.0,1.0,1.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
max,92.0,1.0,3.0,4.0,2.0,3.0,1.0,4.0,1.0,1.0,...,4.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
# US census code

df = pd.read_csv('original data/adult_data.csv')
df = df.replace(" ?", np.nan)
df = df.dropna()

# Map string values to code

def create_dictionary(feature):
    number = 0
    map_dictionary = {}
    for category in df[feature].unique():
        map_dictionary[category] = number
        number += 1
    #print(map_dictionary)
    return map_dictionary


categories = ["marital-status", "education", "workclass", "occupation", 
              "relationship", "race", "sex", "native-country", "target"]

for category in categories:
    df[category] = df[category].map(create_dictionary(category))

feature_names=["age", "workclass", "fnlwgt", "education", "education-num",
               "marital-status", "occupation", "relationship", "race", "sex",
               "capital-gain", "capital-loss", "hours-per-week", "native-country"]
class_names=["less_50k", "more_50k"]

print("Total positive class:", df['target'].sum()/df.shape[0] * 100)
print("Total negative class:", (df.shape[0] - df['target'].sum()) /df.shape[0] * 100)

# # Export to csv to import in the framework
df.to_csv("original data/adult_data_cleaned.csv")
df.shape

Total positive class: 24.892248524633644
Total negative class: 75.10775147536636


(30162, 15)

In [17]:
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0
mean,38.437902,189793.8,10.121312,1092.007858,88.372489,40.931238
std,13.134665,105653.0,2.549995,7406.346497,404.29837,11.979984
min,17.0,13769.0,1.0,0.0,0.0,1.0
25%,28.0,117627.2,9.0,0.0,0.0,40.0
50%,37.0,178425.0,10.0,0.0,0.0,40.0
75%,47.0,237628.5,13.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [22]:
# Bank code

df = pd.read_csv('original data/bank-full.csv')
# df.head()
missing_vals = df.isna().sum()
# view the sum of missing values
print("Missing:", missing_vals)
df = df.replace(" ?", np.nan)
df = df.dropna()

# Map string values to code

def create_dictionary(feature):
    number = 0
    map_dictionary = {}
    for category in df[feature].unique():
        map_dictionary[category] = number
        number += 1
    #print(map_dictionary)
    return map_dictionary


categories = ["job", "marital", "education", "default", 
              "housing", "loan", "contact", "month", "poutcome", "y"]

for category in categories:
    df[category] = df[category].map(create_dictionary(category))


feature_names = ["age","job","marital","education","default","balance","housing",
                 "loan","contact","day","month","duration","campaign","pdays",
                 "previous","poutcome"]

class_names = ["no", "yes"]

print("Total positive class:", df['poutcome'].sum()/df.shape[0] * 100)
print("Total negative class:", (df.shape[0] - df['poutcome'].sum()) /df.shape[0] * 100)

# # Export to csv to import in the framework
df.to_csv("original data/bank-full_cleaned.csv")
df.shape

Missing: age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64
Total positive class: 29.006215301585897
Total negative class: 70.9937846984141


(45211, 17)