In [10]:
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier 
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.tree import _tree
import numpy as np
import pandas as pd
import csv
import math
import copy
import scipy.stats as st
#from argument import *

In [11]:
def create_dictionary(feature, df):
    """Build an integer code for every distinct value of one column.

    Codes are handed out as 0, 1, 2, ... following the order in which
    ``df[feature].unique()`` returns the values, so the result depends on
    the row order of ``df``. Every value returned by ``unique()`` gets a
    code; NOTE(review): this presumably includes NaN when the column has
    missing entries -- confirm callers expect that.

    Args:
        feature: Label of the column in ``df`` to encode.
        df: DataFrame that holds the column.

    Returns:
        dict mapping each distinct value to its integer code.
    """
    return {value: code for code, value in enumerate(df[feature].unique())}

In [None]:
# GDS4968: turn sample ids into integers, encode the target, export
df = pd.read_csv('Data/GDS4968.csv')


def remove_gsm(x):
    """Return ``x`` without its first three characters."""
    return x[3:]


# Strip the three-character prefix (presumably "GSM") and store the ids as integers
df['Sample'] = df['Sample'].apply(remove_gsm).astype(int)

# Only the target feature holds strings
categories = ["Output"]

for column in categories:
    codes = create_dictionary(column, df)
    df[column] = df[column].map(codes)

# The original author reports that this file has no missing data,
# so no NA handling is performed here.

# Numeric file
df.to_csv("Data/GDS4968_numeric.csv", index=False)

# Final df info: class balance of the target and overall shape
print(df['Output'].value_counts())
print("Shape", df.shape)

In [None]:
# Cirrhosis: encode categorical columns, prune missing data, export
df = pd.read_csv('Data/cirrhosis.csv')

categories = ["Status", "Drug", "Sex", "Ascites",
              "Hepatomegaly", "Spiders", "Edema"]

# NOTE(review): the encoding runs before the missing-value analysis, so any
# NaN that unique() reports for these columns appears to receive an integer
# code and is then no longer counted as missing below -- confirm intended.
for column in categories:
    codes = create_dictionary(column, df)
    df[column] = df[column].map(codes)

# Share of missing entries per column, in percent
missing_percentages = df.isna().mean() * 100

# Columns with more than 5% missing values are removed entirely
# (Cholesterol, Copper, Alk_Phos, SGOT and Tryglicerides per the original author)
columns_to_drop = missing_percentages.loc[missing_percentages.gt(5)].index
df = df.drop(columns=columns_to_drop)

# Rows that still contain a missing value are discarded
df = df.dropna()

# Numeric file
df.to_csv("Data/cirrhosis_numeric.csv", index=False)

# Final df info: distribution of Stage and overall shape
print(df['Stage'].value_counts())
print("Shape", df.shape)

In [None]:
# Cars code
# archive.ics.uci.edu/ml/datasets/Car+Evaluation
df = pd.read_csv('Cars/cars_multiclass.csv')

# Fixed integer code for each target label
class_codes = {"unacc": 0, "acc": 1, "good": 2, "vgood": 3}

# Series.map turns every label that is absent from the dict into NaN.
# Fail loudly rather than export a target column with silent gaps.
unknown_labels = set(df['class'].dropna().unique()) - set(class_codes)
if unknown_labels:
    raise ValueError(
        "Unexpected values in 'class' column: {}".format(
            sorted(str(label) for label in unknown_labels)
        )
    )
df['class'] = df['class'].map(class_codes)

# Export to csv to import in the framework.
# The DataFrame index is written as the first (unnamed) column, as before.
# NOTE(review): other cells pass index=False -- confirm the framework
# expects the index column for this dataset.
df.to_csv("Cars/cars_numeric.csv")

# Final df info: class balance of the target and overall shape
print(df['class'].value_counts())
print("Shape", df.shape)

In [None]:
# Myocardial infarction complications: select target, prune missing data, export
# archive.ics.uci.edu/ml/datasets/Myocardial+infarction+complications
df = pd.read_csv('Myocardial/myocardial.csv')

# Column 123, LET_IS, was chosen as target since it is the only multiclass.
# The two drops are sequential: the second slice is taken on the columns
# that remain after the ID column is gone.
df = df.drop(columns=df.columns[0])        # Drop ID
df = df.drop(columns=df.columns[111:122])  # Drop predictors not used

# Share of missing entries per column, in percent
missing_percentages = df.isna().mean() * 100

# Columns with more than 5% missing values are removed entirely
columns_to_drop = missing_percentages.loc[missing_percentages.gt(5)].index
df = df.drop(columns=columns_to_drop)

# Drop remaining NA rows
df = df.dropna()

# Export to csv to import in the framework
# (the DataFrame index is written as the first, unnamed column)
df.to_csv("Myocardial/myocardial_numeric.csv")

# Class distribution of the target.
# NOTE(review): position 59 is assumed to be LET_IS after the drops above;
# selecting the column by name would be more robust -- confirm.
print(df.iloc[:, 59].value_counts()) # LET_IS
# Lethal outcome (cause):
#   0: unknown (alive)
#   1: cardiogenic shock
#   2: pulmonary edema
#   3: myocardial rupture
#   4: progress of congestive heart failure
#   5: thromboembolism
#   6: asystole
#   7: ventricular fibrillation

# Final shape
df.shape

In [None]:
# Soccer code
# List of required columns, based on rules given by Rulex
required_columns = [
    'id',
    'CONCAT_buildUpPlayPositioningClass',
    'CONCAT_chanceCreationPositioningClass',
    'CONCAT_defenceDefenderLineClass',
    'DELTA_buildUpPlayDribbling',
    'DELTA_buildUpPlayPassing',
    'DELTA_buildUpPlaySpeed',
    'DELTA_chanceCreationCrossing',
    'DELTA_chanceCreationPassing',
    'DELTA_chanceCreationShooting',
    'DELTA_defenceAggression',
    'DELTA_defencePressure',
    'DELTA_defenceTeamWidth',
    'Month_match',
    'buildUpPlayDribbling_1',
    'buildUpPlayPassing_1',
    'buildUpPlaySpeed_1',
    'chanceCreationCrossing_1',
    'chanceCreationPassing_1',
    'chanceCreationShooting_1',
    'defenceAggression_1',
    'defencePressure_1',
    'defenceTeamWidth_1',
    'RESULT'
]

# Fixed encoding of the target: 1 -> 1, 2 -> 2, X -> 3
result_codes = {"1": 1, "2": 2, "X": 3}


def _normalise_result(value):
    """Return a RESULT label as a clean upper-case string; NaN is passed through."""
    if pd.isna(value):
        return value
    if isinstance(value, float) and value.is_integer():
        value = int(value)  # numeric cells may arrive as 1.0 / 2.0
    return str(value).strip().upper()


# Read the first sheet of the Excel workbook into a DataFrame
df = pd.read_excel('Soccer/soccer_input.xlsx', sheet_name=0)

# Keep only the required columns, in the listed order.
# Raises KeyError if any required column is absent from the sheet.
df = df[required_columns]

feature_categories = ["CONCAT_buildUpPlayPositioningClass", "CONCAT_chanceCreationPositioningClass", "CONCAT_defenceDefenderLineClass"]
categories = feature_categories + ["RESULT"]

# Map categorical predictors to numeric codes (codes follow order of first appearance)
for category in feature_categories:
    df[category] = df[category].map(create_dictionary(category, df))

# Encode RESULT with the fixed mapping instead of order-of-appearance codes,
# so the output no longer depends on which label happens to come first in
# the sheet. Unknown labels abort the run; missing values stay NaN and are
# handled by the missing-data steps below.
result_labels = df["RESULT"].map(_normalise_result)
unknown_results = set(result_labels.dropna().unique()) - set(result_codes)
if unknown_results:
    raise ValueError(
        "Unexpected values in 'RESULT' column: {}".format(sorted(unknown_results))
    )
df["RESULT"] = result_labels.map(result_codes)

# Share of missing entries per column, in percent
missing_percentages = df.isna().mean() * 100

# Get the columns with missing percentages greater than 5%
# This will drop DELTA_buildUpPlayDribbling and buildUpPlayDribbling_1
columns_to_drop = missing_percentages[missing_percentages > 5].index

# Drop the columns from the dataframe
df = df.drop(columns_to_drop, axis=1)

df = df.dropna() # Drop remaining NA rows

# Restore an integer dtype in case NaN rows had forced RESULT to float
df["RESULT"] = df["RESULT"].astype(int)

print(df.iloc[:, -1].value_counts()) # Result
# 1: 1
# 2: 2
# X: 3

print(df.shape)

# Save the filtered DataFrame to a new CSV file
df.to_csv('Soccer/soccer_numeric.csv', index=False)
