In [None]:
! pip install chefboost

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from chefboost import Chefboost as chef
from chefboost.training import Training
from sklearn.datasets import make_blobs
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# EDA for five datasets

## German Credit Data

In [None]:
# Load the German Credit data from a CSV file in the working directory.
credit_df = pd.read_csv('german_credit_data.csv')

In [None]:
# (rows, columns) of the table as loaded.
credit_df.shape

In [None]:
# "Unnamed: 0" is presumably a row index that was saved with the CSV (TODO confirm);
# it is not used as a feature. Reassigning instead of mutating in place, together
# with errors="ignore", keeps this cell safe to re-run on the same kernel.
credit_df = credit_df.drop(columns="Unnamed: 0", errors="ignore")
credit_df.head()

In [None]:
# Column dtypes and non-null counts.
credit_df.info()

In [None]:
# Summary statistics of the credit table.
credit_df.describe()

In [None]:
# Summary of the five categorical columns, selected by name.
credit_df[['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']].describe()

In [None]:
# Number of fully duplicated rows (the author found none).
credit_df.duplicated().sum()

In [None]:
# Missing values per column.
credit_df.isna().sum()

In [None]:
# Check the class balance of the target column.
plt.figure(figsize=(10, 8))
risk_ax = sns.countplot(x='Risk', data=credit_df)
# Annotate each bar with its count: https://github.com/mwaskom/seaborn/issues/1582
for bar in risk_ax.patches:
    # Bar heights may come back as floats; ".0f" prints "700" rather than "700.0".
    risk_ax.annotate(
        f"{bar.get_height():.0f}",
        (bar.get_x() + bar.get_width() / 2., bar.get_height()),
        ha='center', va='center', xytext=(0, 10), textcoords='offset points',
    )
plt.show()

In [None]:
# One histogram per column that DataFrame.hist can plot, on a single 20x10 figure.
credit_df.hist(figsize=(20, 10))
plt.show()

In [None]:
# Count plot for each categorical feature, laid out on a 2x3 grid.
categorical_cols = ['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']
plt.figure(figsize=(35, 25))
for position, col in enumerate(categorical_cols, start=1):
    plt.subplot(2, 3, position)
    count_ax = sns.countplot(x=col, data=credit_df)
    # Annotate each bar with its count: https://github.com/mwaskom/seaborn/issues/1582
    for bar in count_ax.patches:
        # Bar heights may come back as floats; ".0f" prints "700" rather than "700.0".
        count_ax.annotate(
            f"{bar.get_height():.0f}",
            (bar.get_x() + bar.get_width() / 2., bar.get_height()),
            ha='center', va='center', xytext=(0, 10), textcoords='offset points',
        )
plt.show()

In [None]:
# Box plot of each numerical feature, one panel per feature on a 2x2 grid.
plt.figure(figsize=(20, 10))
for slot, feature in enumerate(['Age', 'Job', 'Credit amount', 'Duration'], start=1):
    panel = plt.subplot(2, 2, slot)
    credit_df.boxplot(column=[feature], ax=panel)
plt.show()

## Red Wine Quality Dataset

In [None]:
# Load the red wine quality data from a CSV file in the working directory
# and preview the first rows.
wine_df = pd.read_csv("winequality-red.csv")
wine_df.head()

In [None]:
# Column names as a plain Python list.
list(wine_df.columns)

There are 12 variables in this dataset. The quality variable is the dependent variable.

In [None]:
# Summary statistics of the wine table.
wine_df.describe()

In [None]:
# Missing values per column (the author found none in this dataset).
wine_df.isna().sum()

In [None]:
# Number of samples per value of the target column 'quality'.
sns.countplot(x = 'quality',data = wine_df) # the quality classes are unbalanced (author's observation)

In [None]:
# Pairwise correlations between all wine columns, drawn as an annotated heatmap.
wine_corr = wine_df.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(wine_corr, annot=True)

Fixed acidity, citric acid, residual sugar, sulphates, and alcohol are positively correlated with the quality of the red wine. Volatile acidity, chlorides, free sulfur dioxide, total sulfur dioxide, density, and pH are negatively correlated with it. Alcohol has the strongest positive correlation with quality, and volatile acidity has the strongest negative correlation.

In [None]:
# Box plot of alcohol concentration for each quality score.
sns.boxplot(x='quality', y='alcohol', data = wine_df)

Most of the outliers are around wine with quality 5 and 6.

In [None]:
# Same plot with showfliers=False, i.e. the outlier markers are not drawn.
# This only changes the drawing; wine_df itself is not modified.
sns.boxplot(x='quality', y='alcohol', data = wine_df, showfliers = False)

This shows that the higher the alcohol concentration, the higher the quality of the red wine. 

In [None]:
# Box plot of volatile acidity for each quality score, outlier markers not drawn.
sns.boxplot(x='quality', y='volatile acidity', data = wine_df, showfliers = False)

This shows that the lower the volatile acidity, the higher the quality of the red wine tends to be.

## Adult Income Dataset

In [None]:
# Load the Adult Income data from a CSV file in the working directory
# and show its (rows, columns).
adult_df = pd.read_csv('adult.csv')
adult_df.shape

In [None]:
# Preview the first rows.
adult_df.head()

In [None]:
# No nulls are reported here, but some entries hold the string '?' in place of a
# missing value, and isna() does not count those.
adult_df.isna().sum()

#fnlwgt: final weight, the number of people the census believes the entry represents

In [None]:
# Turn the '?' placeholder into a real missing value so null checks can see it.
adult_df = adult_df.replace(to_replace={'?': None})

In [None]:
# The target variable and most columns have no missing values, so rows with
# missing values are kept for now.
adult_df.isna().sum()

In [None]:
# Summary statistics of the adult table.
adult_df.describe()

In [None]:
# The target variable is imbalanced: the author counted 37155 observations with
# <=50K income and only 11687 observations with >50K income.
sns.countplot(x='income', data=adult_df)
adult_df.groupby('income').count()

In [None]:
# Per-income-group means: the author observes that average age and hours-per-week
# are higher for the higher-income group.
# numeric_only=True is required because the frame also holds string columns;
# pandas >= 2.0 raises a TypeError when asked to average those.
adult_df.groupby('income').mean(numeric_only=True)

In [None]:
# The author observes almost no correlation between the numerical variables.
# Restrict to numeric columns explicitly: calling corr() on a frame that still
# contains string columns raises on pandas >= 2.0.
adult_numeric = adult_df.select_dtypes(include='number')
sns.heatmap(adult_numeric.corr(), annot=True)

In [None]:
# Histogram of the 'age' column.
age_ax = sns.histplot(adult_df['age'])
age_ax.set_title('Age Distribution')

In [None]:
# Number of records per value of 'race'; the author notes that most records are white.
sns.countplot(x = 'race',data = adult_df)

In [None]:
# Number of records per marital status; the figure is widened to 14x5.
plt.figure(figsize = (14, 5))
sns.countplot(x = 'marital-status',data = adult_df)

## First Artificial Dataset

Below is a plot of the first two features. This artificial dataset is generated from clusters (blobs), so it should be easy to classify with a decision tree.

In [None]:
# Generate 10,000 samples with 50 features around 6 blob centers
# (fixed seed for reproducibility) and plot the first two features,
# colored by class label.
X_first_Artificial, y_first_Artificial = make_blobs(n_samples=10000, centers=6, n_features=50, random_state=42)
plt.figure()
plt.scatter(X_first_Artificial[:, 0], X_first_Artificial[:, 1], c=y_first_Artificial)
# The title now matches the generator call above (it used to claim a single center).
plt.title('centers = 6')

In [None]:
# Wrap the generated features in a DataFrame. Column names are the 1-based feature
# positions as strings ('1' ... '50' for this array), derived from the array width
# instead of a hand-typed list.
first_feature_names = [str(k) for k in range(1, X_first_Artificial.shape[1] + 1)]
First_Artificial_df = pd.DataFrame(X_first_Artificial, columns=first_feature_names)
First_Artificial_df['Target'] = y_first_Artificial
# Store the class label as a string rather than an integer
# (presumably so it is treated as categorical downstream; TODO confirm).
First_Artificial_df = First_Artificial_df.astype({'Target': str})
First_Artificial_df

## Second Artificial Dataset

This artificial dataset has the same sample size and number of features as the first one. However, only 45 of the features are informative; the remaining 5 are redundant, meaning they are linear combinations of the informative features. In addition, the class labels of 5% of the samples are assigned at random (`flip_y = 0.05`), so the data is noisy. Finally, I set `shift = 1`, which shifts every feature by 1. For details, see https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html#sklearn.datasets.make_classification

In [None]:
# Generate 10,000 samples with 50 features (45 informative, 5 redundant), 6 classes
# with 2 clusters each, 5% randomly flipped labels and a feature shift of 1
# (fixed seed for reproducibility), then plot the first two features colored by class.
X_sec_Artificial, y_sec_Artificial = make_classification(
    n_samples=10000, n_features=50, n_informative=45, n_redundant=5, n_repeated=0,
    n_classes=6, n_clusters_per_class=2, flip_y=0.05, shift=1, random_state=42,
)
plt.figure()
plt.scatter(X_sec_Artificial[:, 0], X_sec_Artificial[:, 1], c=y_sec_Artificial)
# The previous title ('centers = 1') was copied from the blob cell and does not
# describe this data; state the generator settings shown above instead.
plt.title('classes = 6, clusters per class = 2')

In [None]:
# Wrap the generated features in a DataFrame. Column names are the 1-based feature
# positions as strings ('1' ... '50' for this array), derived from the array width
# instead of a hand-typed list.
sec_feature_names = [str(k) for k in range(1, X_sec_Artificial.shape[1] + 1)]
Sec_Artificial_df = pd.DataFrame(X_sec_Artificial, columns=sec_feature_names)
Sec_Artificial_df['Target'] = y_sec_Artificial
# Store the class label as a string rather than an integer
# (presumably so it is treated as categorical downstream; TODO confirm).
Sec_Artificial_df = Sec_Artificial_df.astype({'Target': str})
Sec_Artificial_df

# Algorithm Implementation Starts Here

## ID3

### A. German Credit Dataset

In [None]:
# Example: work on an independent (deep) copy so the shared credit_df is not modified.
credit_df_ID3 = credit_df.copy(deep=True)

### B. Wine Quality Dataset

### C. Adult Income Dataset

### D. First Artificial Dataset

### E. Second Artificial Dataset

## C4.5

### A. German Credit Dataset

### B. Wine Quality Dataset

### C. Adult Income Dataset

### D. First Artificial Dataset

### E. Second Artificial Dataset

## CART

### A. German Credit Dataset

### B. Wine Quality Dataset

### C. Adult Income Dataset

### D. First Artificial Dataset

### E. Second Artificial Dataset

## CHAID

### A. German Credit Dataset

### B. Wine Quality Dataset

### C. Adult Income Dataset

In [None]:
# Rename the target column 'income' to 'Decision': per the author, the model only
# recognizes the target variable under that name.
# NOTE(review): earlier cells refer to 'income', so they will fail if re-run after this one.
adult_df = adult_df.rename({"income": "Decision"}, axis="columns")

In [None]:
# Separate features from the target by column name. Slicing by position
# (iloc[:, :-1]) would silently keep the target among the features if 'Decision'
# were ever not the last column.
X = adult_df.drop(columns="Decision")
y = adult_df["Decision"]
# 80/20 split with a fixed seed so the result is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=12345)

In [None]:
# Put the training features and the training target back into one frame,
# side by side, with the target as the last column.
train_data = pd.concat([X_train, Y_train], axis="columns")
train_data

In [None]:
# Fitting takes a long time, so the tree is trained on the 80% training split only.
# NOTE(review): train_data still contains the missing values introduced earlier;
# confirm that chefboost handles them as intended.
config = {"algorithm": "CHAID"}
# The library is imported as `chef` at the top of the notebook; the previous call
# used an undefined name `cb` and raised NameError.
chaid_tree = chef.fit(train_data, config=config)

### D. First Artificial Dataset

### E. Second Artificial Dataset