In [None]:
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn import svm, datasets, model_selection
from sklearn.metrics import confusion_matrix, average_precision_score, recall_score, precision_score
from sklearn.metrics import mean_absolute_error, average_precision_score
from sklearn.metrics import cohen_kappa_score, f1_score, log_loss, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression


%run 'utils/imports_1.ipynb'
%run 'utils/arff.ipynb'
%run 'utils/arffconverter.ipynb'
%run 'utils/arffcrossvalidation.ipynb'

# 1. Zależność poprawności klasyfikacji od wielkości zbioru testowego

In [None]:
class ComputeForArff(object):
    """Run k-fold cross-validation of one classifier on one ARFF data set.

    Parameters
    ----------
    classifier : estimator handed unchanged to ``cross_val_score``.
    arff_path : path passed to ``load_arff``.
    splits : number of folds used by ``KFold``.
    """

    def __init__(self, classifier, arff_path, splits):
        self.classifier = classifier
        self.arff_path = arff_path
        self.splits = splits
        # Fixed seed so the shuffled fold assignment is identical on every run.
        self.seed = 7

    def compute_result(self):
        """Return the per-fold scores produced by ``cross_val_score``."""
        data = load_arff(self.arff_path)

        # random_state only takes effect together with shuffle=True; recent
        # scikit-learn releases raise ValueError when a seed is passed without it.
        kfold = model_selection.KFold(n_splits=self.splits, shuffle=True, random_state=self.seed)
        return cross_val_score(self.classifier, data.data, data.target, cv=kfold)

In [None]:
class ComputeBayesianNetworkStats(object):
    """Collect classification metrics of a Bayesian-network learner over several ARFF files.

    Parameters
    ----------
    classifier : learner handed to ``KFoldCrossValidation(...).predict``.
    arff_array : sequence of ARFF paths; one result column is produced per path.
    column_labels : optional sequence with one column label per path. When omitted,
        the module-level ``labels`` list is used in ascending order.
    """

    def __init__(self, classifier, arff_array, column_labels=None):
        self.classifier = classifier
        self.arff_array = arff_array
        self.column_labels = column_labels

    def compute_stats_dataframe(self):
        """Return a DataFrame with one column per ARFF file and one row per metric.

        Raises
        ------
        ValueError : if there are fewer column labels than paths.
        """
        paths = list(self.arff_array)
        if self.column_labels is not None:
            column_labels = list(self.column_labels)
        else:
            # The notebook re-orders the global ``labels`` in a plotting cell, so its
            # orientation depends on which cells have run. Sorting gives the ascending
            # order in which the notebook lists ``nursery_paths``.
            column_labels = sorted(labels)
        if len(column_labels) < len(paths):
            # Fail before the (slow) statistics are computed, not after.
            raise ValueError("got %d column labels for %d ARFF paths" % (len(column_labels), len(paths)))

        bn_data = dict()
        for i, (column_label, path) in enumerate(zip(column_labels, paths)):
            print(i)
            bn_data[column_label] = self.compute_stats(path)

        return pd.DataFrame(data=bn_data, index=["Precision", "Recall", "F1", "CKS", "MAE", "MSE"])

    @staticmethod
    def _encode_states(values, states):
        """Map every value to the index of the first equal entry of ``states``.

        Raises ValueError for a value that is not in ``states``; dropping it silently
        would shift the true and predicted sequences against each other.
        """
        encoded = []
        for value in values:
            try:
                encoded.append(states.index(value))
            except ValueError:
                raise ValueError("value %r is not one of the class states %r" % (value, states))
        return encoded

    def compute_stats(self, arff_path):
        """Return ``[precision, recall, F1, Cohen's kappa, MAE, MSE]`` for one ARFF file.

        Precision, recall and F1 use ``average='weighted'``. All metrics are computed
        on the integer positions of the class values in ``attributes[0]['states']``.
        """
        # NOTE(review): 8 and 100 are passed straight to the project helpers;
        # presumably 8 is the index of the class attribute - confirm.
        attributes, _, _ = ARFFLoader(arff_path).load_attributes_and_samples(8, 100)
        y, predicted = KFoldCrossValidation(arff_path, 8).predict(self.classifier)

        states = list(attributes[0]['states'])
        final_predicted = self._encode_states(predicted, states)
        final_y = self._encode_states(y, states)

        PRECISION = precision_score(final_y, final_predicted, average='weighted')
        RECALL = recall_score(final_y, final_predicted, average='weighted')
        F1 = f1_score(final_y, final_predicted, average='weighted')
        MAE = mean_absolute_error(final_y, final_predicted)
        MSE = mean_squared_error(final_y, final_predicted)
        CKS = cohen_kappa_score(final_y, final_predicted)

        return [PRECISION, RECALL, F1, CKS, MAE, MSE]

In [None]:
class ComputeStatisticsForArff(object):
    """Hold-out evaluation of one classifier on one ARFF data set."""

    def __init__(self, classifier, arff_path):
        self.arff_path = arff_path
        self.classifier = classifier

    def compute_result(self):
        """Fit on one half of the data and score the predictions for the other half.

        Returns ``[precision, recall, F1, Cohen's kappa, MAE, MSE]``; the first three
        are computed with ``average='weighted'``.
        """
        dataset = load_arff(self.arff_path)

        # 50/50 split with a fixed seed, so every run evaluates the same partition.
        X_train, X_test, y_train, y_test = train_test_split(
            dataset.data, dataset.target, test_size=0.5, random_state=123456)
        self.classifier.fit(X_train, y_train)
        y_pred = self.classifier.predict(X_test)

        weighted_scores = [
            scorer(y_test, y_pred, average='weighted')
            for scorer in (precision_score, recall_score, f1_score)
        ]
        plain_scores = [
            scorer(y_test, y_pred)
            for scorer in (cohen_kappa_score, mean_absolute_error, mean_squared_error)
        ]
        return weighted_scores + plain_scores

In [None]:
# Size labels, one per entry of nursery_paths (same order).
labels = [200, 400, 800, 1000, 2000, 4000, 8000, 12960]

# ARFF files used throughout the notebook, listed from smallest to largest.
nursery_paths = [
    'data/comparison/nursery_200.arff',
    'data/comparison/nursery_400.arff',
    'data/comparison/nursery_800.arff',
    'data/comparison/nursery_1000.arff',
    'data/comparison/nursery_2000.arff',
    'data/comparison/nursery_4000.arff',
    'data/comparison/nursery_8000.arff',
    'data/comparison/nursery_12960.arff',
]

# Index into nursery_paths of the file loaded as the reference data set.
best_arff = 7
data = load_arff(nursery_paths[best_arff])

# Global seaborn look for every figure drawn below.
sns.set_style("whitegrid")
sns.set_palette(sns.light_palette((210, 90, 60), input="husl"))

#### 1. SVM

In [None]:
# Cross-validate and hold-out-evaluate an SVM with a linear and an RBF kernel on
# every data-set size. linear_data / rbf_data map size -> [mean, std, <six metrics>].
linear_data = dict()
rbf_data = dict()
svm_frames = []

# sorted() keeps the size labels aligned with nursery_paths even if another cell
# has re-ordered the global `labels` list.
svm_size_labels = sorted(labels)

for kernel_name, kernel_summary in (("linear", linear_data), ("rbf", rbf_data)):
    for size_label, nursery in zip(svm_size_labels, nursery_paths):
        value = ComputeForArff(SVC(kernel=kernel_name, C=50), nursery, 10).compute_result()
        kernel_summary[size_label] = [value.mean(), value.std()]
        svm_frames.append(pd.DataFrame({"value": value, "size": size_label, "kernel": kernel_name}))

# DataFrame.append no longer exists in pandas 2.x; concatenate once instead of
# growing the frame inside the loop.
SVM_df = pd.concat(svm_frames)

for size_label, path in zip(svm_size_labels, nursery_paths):
    linear_data[size_label] += ComputeStatisticsForArff(SVC(kernel='linear', C=50), path).compute_result()
    rbf_data[size_label] += ComputeStatisticsForArff(SVC(kernel='rbf', C=50), path).compute_result()

In [None]:
# One summary table per kernel: rows are the statistics, columns the data-set sizes.
linear_kernel_df, rbf_kernel_df = (
    pd.DataFrame(
        data=kernel_summary,
        index=["Mean", "Std", "Precision", "Recall", "F1", "CKS", "MAE", "MSE"],
    )
    for kernel_summary in (linear_data, rbf_data)
)

In [None]:
# Box plot of the per-fold SVM scores, grouped by data-set size and kernel.
sns.set(rc={'figure.figsize': (16, 9)})
plt.ylim(0.58, 1.01)

svm_axes = sns.boxplot(
    x="size",
    y="value",
    hue="kernel",
    data=SVM_df,
    palette=sns.light_palette("green"),
)
svm_axes.set_title("SVM, kernel = {linear, rbf}, 10-fold cross validation, nursery.arff")

In [None]:
# Display the RBF-kernel summary table (rows Mean..MSE, one column per data-set size).
rbf_kernel_df

In [None]:
# Display the linear-kernel summary table (rows Mean..MSE, one column per data-set size).
linear_kernel_df

#### 2. KNN

In [None]:
# Cross-validate and hold-out-evaluate distance-weighted k-NN for k=5 and k=25 on
# every data-set size. k_5_data / k_25_data map size -> [mean, std, <six metrics>].
k_5_data = dict()
k_25_data = dict()
knn_frames = []

# sorted() keeps the size labels aligned with nursery_paths even if another cell
# has re-ordered the global `labels` list.
knn_size_labels = sorted(labels)

for neighbours, knn_summary in ((5, k_5_data), (25, k_25_data)):
    for size_label, nursery in zip(knn_size_labels, nursery_paths):
        value = ComputeForArff(KNeighborsClassifier(neighbours, weights='distance'), nursery, 10).compute_result()
        knn_summary[size_label] = [value.mean(), value.std()]
        knn_frames.append(pd.DataFrame({"value": value, "size": size_label, "knn": str(neighbours)}))

# DataFrame.append no longer exists in pandas 2.x; concatenate once instead of
# growing the frame inside the loop.
KNN_df = pd.concat(knn_frames)

for size_label, path in zip(knn_size_labels, nursery_paths):
    k_5_data[size_label] += ComputeStatisticsForArff(KNeighborsClassifier(5, weights='distance'), path).compute_result()
    k_25_data[size_label] += ComputeStatisticsForArff(KNeighborsClassifier(25, weights='distance'), path).compute_result()

knn_stat_rows = ["Mean", "Std", "Precision", "Recall", "F1", "CKS", "MAE", "MSE"]
k_5_df = pd.DataFrame(data=k_5_data, index=knn_stat_rows)
k_25_df = pd.DataFrame(data=k_25_data, index=knn_stat_rows)

In [None]:
# Box plot of the per-fold k-NN scores, grouped by data-set size and k.
sns.set(rc={'figure.figsize': (16, 9)})
plt.ylim(0.58, 1.01)

knn_axes = sns.boxplot(
    x="size",
    y="value",
    hue="knn",
    data=KNN_df,
    palette=sns.light_palette("green"),
)
knn_axes.set_title('K-NN, K={5, 25}, 10-fold cross validation, nursery.arff')

In [None]:
# Display the k=5 summary table (rows Mean..MSE, one column per data-set size).
k_5_df

In [None]:
# Display the k=25 summary table (rows Mean..MSE, one column per data-set size).
k_25_df

#### 3. Lasy losowe

In [None]:
# Cross-validate and hold-out-evaluate random forests with 50 and 100 trees on
# every data-set size. est_50_data / est_100_data map size -> [mean, std, <six metrics>].
est_50_data = dict()
est_100_data = dict()
forest_frames = []

# sorted() keeps the size labels aligned with nursery_paths even if another cell
# has re-ordered the global `labels` list.
forest_size_labels = sorted(labels)

for tree_count, forest_summary in ((50, est_50_data), (100, est_100_data)):
    for size_label, nursery in zip(forest_size_labels, nursery_paths):
        value = ComputeForArff(RandomForestClassifier(n_estimators=tree_count, criterion='entropy'), nursery, 10).compute_result()
        forest_summary[size_label] = [value.mean(), value.std()]
        forest_frames.append(pd.DataFrame({"value": value, "size": size_label, "estimators": str(tree_count)}))

# DataFrame.append no longer exists in pandas 2.x; concatenate once instead of
# growing the frame inside the loop.
LL_df = pd.concat(forest_frames)

# The hold-out metrics must come from the same forest sizes as the cross-validation
# figures they share a table with (50 and 100 trees, not 5 and 50).
for size_label, path in zip(forest_size_labels, nursery_paths):
    est_50_data[size_label] += ComputeStatisticsForArff(RandomForestClassifier(n_estimators=50, criterion='entropy'), path).compute_result()
    est_100_data[size_label] += ComputeStatisticsForArff(RandomForestClassifier(n_estimators=100, criterion='entropy'), path).compute_result()

forest_stat_rows = ["Mean", "Std", "Precision", "Recall", "F1", "CKS", "MAE", "MSE"]
est_50_df = pd.DataFrame(data=est_50_data, index=forest_stat_rows)
est_100_df = pd.DataFrame(data=est_100_data, index=forest_stat_rows)

In [None]:
# Box plot of the per-fold random-forest scores, grouped by data-set size and tree count.
sns.set(rc={'figure.figsize':(16, 9)})
plt.ylim(0.58, 1.01)
# Later cells read `labels[::-1]`, i.e. they expect `labels` to be in descending
# order from here on. Sorting instead of flipping keeps this cell idempotent:
# re-running it no longer toggles the order back and mislabels later results.
labels = sorted(labels, reverse=True)
sns.set_palette(sns.light_palette("green"))
sns.boxplot(x="size", y="value", hue="estimators", data=LL_df).set_title('Random forests, no. of estimators = {50, 100}, 10-fold cross validation, nursery.arff')

In [None]:
# Display the summary table stored in est_50_df (rows Mean..MSE, one column per data-set size).
est_50_df

In [None]:
# Display the summary table stored in est_100_df (rows Mean..MSE, one column per data-set size).
est_100_df

# 2. Sieci bayesowskie - porównanie algorytmów w zależności od data setu

# ! Wywołanie poniższego skryptu jest czasochłonne - w zależności od komputera może potrwać od 3 do 5 godzin

In [None]:
# Number of data sets; used by the progress messages of the Bayesian-network cells.
total = len(nursery_paths)

# Fold count handed to perform_k_fold_cross_validation in the cells below.
bn_splits = 10

# Empty placeholder for the combined Bayesian-network results.
BN_df = pd.DataFrame()

In [None]:
# k-fold cross-validation of the K2 structure learner on every data-set size.
K2_stats_dict_temp = dict()
k2_frames = []

# sorted() yields the ascending sizes that match nursery_paths no matter how the
# global `labels` list is currently ordered.
for i, (size_label, path) in enumerate(zip(sorted(labels), nursery_paths)):
    print(i + 1, "of", total)
    value = KFoldCrossValidation(path, 8).perform_k_fold_cross_validation(bn_splits, K2Algorithm(score_method='aic', number_of_parents=3))
    K2_stats_dict_temp[size_label] = [value.mean(), value.std()]
    k2_frames.append(pd.DataFrame({"value": value, "size": size_label, "name": "K2"}))

# DataFrame.append no longer exists in pandas 2.x; concatenate once after the loop.
BN_K2_df = pd.concat(k2_frames)

In [None]:
# k-fold cross-validation of the TAN structure learner on every data-set size.
TAN_stats_dict_temp = dict()
tan_frames = []

# sorted() yields the ascending sizes that match nursery_paths no matter how the
# global `labels` list is currently ordered.
for i, (size_label, path) in enumerate(zip(sorted(labels), nursery_paths)):
    print(i + 1, "of", total)
    value = KFoldCrossValidation(path, 8).perform_k_fold_cross_validation(bn_splits, TANAlgorithm())
    TAN_stats_dict_temp[size_label] = [value.mean(), value.std()]
    tan_frames.append(pd.DataFrame({"value": value, "size": size_label, "name": "TAN"}))

# DataFrame.append no longer exists in pandas 2.x; concatenate once after the loop.
BN_TAN_df = pd.concat(tan_frames)

In [None]:
# k-fold cross-validation of the tabu-search structure learner on every data-set size.
TABU_stats_dict_temp = dict()
tabu_frames = []

# sorted() yields the ascending sizes that match nursery_paths no matter how the
# global `labels` list is currently ordered.
for i, (size_label, path) in enumerate(zip(sorted(labels), nursery_paths)):
    print(i + 1, "of", total)
    value = KFoldCrossValidation(path, 8).perform_k_fold_cross_validation(bn_splits, TabuSearch(score_method='aic', number_of_parents=2, number_of_iterations=10, tabu_length=5))
    TABU_stats_dict_temp[size_label] = [value.mean(), value.std()]
    tabu_frames.append(pd.DataFrame({"value": value, "size": size_label, "name": "TABU"}))

# DataFrame.append no longer exists in pandas 2.x; concatenate once after the loop.
BN_TABU_df = pd.concat(tabu_frames)

In [None]:
# Stack the per-algorithm fold scores into one long frame for plotting.
# pd.concat replaces the DataFrame.append chain, which pandas 2.x no longer provides;
# the row order (TAN, K2, TABU) and the original indices are kept.
BN_df = pd.concat([BN_TAN_df, BN_K2_df, BN_TABU_df])

In [None]:
# Box plot of the per-fold Bayesian-network scores, grouped by data-set size and algorithm.
sns.set(rc={'figure.figsize': (16, 9)})
plt.ylim(0.48, 1.01)
sns.set_palette(sns.light_palette("green"))

bn_axes = sns.boxplot(x="size", y="value", hue="name", data=BN_df)
bn_axes.set_title('Bayesian networks, 10-fold cross validation, TAN/TABU/K2, nursery.arff')

In [None]:
# Metric table (Precision..MSE) of the K2 learner, one column per ARFF file.
k2_stats_builder = ComputeBayesianNetworkStats(
    K2Algorithm(score_method='aic', number_of_parents=3),
    nursery_paths,
)
K2_stats_df = k2_stats_builder.compute_stats_dataframe()

In [None]:
# Metric table (Precision..MSE) of the TAN learner, one column per ARFF file.
tan_stats_builder = ComputeBayesianNetworkStats(TANAlgorithm(), nursery_paths)
TAN_stats_df = tan_stats_builder.compute_stats_dataframe()

In [None]:
# Metric table (Precision..MSE) of the tabu-search learner, one column per ARFF file.
tabu_stats_builder = ComputeBayesianNetworkStats(
    TabuSearch(score_method='aic', number_of_parents=2, number_of_iterations=10, tabu_length=5),
    nursery_paths,
)
TABU_stats_df = tabu_stats_builder.compute_stats_dataframe()

In [None]:
# Append the cross-validation mean/std rows to the K2 metric table and display the result.
K2_stats_df_temp = pd.DataFrame(
    data=K2_stats_dict_temp,
    index=["mean", "std"],
)
pd.concat((K2_stats_df, K2_stats_df_temp))

In [None]:
# Append the cross-validation mean/std rows to the TAN metric table and display the result.
TAN_stats_df_temp = pd.DataFrame(
    data=TAN_stats_dict_temp,
    index=["mean", "std"],
)
pd.concat((TAN_stats_df, TAN_stats_df_temp))

In [None]:
# Append the cross-validation mean/std rows to the tabu-search metric table and display the result.
TABU_stats_df_temp = pd.DataFrame(
    data=TABU_stats_dict_temp,
    index=["mean", "std"],
)
pd.concat((TABU_stats_df, TABU_stats_df_temp))

# 3. Porównanie algorytmów - wykres pudełkowy

* SVM - kernel=rbf, bo wyszło najkorzystniej,
* BN - TAN, bo wyszedł najkorzystniej,
* KNN - k=5, bo wyszło najkorzystniej.

In [None]:
# Cross-validate the chosen configuration of every classifier family on the
# reference data set and collect the per-fold scores plus their [mean, std].
splits = 10
seed = 7
path = nursery_paths[best_arff]
classifier_names = ['SVM', 'KNN', 'LL']

names = []
results = []
# NOTE(review): the markdown list before this cell names k=5 as the best k-NN
# setting, while k=25 is used here - confirm which one is intended.
models = [
    ('SVM', SVC(kernel='rbf', C=50)),
    ('KNN', KNeighborsClassifier(25, weights='distance')),
    ('LL', RandomForestClassifier(n_estimators=100, criterion='entropy')),
]

X = data.data
y = data.target

temp_summary = dict()

# random_state only takes effect together with shuffle=True; recent scikit-learn
# releases raise ValueError when a seed is passed without it. With an integer seed
# every model is scored on the same folds, so one splitter serves all of them.
kfold = model_selection.KFold(n_splits=splits, shuffle=True, random_state=seed)
for i, (name, model) in enumerate(models):
    cv_results = cross_val_score(model, X, y, cv=kfold)
    temp_summary[classifier_names[i]] = [cv_results.mean(), cv_results.std()]
    results.append(cv_results)
    names.append(name)

bn_value = KFoldCrossValidation(nursery_paths[best_arff], 8).perform_k_fold_cross_validation(10, TANAlgorithm())
results.append(bn_value)
names.append("BN TAN")
temp_summary["BN"] = [bn_value.mean(), bn_value.std()]

In [None]:
# Turn each classifier's per-fold scores into a labelled frame and stack them for plotting.
pd_SVM = pd.DataFrame({"value": results[0], "name": "SVM"})
pd_KNN = pd.DataFrame({"value": results[1], "name": "KNN"})
pd_LL = pd.DataFrame({"value": results[2], "name": "LL"})
pd_TAN = pd.DataFrame({"value": results[3], "name": "BN TAN"})

# pd.concat replaces the DataFrame.append chain, which pandas 2.x no longer provides;
# row order and the original per-frame indices are unchanged.
COMP_df = pd.concat([pd_SVM, pd_KNN, pd_LL, pd_TAN])

In [None]:
# Box plot comparing the per-fold scores of the four classifiers.
sns.set(rc={'figure.figsize': (12, 11)})
sns.set_palette(sns.light_palette("green"))

comparison_axes = sns.boxplot(
    x="name",
    y="value",
    data=COMP_df,
    order=['SVM', 'LL', 'KNN', 'BN TAN'],
)
comparison_axes.set_title("SVM/KNN/LL/BN TAN")

In [None]:
# Hold-out metrics for the same estimators whose cross-validation mean/std are stored
# in temp_summary. Default-parameter models here would put figures from two different
# configurations into one column of the final table.
# Note: ComputeStatisticsForArff fits the estimator objects held in `models` in place.
best_models = dict(models)
est_SVM = ComputeStatisticsForArff(best_models['SVM'], path).compute_result()
est_KNN = ComputeStatisticsForArff(best_models['KNN'], path).compute_result()
est_LL = ComputeStatisticsForArff(best_models['LL'], path).compute_result()
est_BN = ComputeBayesianNetworkStats(TANAlgorithm(), []).compute_stats(path)

In [None]:
# Display the per-classifier [mean, std] of the cross-validation scores.
temp_summary

In [None]:
# Each *_best_stat list is the six hold-out metrics followed by the
# cross-validation mean and std, matching the row labels of the table below.
SVM_best_stat = est_SVM + temp_summary['SVM']
KNN_best_stat = est_KNN + temp_summary['KNN']
LL_best_stat = est_LL + temp_summary['LL']
BN_best_stat = est_BN + temp_summary['BN']

d = {
    'SVM': SVM_best_stat,
    'KNN': KNN_best_stat,
    'LL': LL_best_stat,
    'BN TAN': BN_best_stat,
}
df = pd.DataFrame(
    data=d,
    index=["Precision", "Recall", "F1", "CKS", "MAE", "MSE", "Mean", "Std"],
)
df

# Rozkład parametrów w zbiorze danych testowych

In [None]:
class CheckClassNumbers(object):
    """Count how many samples of each target class an ARFF data set contains."""

    @staticmethod
    def _count_classes(loading_data):
        """Return a list whose k-th entry is the number of targets equal to class index k.

        The list has one slot per entry of ``loading_data.target_names``, so it is not
        tied to a fixed number of classes.
        """
        counts = [0] * len(loading_data.target_names)
        for target in loading_data.target:
            counts[int(target)] += 1
        return counts

    def plot_stats(self, path):
        """Print the per-class sample counts of the ARFF file at ``path`` and draw them as a bar plot."""
        loading_data = load_arff(path)
        result = self._count_classes(loading_data)

        print(result)

        sns.set(rc={'figure.figsize':(8, 6)})
        # Keyword arguments: current seaborn releases no longer accept x and y positionally.
        sns.barplot(x=loading_data.target_names, y=result, palette="BuGn_d")

    def show_stat_table(self):
        """Return a DataFrame with one row per data-set size holding the per-class sample counts."""
        final_result = dict()
        # sorted() gives the ascending sizes that match nursery_paths; the global
        # `labels` list itself is re-ordered by a plotting cell earlier in the notebook.
        for size_label, path in zip(sorted(labels), nursery_paths):
            final_result[size_label] = self._count_classes(load_arff(path))
        return pd.DataFrame.from_dict(final_result, orient='index')
        
# Print the per-class sample counts and draw the bar plot for nursery_100.arff.
# NOTE(review): this path is under 'data/comparison_new' and is not listed in
# nursery_paths, unlike the files plotted in the following cells - confirm it is intended.
CheckClassNumbers().plot_stats('data/comparison_new/nursery_100.arff')

In [None]:
# Print the per-class sample counts and draw the bar plot for nursery_200.arff.
CheckClassNumbers().plot_stats('data/comparison/nursery_200.arff')

In [None]:
# Print the per-class sample counts and draw the bar plot for nursery_400.arff.
CheckClassNumbers().plot_stats('data/comparison/nursery_400.arff')

In [None]:
# Print the per-class sample counts and draw the bar plot for nursery_800.arff.
CheckClassNumbers().plot_stats('data/comparison/nursery_800.arff')

In [None]:
# Print the per-class sample counts and draw the bar plot for nursery_1000.arff.
CheckClassNumbers().plot_stats('data/comparison/nursery_1000.arff')

In [None]:
# Print the per-class sample counts and draw the bar plot for nursery_2000.arff.
CheckClassNumbers().plot_stats('data/comparison/nursery_2000.arff')

In [None]:
# Print the per-class sample counts and draw the bar plot for nursery_4000.arff.
CheckClassNumbers().plot_stats('data/comparison/nursery_4000.arff')

In [None]:
# Print the per-class sample counts and draw the bar plot for nursery_8000.arff.
CheckClassNumbers().plot_stats('data/comparison/nursery_8000.arff')

In [None]:
# Print the per-class sample counts and draw the bar plot for nursery_12960.arff.
CheckClassNumbers().plot_stats('data/comparison/nursery_12960.arff')