### Importing Modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.patches as mpatches
from sklearn.multiclass import OneVsRestClassifier
import pickle

# Scores
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# Classifiers
from sklearn.svm import SVC

# Ignoring Warnings
import warnings
warnings.simplefilter('ignore')

In [None]:
# def find_val_score(X, y, k):
#     all_f1_scores_test = []
#     all_f1_scores_train = []

#     kf = StratifiedKFold(10, random_state=0, shuffle=True)
    
#     for train_index, test_index in kf.split(X, y):
#         X_train, X_test = X[train_index], X[test_index]
#         y_train, y_test = y[train_index], y[test_index]
#         linear_svm = Pipeline([['sc', StandardScaler()], ['clf', SVC(gamma=1/X_train.shape[0],
#                     kernel="linear", probability=True, random_state=0)]])
#         ovr_clf = OneVsRestClassifier(linear_svm)
#         ovr_clf.fit(X_train, y_train)

#         y_pred_train = ovr_clf.predict(X_train)
#         y_pred_test = ovr_clf.predict(X_test)
#         print(y_pred_test)

#         for true, pred in zip(y_train, y_pred_train):
#             if true == pred:
#                 all_f1_scores_train.append(species_score(true, k, 'train'))
#             else:
#                 all_f1_scores_train.append(0)

#         for true, pred in zip(y_test, y_pred_test):
#             if true == pred:
#                 all_f1_scores_test.append(species_score(true, k, 'test'))
#             else:
#                 all_f1_scores_test.append(0)
    
#     all_f1_scores_test = np.ravel(all_f1_scores_test)
#     all_f1_scores_train = np.ravel(all_f1_scores_train)
#     return [(all_f1_scores_train.mean(), all_f1_scores_train.std()),
#             (all_f1_scores_test.mean(), all_f1_scores_test.std())]

### Estimating Taxonomic Class:

- If a sample's taxonomic class is predicted correctly, the sample is credited with the species-level F1 score of that class; if it is predicted incorrectly, a score of 0 is appended.

In [None]:
def find_val_score(X, y, k, species_scores=None):
    """Cross-validate the taxonomic-class classifier and score it hierarchically.

    A one-vs-rest linear SVM (with standard scaling) is fitted on each of 10
    stratified folds. Every sample whose taxonomic class is predicted correctly
    contributes the pre-computed species-level score of that class; every
    misclassified sample contributes 0.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Taxonomic class label of each sample.
    k : int
        Key (used as ``str(k)``) selecting the species-score table in the
        notebook-level ``uncombined_datas`` when ``species_scores`` is omitted.
    species_scores : mapping, optional
        Maps each class label to a ``(train_score, test_score)`` pair.
        Defaults to ``uncombined_datas[str(k)]``.

    Returns
    -------
    list
        ``[(train_mean, train_std), (test_mean, test_std)]`` computed over the
        per-sample scores pooled from all folds.
    """
    if species_scores is None:
        # Fall back to the table filled in by the data-loading cell.
        species_scores = uncombined_datas[str(k)]

    # Positional fancy indexing below needs plain arrays.
    X = np.asarray(X)
    y = np.asarray(y)

    all_f1_scores_train = []
    all_f1_scores_test = []

    kf = StratifiedKFold(10, random_state=0, shuffle=True)

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # NOTE(review): scikit-learn documents `gamma` as a coefficient for the
        # rbf/poly/sigmoid kernels, so it should have no effect with
        # kernel="linear"; kept so the configuration matches the one used in
        # species_score.
        linear_svm = Pipeline([['sc', StandardScaler()], ['clf', SVC(gamma=1/X_train.shape[0],
                    kernel="linear", probability=True, random_state=0)]])
        ovr_clf = OneVsRestClassifier(linear_svm)
        ovr_clf.fit(X_train, y_train)

        y_pred_train = ovr_clf.predict(X_train)
        y_pred_test = ovr_clf.predict(X_test)

        # Index 0 of each pair is the train score, index 1 the test score.
        all_f1_scores_train.extend(
            species_scores[true][0] if true == pred else 0
            for true, pred in zip(y_train, y_pred_train)
        )
        all_f1_scores_test.extend(
            species_scores[true][1] if true == pred else 0
            for true, pred in zip(y_test, y_pred_test)
        )

    all_f1_scores_test = np.ravel(all_f1_scores_test)
    all_f1_scores_train = np.ravel(all_f1_scores_train)
    return [(all_f1_scores_train.mean(), all_f1_scores_train.std()),
            (all_f1_scores_test.mean(), all_f1_scores_test.std())]

In [None]:
# def species_score(taxonomic_class, k, train_test):
#     species_f1 = []

#     sub_df = uncombined_datas[str(k)][taxonomic_class]
#     X = sub_df.iloc[:,2:]
#     y = sub_df.iloc[:,1]
#     kf = StratifiedKFold(10, random_state=0, shuffle=True)
    
#     for train_index, test_index in kf.split(X, y):
#         X_train, X_test = X[train_index], X[test_index]
#         y_train, y_test = y[train_index], y[test_index]

#         linear_svm = Pipeline([['sc', StandardScaler()], ['clf', SVC(gamma=1/X_train.shape[0],
#                     kernel="linear", probability=True, random_state=0)]])
#         ovr_clf = OneVsRestClassifier(linear_svm)
#         ovr_clf.fit(X_train, y_train)

#         if train_test == 'train':
#             y_pred_train = ovr_clf.predict(X_train)
#             f1_micro_train = f1_score(y_train, y_pred_train, pos_label=None, average='micro')
#             species_f1.append(f1_micro_train)
          
#         elif train_test == 'test':
#             y_pred_test = ovr_clf.predict(X_test)
#             f1_micro_test = f1_score(y_test, y_pred_test, pos_label=None, average='micro')
#             species_f1.append(f1_micro_test)

#     species_f1 = np.ravel(species_f1)
#     return species_f1.mean()

### F1 Score Calculation For Species Classes

In [None]:
def species_score(data):
    """Return the mean train/test micro-F1 of a species classifier.

    ``data`` is read positionally: column 1 supplies the labels and columns 2
    onward supply the feature matrix (column 0 is not used here). A one-vs-rest
    linear SVM with standard scaling is fitted on each of 10 stratified,
    shuffled folds.

    Returns a ``(mean_train_f1, mean_test_f1)`` tuple averaged over the folds.
    """
    features = np.array(data.iloc[:, 2:])
    labels = np.ravel(data.iloc[:, 1])

    train_f1_per_fold, test_f1_per_fold = [], []
    splitter = StratifiedKFold(10, random_state=0, shuffle=True)

    for fit_idx, eval_idx in splitter.split(features, labels):
        X_fit, y_fit = features[fit_idx], labels[fit_idx]
        X_eval, y_eval = features[eval_idx], labels[eval_idx]

        # Scale, then classify; a fresh pipeline is built for every fold.
        steps = [
            ['sc', StandardScaler()],
            ['clf', SVC(gamma=1/X_fit.shape[0], kernel="linear",
                        probability=True, random_state=0)],
        ]
        model = OneVsRestClassifier(Pipeline(steps))
        model.fit(X_fit, y_fit)

        # Score the fitted model on the data it saw ...
        fit_predictions = model.predict(X_fit)
        train_f1_per_fold.append(
            f1_score(y_fit, fit_predictions, pos_label=None, average='micro'))

        # ... and on the held-out fold.
        eval_predictions = model.predict(X_eval)
        test_f1_per_fold.append(
            f1_score(y_eval, eval_predictions, pos_label=None, average='micro'))

    mean_train = np.ravel(train_f1_per_fold).mean()
    mean_test = np.ravel(test_f1_per_fold).mean()
    return (mean_train, mean_test)

### Inserting and Merging Data & Species Classification

In [None]:
# Taxonomic groups; one cleaned CSV per group and per k is read from the
# working directory.
data_names = ['Aves', 'Chiroptera', 'Rodentia']
combined_datas = {}    # str(k) -> all groups stacked into a single DataFrame
uncombined_datas = {}  # str(k) -> {group name: (mean train F1, mean test F1)}
for k in range(1,8):
    before_combine_df = []
    uncombined_datas.setdefault(str(k), {})
    for data in data_names:
        csv_file = pd.read_csv("{}.Cleaned.k{}.csv".format(data, k))
        # Name the axis via `columns=`: passing it positionally (`drop(labels, 1)`)
        # was deprecated in pandas 1.3 and raises TypeError from pandas 2.0.
        csv_file = csv_file.drop(columns=['Unnamed: 0', 'nucleotides'])
        # Constant column (named 0) holding the group name; it becomes the
        # first column of the frame below.
        taxonomies = pd.DataFrame(np.ravel([data for i in range(csv_file.shape[0])]))
        taxonomy_class_added = pd.concat([taxonomies, csv_file], axis=1)
        before_combine_df.append(taxonomy_class_added)

        # Species-level cross-validation within this group; the
        # (train, test) pair is looked up later by find_val_score.
        spec_score = species_score(taxonomy_class_added)
        uncombined_datas[str(k)][data] = spec_score

    combined_datas[str(k)] = pd.concat(before_combine_df, axis=0)

### Starting Hierarchical Classification

In [None]:
# Run the taxonomic-level cross-validation once per k and collect the
# [(train mean, train std), (test mean, test std)] results keyed by str(k).
hierarchical_scores = {}
for k in range(1, 8):
    key = str(k)
    merged = combined_datas[key]

    # Column 0 holds the taxonomic group name; columns 2 onward are the features.
    y_taxonomic_classes = np.ravel(merged.iloc[:, 0])
    X_data = np.array(merged.iloc[:, 2:])

    f1_scores_ = find_val_score(X_data, y_taxonomic_classes, k)
    hierarchical_scores[key] = f1_scores_

### Storing The Scores

In [None]:
# Persist the per-k score dictionary to disk as a binary pickle file.
with open('scores.db', mode='wb') as scores_file:
    pickle.dump(hierarchical_scores, scores_file)

In [None]:
# Show the collected scores: str(k) -> [(train mean, train std), (test mean, test std)].
print(hierarchical_scores)