### Importing Modules

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.patches as mpatches
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
import pickle

# Scores
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# Classifiers
from sklearn.svm import SVC

# Ignoring Warnings
import warnings
warnings.simplefilter('ignore')

In [12]:
# Quick look at a single record of the k=1 Aves table: the row at
# position 2, flattened into a NumPy array.
# Exploration aid kept from the original cell:
#   df[df.genus_name == 'Pterodroma']
df = pd.read_csv("Aves.Cleaned.k1.csv")
asdf = df.to_numpy()[2]
asdf

array([3,
       'AACCGATGATTATTCTCAACTAACCACAAAGACATCGGCACCCTATACCTAATCTTTGGCGCATGAGCCGGTATAGTCGGTACAGCCCTCAGCTTACTTATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGAGAC---GACCAAATCTATAACGTAATTGTCACCGCCCACGCTTTCGTAATAATCTTTTTCATAGTCATGCCAATCATAATCGGCGGCTTCGGCAACTGACTAGTCCCACTGATAATTGGTGCACCGGACATAGCATTCCCCCGCATAAATAACATAAGCTTCTGACTACTTCCTCCATCATTTTTACTTCTTCTAGCCTCTTCTACAGTCGAAGCAGGAGCAGGTACCGGATGAACCGTTTACCCGCCTCTAGCTGGCAACCTAGCACACGCTGGAGCATCAGTAGACCTAGCCATCTTCTCACTTCACCTAGCAGGTGTCTCCTCTATCCTAGGCGCAATCAACTTCATCACAACCGCCATCAACATAAAACCACCTGCCCTCTCACAATACCAAACTCCCCTATTCGTCTGATCCGTACTTATCACCGCTGTTCTATTACTCCTCTCACTTCCAGTCCTCGCTGCAGGCATCACCATGCTACTAACAGACCGAAATCTGAACACTACATTCTTCGATCCTGCTGGAGGAGGTGACCCAGTCCTGTACCAACATCTCTTTTGATTCTTCGGCCACCCAGAAGTCTACATCTTAATCTTACCAGGATTTGGAATCATCTCCCACGTAGTAACATACTACGCAGGTAAAAAAGAGCCATTCGGCTATATAGGAATAGTTTGAGCCATACTATCAATCGGATTCCTAGGCTTCATTGTTTGAGCCCACCACATATTCACCGTAGAAATGGACGTAGACACCCGAG---------------------------------------------------------------------------------

### Estimating Taxonomic Class:

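- Each sample's taxonomic class is predicted first; the sample is then passed to the species classifier of the predicted class. Note that the code below does not append a 0 for a wrong class prediction: a misrouted sample simply receives a species label from the wrong class's classifier and is therefore scored as an error.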

In [3]:
def _make_linear_svm(n_train):
    """Return an unfitted StandardScaler + linear-kernel SVC pipeline.

    n_train is the number of training rows; it is only used to set
    ``gamma = 1 / n_train`` exactly as the original configuration did.
    """
    # NOTE(review): per the scikit-learn docs gamma only applies to the
    # 'rbf', 'poly' and 'sigmoid' kernels, so it should have no effect with
    # kernel="linear"; it is kept so the estimator parameters are unchanged.
    return Pipeline([
        ('sc', StandardScaler()),
        ('clf', SVC(gamma=1 / n_train, kernel="linear",
                    probability=True, random_state=0)),
    ])


def _predict_species(taxonomy_pred, X, species_models, label_dtype):
    """Route every row of X to the species model of its predicted class.

    taxonomy_pred  -- predicted taxonomic class for each row of X
    X              -- 2-D feature matrix
    species_models -- dict mapping a taxonomic class to its fitted pipeline
    label_dtype    -- dtype used for the returned array of species labels

    Returns a 1-D array with one species prediction per row of X.
    """
    predictions = np.empty(X.shape[0], dtype=label_dtype)
    for class_name, model in species_models.items():
        routed = taxonomy_pred == class_name
        # predict() rejects empty input, so skip classes nobody was routed to.
        if routed.any():
            # Boolean-mask indexing keeps X 2-D, which predict() requires.
            predictions[routed] = model.predict(X[routed])
    return predictions


def find_val_score(k):
    """Cross-validate the two-level (taxonomic class -> species) classifier.

    The table ``combined_datas[str(k)]`` is read with column 0 as the
    taxonomic class, column 1 as the species label and the remaining columns
    as features. For each of 10 stratified folds (stratified on the taxonomic
    class) one linear SVM is fitted on the taxonomic class and one per
    taxonomic class on the species labels. Every sample is first assigned a
    taxonomic class and then a species by that class's model; the species
    predictions are scored with the micro-averaged F1.

    Parameters
    ----------
    k : value whose ``str(k)`` is a key of the module-level ``combined_datas``.

    Returns
    -------
    list of two tuples:
        ``[(train_mean, train_std), (test_mean, test_std)]`` of the per-fold
        micro F1 scores.
    """
    data = combined_datas[str(k)]
    X = data.iloc[:, 2:].to_numpy()        # features
    species = data.iloc[:, 1].to_numpy()   # second-level labels
    y = data.iloc[:, 0].to_numpy()         # first-level labels (taxonomic class)

    kf = StratifiedKFold(10, random_state=0, shuffle=True)
    f1_scores_train, f1_scores_test = [], []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        species_train, species_test = species[train_index], species[test_index]

        # First level: taxonomic class.
        taxonomy_model = _make_linear_svm(X_train.shape[0])
        taxonomy_model.fit(X_train, y_train)

        # Second level: one species model per taxonomic class. The masks are
        # built from this fold's own labels, so they always line up with the
        # rows of X_train (the previous frame-level boolean indexing mixed a
        # full-length mask with a row subset).
        species_models = {}
        for class_name in np.unique(y_train):
            in_class = y_train == class_name
            model = _make_linear_svm(int(in_class.sum()))
            model.fit(X_train[in_class], species_train[in_class])
            species_models[class_name] = model

        predictions_train = _predict_species(
            taxonomy_model.predict(X_train), X_train, species_models, species.dtype)
        predictions_test = _predict_species(
            taxonomy_model.predict(X_test), X_test, species_models, species.dtype)

        # pos_label is only consulted for average='binary', so it is omitted.
        f1_scores_train.append(f1_score(species_train, predictions_train, average='micro'))
        f1_scores_test.append(f1_score(species_test, predictions_test, average='micro'))

    f1_scores_train = np.ravel(f1_scores_train)
    f1_scores_test = np.ravel(f1_scores_test)

    return [(f1_scores_train.mean(), f1_scores_train.std()),
            (f1_scores_test.mean(), f1_scores_test.std())]

### F1 Score Calculation For Species Classes

In [4]:
# def species_score(data):
#     species_f1_train = []
#     species_f1_test = []

#     X = np.array(data.iloc[:,2:])
#     y = np.ravel(data.iloc[:,1])
#     kf = StratifiedKFold(10, random_state=0, shuffle=True)
    
#     for train_index, test_index in kf.split(X, y):
#         X_train, X_test = X[train_index], X[test_index]
#         y_train, y_test = y[train_index], y[test_index]

#         linear_svm = Pipeline([['sc', StandardScaler()], ['clf', SVC(gamma=1/X_train.shape[0],
#                     kernel="linear", probability=True, random_state=0)]])
#         linear_svm.fit(X_train, y_train)

#         y_pred_train = linear_svm.predict(X_train)
#         f1_micro_train = f1_score(y_train, y_pred_train, pos_label=None, average='micro')
#         species_f1_train.append(f1_micro_train)
          
#         y_pred_test = linear_svm.predict(X_test)
#         f1_micro_test = f1_score(y_test, y_pred_test, pos_label=None, average='micro')
#         species_f1_test.append(f1_micro_test)

#     species_f1_train = np.ravel(species_f1_train)
#     species_f1_test = np.ravel(species_f1_test)
#     return (species_f1_train.mean(), species_f1_test.mean())

In [5]:
# def trained_model(data):
#     X = np.array(data.iloc[:,2:])
#     y = np.ravel(data.iloc[:,1])
    
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify=y)
#     linear_svm = Pipeline([['sc', StandardScaler()], ['clf', SVC(gamma=1/X_train.shape[0],
#                     kernel="linear", probability=True, random_state=0)]])
#     linear_svm.fit(X_train, y_train)
#     return linear_svm

### Inserting and Merging Data & Species Classification

In [6]:
# Build one combined table per k (1..7). For every taxonomic class the file
# "<class>.Cleaned.k<k>.csv" is read, the 'Unnamed: 0' and 'nucleotides'
# columns are dropped, and a leading 'taxonomic_class' column holding the
# class name is added. The per-class tables are then stacked row-wise and
# stored in combined_datas under the key str(k).
data_names = ['Aves', 'Chiroptera', 'Rodentia']
combined_datas = {}
for k in range(1, 8):
    before_combine_df = []
    for data in data_names:
        csv_file = pd.read_csv("{}.Cleaned.k{}.csv".format(data, k))
        # Keyword form: the positional `axis` argument of drop() was removed
        # in pandas 2.0.
        csv_file = csv_file.drop(columns=['Unnamed: 0', 'nucleotides'])

        # Put the class label in column 0; downstream code addresses the
        # columns by position (0 = class, 1 = species, 2+ = features).
        taxonomy_class_added = csv_file.copy()
        taxonomy_class_added.insert(0, "taxonomic_class", data)
        before_combine_df.append(taxonomy_class_added)

    # ignore_index=True gives the stacked table unique row labels; without it
    # every per-class table restarts at 0 and label-based selection on the
    # combined frame becomes ambiguous.
    combined_datas[str(k)] = pd.concat(before_combine_df, axis=0, ignore_index=True)
    print(k)

1


FileNotFoundError: [Errno 2] File b'Aves.Cleaned.k2.csv' does not exist: b'Aves.Cleaned.k2.csv'

In [None]:
# print(combined_datas['1'][combined_datas['1'].taxonomic_class == 'Aves'])

### Running the Hierarchical Classification

In [None]:
# Score the hierarchical classifier once for every k in 1..7 and collect the
# results in a dict keyed by str(k). Each value is whatever find_val_score
# returns for that k.
hierarchical_scores = dict()
for k in range(1, 8):
    f1_scores_ = find_val_score(k)
    hierarchical_scores.update({str(k): f1_scores_})

### Storing The Scores

In [None]:
# Persist the collected scores as a pickle in 'scores.db' (binary write mode);
# the context manager closes the file once the dump has finished.
with open('scores.db', mode='wb') as score:
    pickle.dump(hierarchical_scores, score)

In [None]:
# Show the collected scores: a dict keyed by str(k) holding the value
# returned by find_val_score for that k.
print(hierarchical_scores)