In [1]:
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn import svm
import numpy as np
import os
import pandas as pd

In [5]:
%%time
# Load the training split. index_col=None means no CSV column is used as the
# index, so every column (including the first one, which the feature slice
# later skips) stays a regular column.
train = pd.read_csv("./datasets/cov_filtered_2.5_training.csv", index_col=None)

CPU times: user 4.27 s, sys: 547 ms, total: 4.81 s
Wall time: 4.86 s


In [6]:
%%time
# Load the validation split; index_col=None means no CSV column is used as the
# index, so all columns stay regular columns.
validation = pd.read_csv("./datasets/cov_filtered_2.5_validation.csv", index_col=None)

CPU times: user 1.52 s, sys: 125 ms, total: 1.64 s
Wall time: 1.68 s


In [7]:
%%time
# Load the held-out test split; index_col=None means no CSV column is used as
# the index, so all columns stay regular columns.
test = pd.read_csv("./datasets/cov_filtered_2.5_test.csv", index_col=None)

CPU times: user 1.48 s, sys: 109 ms, total: 1.59 s
Wall time: 1.58 s


In [8]:
# Build a lookup from cluster name to integer class id (the row position of the
# cluster in labels.csv) and use it to encode the "Classification" column of
# every split.
labels = pd.read_csv("labels.csv")
labels.columns = ["Label", "Cluster"]

# Enumerate the cluster column instead of indexing a hard-coded 75 rows: the
# mapping is identical when labels.csv has exactly 75 rows, and it no longer
# raises IndexError (fewer rows) or silently ignores clusters (more rows).
labels_dict = {cluster: class_id for class_id, cluster in enumerate(labels["Cluster"])}

# Assign the result back rather than calling replace(..., inplace=True) on the
# selected column. That chained in-place form acts on an intermediate Series:
# pandas 2.2 warns about it and, with Copy-on-Write enabled, the parent
# DataFrame is left unchanged.
train["Classification"] = train["Classification"].replace(labels_dict)
validation["Classification"] = validation["Classification"].replace(labels_dict)
test["Classification"] = test["Classification"].replace(labels_dict)

In [10]:
# Column layout used by the slices below: column 0 is skipped, columns
# 1 .. size-1 are the features, and the last column (position `size`) is the
# target. The validation and test splits are sliced with the training width.
size = len(train.columns) - 1
x, y = train.iloc[:, 1:size], train.iloc[:, [size]]
x_val, y_val = validation.iloc[:, 1:size], validation.iloc[:, [size]]
x_test, y_test = test.iloc[:, 1:size], test.iloc[:, [size]]

In [None]:
%%time
# Used to run through entire cluster set.
#
# One-vs-rest sweep: for each class id, fit a linear-kernel SVM that separates
# that class from all others, then record its 2x2 confusion matrix on the
# validation, training and test splits. The matrices are stacked vertically
# (two rows per class, in class order) in `cm`, `cm_train` and `cm_test`.
roc_curve_data = pd.DataFrame()  # placeholder; nothing in this cell fills it

n_classes = 75
# Passing the label set explicitly forces every confusion matrix to be 2x2
# ([[TN, FP], [FN, TP]]). Without it, a split with no actual and no predicted
# positives for a class yields a 1x1 matrix, which breaks the
# "two rows per class" layout the later cells index into.
binary_labels = [0, 1]


def _binary_target(frame, positive_class):
    """Return the target column of ``frame`` as a one-column 0/1 DataFrame.

    Rows whose "Classification" value equals ``positive_class`` become 1 and
    all other rows become 0. The column is copied first, so the source split
    is never written to and no chained-assignment warning is triggered.
    """
    target = frame.iloc[:, [size]].copy()
    target["Classification"] = np.where(
        target["Classification"].to_numpy() == positive_class, 1, 0
    )
    return target


# Collect the per-class matrices in lists and concatenate once at the end;
# concatenating inside the loop re-copies the accumulated frame every time.
val_matrices, train_matrices, test_matrices = [], [], []

for class_model in range(n_classes):
    y = _binary_target(train, class_model)
    y_val = _binary_target(validation, class_model)
    y_test = _binary_target(test, class_model)

    clf = svm.SVC(kernel='linear')
    clf.fit(x, y.values.ravel())

    y_pred = clf.predict(x_val)
    y_pred_train = clf.predict(x)
    y_pred_test = clf.predict(x_test)

    val_matrices.append(
        pd.DataFrame(metrics.confusion_matrix(y_val, y_pred, labels=binary_labels))
    )
    train_matrices.append(
        pd.DataFrame(metrics.confusion_matrix(y, y_pred_train, labels=binary_labels))
    )
    test_matrices.append(
        pd.DataFrame(metrics.confusion_matrix(y_test, y_pred_test, labels=binary_labels))
    )

cm = pd.concat(val_matrices)
cm_train = pd.concat(train_matrices)
cm_test = pd.concat(test_matrices)

## Run only one of the three cells below: validation (`cm`), training (`cm_train`), or test (`cm_test`)

In [None]:
# Flatten the stacked validation confusion matrices (two rows per class) into
# one row per class with named TN/FP/FN/TP columns.
new_cm = pd.DataFrame(columns=["TN", "FP", "FN", "TP"])
for class_id in range(75):
    actual_neg = cm.iloc[2 * class_id]      # first row of the pair: TN, FP
    actual_pos = cm.iloc[2 * class_id + 1]  # second row of the pair: FN, TP
    new_cm.loc[class_id] = [actual_neg[0], actual_neg[1], actual_pos[0], actual_pos[1]]

In [None]:
# Flatten the stacked training confusion matrices (two rows per class) into
# one row per class with named TN/FP/FN/TP columns.
new_cm = pd.DataFrame(columns=["TN", "FP", "FN", "TP"])
for class_id in range(75):
    actual_neg = cm_train.iloc[2 * class_id]      # first row of the pair: TN, FP
    actual_pos = cm_train.iloc[2 * class_id + 1]  # second row of the pair: FN, TP
    new_cm.loc[class_id] = [actual_neg[0], actual_neg[1], actual_pos[0], actual_pos[1]]

In [None]:
# Flatten the stacked test confusion matrices (two rows per class) into
# one row per class with named TN/FP/FN/TP columns.
new_cm = pd.DataFrame(columns=["TN", "FP", "FN", "TP"])
for class_id in range(75):
    actual_neg = cm_test.iloc[2 * class_id]      # first row of the pair: TN, FP
    actual_pos = cm_test.iloc[2 * class_id + 1]  # second row of the pair: FN, TP
    new_cm.loc[class_id] = [actual_neg[0], actual_neg[1], actual_pos[0], actual_pos[1]]

##  

In [None]:
# Adopt the per-class table built by whichever of the three cells above was run.
# This rebinds `cm` (previously the stacked validation matrices) to the same
# object as `new_cm`; no copy is made.
cm = new_cm

In [None]:
# Re-read labels.csv so the frame carries the header from the file again (an
# earlier cell renamed its columns); the metrics cell below reads the column
# named "0".
labels = pd.read_csv("labels.csv")
# header=None: no row is treated as a header, so the columns get the integer
# names 0 and 1, which the metrics cell below merges on and renames.
dataset_sizes = pd.read_csv("subset sizes.csv", header = None)

In [None]:
# Derive per-class precision, recall, F1 and F-beta (beta = 0.5) from the
# confusion-matrix counts, then attach each cluster's name, size and log2(size).
count_columns = ["TN", "FP", "FN", "TP"]
cm = cm.astype({column: np.int64 for column in count_columns})

# Integer division already yields float64 columns, so no extra float cast is
# needed. A 0/0 ratio (e.g. no predicted positives) is left as NaN here.
cm = cm.assign(precision=cm["TP"] / (cm["TP"] + cm["FP"]))
cm = cm.assign(recall=cm["TP"] / (cm["TP"] + cm["FN"]))
cm = cm.assign(f1=2 * 1 / ((1 / cm["precision"]) + (1 / cm["recall"])))

# An undefined F-beta is scored as 0. The fill is applied to the expression
# and assigned, rather than calling fillna(..., inplace=True) on the selected
# column: that chained in-place form warns on pandas 2.2 and leaves the frame
# unchanged when Copy-on-Write is enabled.
beta_squared = 0.5 ** 2
fbeta = (1 + beta_squared) * (cm["precision"] * cm["recall"]) / (
    (beta_squared * cm["precision"]) + cm["recall"]
)
cm = cm.assign(fbeta=fbeta.fillna(0))

# Cluster names are matched to classes by row index, then joined to the subset
# sizes; the inner join drops any class whose cluster has no size entry.
cm = cm.assign(cluster=labels["0"])
cm = (
    cm.merge(dataset_sizes, how='inner', left_on='cluster', right_on=0)
    .drop(columns=0)
    .rename(columns={1: "size"})
)
cm["log_size"] = np.log2(cm["size"])

In [None]:
# Scatter each class's F-beta score against the log2 of its cluster size.
# The title names the model this notebook trains (svm.SVC with a linear
# kernel); the previous "LR C=0.1, Newton" title described a different model.
ax = cm.plot.scatter(x="log_size", y="fbeta", title="Linear SVM", ylim=[-0.05, 1.05])
# Label through the returned Axes instead of pyplot's implicit current figure,
# so the labels always land on this plot.
ax.set_xlabel("Log2 Size")
ax.set_ylabel("F-beta Score")
fig = ax.get_figure()