In [15]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from typing import Tuple
from sklearn import preprocessing

In [43]:
def load_data() -> Tuple[np.ndarray, np.ndarray]:
    data = pd.read_csv("./data/data.csv").drop(["Unnamed: 0"], axis=1)
    labels = pd.read_csv("./data/labels.csv")["Class"]
        
    # Encodes string labels as ints
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(labels)
    
    return data.to_numpy(), y
X, y = load_data()

# First feature reduction to reduce run time

In [44]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif

# Dropping the constant features
X_nonconst = VarianceThreshold().fit_transform(X)

# Doing feature selection by choosing the k features with the largest F-statistic
k = 200
X_fstat = SelectKBest(score_func=f_classif, k=k).fit_transform(X_nonconst, y)

# Feature selection using multi-class logistic regression

In [64]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

n_folds = 10
test_fraction = 0.33
score_func = f1_score

# Splits data into test and train
X_train, X_test, y_train, y_test = train_test_split(X_fstat, y, test_size=test_fraction)

# Train model and extracts the best inverst regularisation parameter
log_reg = LogisticRegressionCV(cv=n_folds,
                               multi_class='ovr', 
                               solver='liblinear',
                               intercept_scaling=10000).fit(X_train,y_train)
c_min = log_reg.Cs
print(type(log_reg.scores_))

# Finds the C_1se for each label, i.e. the smalles C that produces
# score within one std of the mean score for C_min for a given label
c_1se = {}
for key, val in log_reg.scores_.items():
    print(val)
    cv_mean = np.mean(val, axis=1)
    cv_std = np.std(val, axis=1)
    idx_min_mean = np.argmin(cv_mean)
    idx_c = np.where(
        (cv_mean <= cv_mean[idx_min_mean] + cv_std[idx_min_mean] / np.sqrt(n_folds)) &
        (cv_mean >= cv_mean[idx_min_mean])
    )[0][0]
    c_1se[key] = val[idx_c]

<class 'dict'>
[[0.98148148 0.98148148 1.         1.         1.         1.
  1.         1.         1.         1.        ]
 [0.98148148 0.98148148 0.98148148 1.         1.         1.
  1.         1.         1.         1.        ]
 [1.         1.         1.         1.         1.         1.
  1.         1.         1.         1.        ]
 [1.         1.         1.         1.         1.         1.
  1.         1.         1.         1.        ]
 [1.         1.         1.         1.         1.         1.
  1.         1.         1.         1.        ]
 [1.         1.         1.         1.         1.         1.
  1.         1.         1.         1.        ]
 [1.         1.         1.         1.         1.         1.
  1.         1.         1.         1.        ]
 [1.         1.         1.         1.         1.         1.
  1.         1.         1.         1.        ]
 [1.         1.         1.         1.         1.         1.
  1.         1.         1.         1.        ]
 [1.         1.       

In [63]:
c_1se

{0: array([0.98148148, 0.98148148, 0.98148148, 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ]),
 1: array([0.98148148, 0.98148148, 0.98148148, 0.98148148, 0.98148148,
        0.98148148, 0.98148148, 0.98148148, 0.98148148, 0.98148148]),
 2: array([0.96226415, 0.98113208, 0.98113208, 0.98113208, 0.98113208,
        0.98113208, 0.98113208, 0.98113208, 0.98113208, 0.98113208]),
 3: array([0.98148148, 0.98148148, 0.98148148, 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ]),
 4: array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])}