<a href="https://colab.research.google.com/github/LeszekBlazewski/MTSwM/blob/master/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [25]:
from google.colab import drive
drive.mount('/content/gdrive')
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2, f_classif
import seaborn as sns
import matplotlib.pyplot as plt
import glob
%cd /content/gdrive/My Drive/Colab Notebooks/MTSwM

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/Colab Notebooks/MTSwM


# Dataset features

In [4]:
# Column labels assigned to the merged dataset: 59 feature columns followed by
# the 'Class' label column. Every label must be unique, otherwise selecting a
# column by name returns several columns at once.
dataset_features_columns = [
    'Age',
    'Sex',
    # Pain
    'Pain location',
    'Chest pain radiation',
    'Pain character',
    'Onset of pain',
    'Number of hours since onset',
    'Duration of the last episode',
    # Associated symptoms
    'Nausea',
    'Diaphoresis',
    'Palpitations',
    'Dyspnea',
    'Dizziness/syncope',
    'Burping',
    # Palliative factors
    'Palliative factors',
    # History of similar pain
    'Prior chest pain of this type',
    'Physician consulted for prior pain',
    'Prior pain related to heart',
    'Prior pain due to MI',
    'Prior pain due to angina prectoris',
    # Past medical history
    'Prior MI',
    'Prior angina prectoris',
    'Prior atypical chest pain',
    'Congestive heart failure',
    'Peripheral vascular disease',
    'Hiatal hernia',
    'Hypertension',
    'Diabetes',
    'Smoker',
    # Current medication usage
    'Diuretics',
    'Nitrates',
    'Beta blockers',
    'Digitalis',
    'Nonsteroidal anti-inflammator',
    'Antacids/H2 blockers',
    # Physical examinations
    'Systolic blood pressure',
    'Diastolic blood pressure',
    'Heart rate',
    'Respiration rate',
    'Rales',
    'Cyanosis',
    'Pallor',
    'Systolic murmur',
    'Diastolic murmur',
    'Oedema',
    'S3 gallop',
    'S4 gallop',
    'Chest wall tenderness',
    # Renamed from a second plain 'Diaphoresis' so that it does not collide
    # with the identically named column in "Associated symptoms".
    'Diaphoresis (physical examination)',
    # ECG examination
    'New Q wave',
    'Any Q wave',
    'New ST segment elevation',
    'Any ST segment elevation',
    'New ST segment depression',
    'Any ST segment depression',
    'New T wave inversion',
    'Any T wave inversion',
    'New intraventricular conduction defect',
    'Any intraventricular conduction defect',
    'Class'
]

# Load & merge datasets

In [None]:
# Read every tab-separated file in data/, label its rows with a per-file class
# number and stack everything into one DataFrame.
# NOTE(review): each file is transposed after reading, so presumably the files
# store one sample per column - confirm against the raw data.

# glob.glob() returns paths in arbitrary order; sort them so that the class
# number assigned to each file is the same on every run and on every machine.
data_files = sorted(glob.glob("data/*.txt"))
if not data_files:
    # Fail early with a clear message instead of pd.concat's
    # "No objects to concatenate" error.
    raise FileNotFoundError("No files matching 'data/*.txt' were found")

data_list = []
for i, file in enumerate(data_files, 1):
    data_set = pd.read_csv(file, sep="\t", header=None).transpose()
    # The 1-based position of the file in the sorted list is its class label.
    data_set['Class'] = i
    data_list.append(data_set)

# ignore_index=True gives the merged frame a unique 0..n-1 row index instead
# of repeating each file's own row labels.
dataset = pd.concat(data_list, axis=0, ignore_index=True)

# Replace the positional column labels (plus 'Class') with readable names.
dataset.columns = dataset_features_columns

dataset.info()

# Features ranking

## SelectKBest

In [16]:
def build_features_ranking(x, y, score_func):
    """Rank the columns of ``x`` by a univariate score computed against ``y``.

    Args:
        x: DataFrame of features; its column names label the ranking entries.
        y: Target labels, passed unchanged to ``SelectKBest.fit``.
        score_func: Scoring callable accepted by ``SelectKBest``
            (e.g. ``f_classif`` or ``chi2``).

    Returns:
        A list of ``(feature_name, score)`` tuples ordered from the highest to
        the lowest score, with scores rounded to 2 decimal places. Features
        whose score is NaN are placed at the end.
    """
    # k='all' keeps every feature: only the per-feature scores are needed here.
    k_best_selector = SelectKBest(score_func=score_func, k='all')
    k_best_selector.fit(x, y)

    def sort_key(entry):
        # NaN compares False with everything, which would leave the order
        # undefined; treat it as the worst possible score instead.
        score = entry[1]
        return float('-inf') if np.isnan(score) else score

    # Sort on the raw scores and round only afterwards, so that features
    # whose scores agree to 2 decimals are still ranked correctly.
    ranked = sorted(zip(x.columns, k_best_selector.scores_), key=sort_key, reverse=True)
    return [(name, round(score, 2)) for name, score in ranked]

In [8]:
def print_features_ranking_with_plot(features_ranking, used_score_func):
    """Print a numbered feature ranking and draw it as a horizontal bar chart.

    Args:
        features_ranking: Sequence of entries whose first item is the feature
            name and whose second item is its score, in the order to print.
        used_score_func: Name of the score function, used in the printed
            header and in the plot title.
    """
    print(f'Features ranking after using {used_score_func} score function:')
    for position, entry in enumerate(features_ranking, start=1):
        print(f"{position}. {entry[0]} {entry[1]}")

    plt.figure(figsize=(30, 20))
    bar_positions = range(len(features_ranking))
    # barh puts position 0 at the bottom, so order the bars from the lowest to
    # the highest score to get the best feature at the top of the chart.
    lowest_first = sorted(
        ((entry[0], entry[1]) for entry in features_ranking),
        key=lambda pair: pair[1],
    )
    bar_scores = [pair[1] for pair in lowest_first]
    bar_labels = [pair[0] for pair in lowest_first]
    plt.barh(bar_positions, bar_scores, align='center')
    plt.yticks(bar_positions, bar_labels)
    plt.title(f'Ranking based on {used_score_func}')
    plt.show()

In [None]:
# Split the merged dataset into the class labels and the feature matrix.
y = dataset['Class']
x = dataset.drop(columns='Class')

# f_classif: ANOVA F-value between label and feature (for classification tasks).
features_ranking_classif = build_features_ranking(x, y, score_func=f_classif)
print_features_ranking_with_plot(features_ranking_classif, used_score_func='f_classif')

# chi2: chi-squared statistic; it requires non-negative feature values.
features_ranking_chi = build_features_ranking(x, y, score_func=chi2)
print_features_ranking_with_plot(features_ranking_chi, used_score_func='chi2')

# Evaluation of the classifier with cross validation


In [40]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_validate

def evaluate_and_get_classifiers(classfier, features_set, y, n_splits=2, n_repeats=5, random_state=0):
    """Cross-validate a classifier and return its mean accuracy and fitted models.

    Args:
        classfier: Unfitted scikit-learn estimator to evaluate.
        features_set: Feature matrix passed to ``cross_validate``.
        y: Target labels.
        n_splits: Number of folds in each repetition (default 2).
        n_repeats: How many times the k-fold split is repeated (default 5).
        random_state: Seed of the splitter, fixed so that every call uses the
            same splits (default 0).

    Returns:
        A tuple ``(mean_accuracy, estimators)``: the mean test accuracy over
        all splits and the estimators returned by ``cross_validate`` (one per
        split, each fitted on that split's training part).
    """
    # prepare the cross-validation procedure
    validator = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    # evaluate the model; return_estimator=True keeps the fitted estimators
    cv_results = cross_validate(
        classfier,
        features_set,
        y,
        cv=validator,
        scoring='accuracy',
        return_estimator=True,
    )
    # report performance
    mean = np.mean(cv_results['test_score'])
    return (mean, cv_results['estimator'])

# Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

def build_and_plot_confustion_matrix(x, y, best_classifiers):
    """Sum the confusion matrices of several fitted classifiers and plot the total.

    Args:
        x: Feature matrix to predict on. It must contain the same feature
            columns the classifiers were fitted with.
        y: True class labels for the rows of ``x``.
        best_classifiers: Iterable of fitted classifiers exposing ``predict``.

    Returns:
        A square integer array holding the element-wise sum of the individual
        confusion matrices. Rows are true classes, columns are predicted
        classes, both ordered as ``np.unique(y)``.

    NOTE(review): if the classifiers come from cross-validation on this same
    data, every classifier has seen part of ``x`` during training, so the
    summed matrix is optimistic. ``cross_val_predict`` would avoid that.
    """
    # Fix the label order so that every per-classifier matrix has the same
    # shape and layout, even when a classifier never predicts some class.
    class_labels = np.unique(y)
    total_matrix = np.zeros((len(class_labels), len(class_labels)), dtype=np.int64)
    for classifier in best_classifiers:
        y_pred = classifier.predict(x)
        total_matrix += confusion_matrix(y, y_pred, labels=class_labels)

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        total_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.xlabel('Predicted class')
    plt.ylabel('True class')
    plt.title('Summed confusion matrix')
    plt.show()
    return total_matrix



# Here either use the aggregated confusion matrix from the best classifiers or use the cross_val_predict function

# Experiment

In [52]:
from sklearn.neighbors import KNeighborsClassifier

# Grid experiment: for every distance metric and every neighbour count, evaluate
# a k-NN classifier on the top 1..maximum_features features of the f_classif
# ranking and record the mean cross-validated accuracy of each run.
x = dataset.drop('Class', axis=1)
y = dataset['Class']

features_ranking = build_features_ranking(x, y, f_classif)
feature_names = [feature[0] for feature in features_ranking]  # keep only the feature names

number_of_neighbours = (1, 5, 10)
metrics = ('euclidean', 'manhattan')

result_columns = ['features_count', 'neighbours_count', 'metric', 'mean_accuracy']
results = []
best_classifiers = []
# Feature subset the best classifiers were fitted on; needed later to predict
# with them, because `x` is overwritten on every iteration.
best_features = []
best_mean_accuracy = float('-inf')
maximum_features = 5

# for each metric
for metric in metrics:
    # for each k neighbours
    for neighbours_count in number_of_neighbours:
        # append features on the way
        for features_count in range(1, maximum_features + 1):
            # include in the dataset only features selected from ranking in given run
            selected_features = feature_names[:features_count]
            # `columns=` instead of a positional axis: pandas >= 2.0 rejects drop(labels, 1)
            x = dataset.drop(columns=dataset.columns.difference(selected_features))
            # construct the classifier
            knn_classifier = KNeighborsClassifier(n_neighbors=neighbours_count, metric=metric)
            # perform the cross validation process and get results
            current_result_mean, classifiers = evaluate_and_get_classifiers(knn_classifier, x, y)
            # save the best set of classifiers (a later run wins a tie)
            if current_result_mean >= best_mean_accuracy:
                best_mean_accuracy = current_result_mean
                best_classifiers = classifiers
                best_features = selected_features
            results.append([features_count, neighbours_count, metric, current_result_mean])


results_df = pd.DataFrame(results, columns=result_columns)
print(results_df.head())
print(f"max accuracy: {results_df['mean_accuracy'].max()}")

   features_count  neighbours_count     metric  mean_accuracy
0               1                 1  euclidean       0.511654
1               2                 1  euclidean       0.501487
2               3                 1  euclidean       0.502142
3               4                 1  euclidean       0.528111
4               5                 1  euclidean       0.524113
max accuracy: 0.5281108647450111


1. Puszczam ranking cech
2. Dla zestawów:
```
k=1,metric=euclidean
k=5,metric=euclidean
k=10,metric=euclidean
```
```
k=1,metric=manhattan
k=5,metric=manhattan
k=10,metric=manhattan
```

wykonuję testy, dodając kolejno po jednej cesze z rankingu.

3. Zapisuję wszystkie wyniki (średnie) z uzyskanych przebiegów w tablicy.
4. Podczas iteracji sprawdzam, czy średnia uzyskana w bieżącym przebiegu jest nie mniejsza od wszystkich dotychczas uzyskanych, i jeśli tak, to zapisuję zestaw estymatorów uzyskanych w tym przebiegu (posłużą do wyciągnięcia macierzy konfuzji).
5. Wyznaczam sumę macierzy konfuzji z wyciągniętych klasyfikatorów
6. Analiza statystyczna wyników (pytanie czy średnich pomiędzy całymi iteracjami czy wyników w danym obiegu KFolda).