# TM10007 Assignment lipomas G9

In [3]:
# Run this to use from colab environment
!pip install -q --upgrade git+https://github.com/Machine-Learning-TM10007-G9/Lipo-MRI.git

  Preparing metadata (setup.py) ... [?25l[?25hdone


# **Data loading**



In [4]:
from worclipo.load_data import load_data

data = load_data()
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')

The number of samples: 115
The number of columns: 494


# **Import important functions**

In [105]:
# General packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets as ds
import seaborn


# Classifiers
from sklearn import model_selection
from sklearn import metrics
from sklearn import feature_selection
from sklearn import preprocessing
from sklearn import neighbors
from sklearn import svm
from sklearn import decomposition
from scipy import stats
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

# **Create X and Y**

In [6]:
X = data.iloc[:, 1:].values
y = np.array(data['label'])

# **Split the data**

In [7]:
# Split data
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, stratify=y)

#print(len(X_train))
#print(len(X_test))
#print(len(y_train))
#print(len(y_test))

# **Checks if all the data points are floats**
Is 0.0 a missing value?

In [8]:
for row in X_train:
  for col in row:
    if not isinstance(col, float):
        print(col)

# **Check for missing values**



In [9]:
missing_values = np.sum(np.isnan(X_train))

print("Number of missing values:",missing_values)

Number of missing values: 0


# **Check for duplicates**
Can we assume that duplicates values are duplicates?

In [10]:
# Function to find duplicate columns and delete them
def delete_duplicate_columns(arr):
    num_cols = arr.shape[1]
    duplicate_columns = []
    for i in range(num_cols):
        for j in range(i + 1, num_cols):
            if np.array_equal(arr[:, i], arr[:, j]):
                duplicate_columns.append(j)  # Append index of the duplicate column
    if duplicate_columns:
        # Delete duplicate columns
        arr_without_duplicates = np.delete(arr, duplicate_columns, axis=1)
        return arr_without_duplicates
    else:
        return arr

# Remove duplicate columns
X_train_clean = delete_duplicate_columns(X_train)

print("Number of columns in original array:",X_train.shape[1])
print("Number of columns after removal of duplicate columns:",X_train_clean.shape[1])

Number of columns in original array: 493
Number of columns after removal of duplicate columns: 465


# **Checking for outliers**
Using the Z-score

In [18]:
num_rows, num_cols = X_train_clean.shape
total_outliers = 0

for i in range(num_cols):
    z = np.abs(stats.zscore(X_train_clean[:,i]))

    threshold = 3
    outliers = X_train_clean[z > threshold, i]
    total_outliers +=(len(outliers))

print(f'Total number of outliers  = {total_outliers}/{num_rows*num_cols}')
print(f'Avarage number of outliers per feature = {round(total_outliers/num_cols,2)}/{num_rows}')


Total number of outliers  = 651/42780
Avarage number of outliers per feature = 1.4/92


# **Check if the data is normally distributed**
Using the Kolmogorov Smirnov

In [12]:
total_not_normal = 0
p_threshold = 0.05

for l in range(num_cols):
    kstest_result = stats.kstest(X_train_clean[:,l], 'norm')
    if kstest_result.pvalue < p_threshold:
        total_not_normal += 1

print(f'Total features that are not normally distributed = {total_not_normal}/{num_cols}')


Total features that are not normally distributed = 465/465


# **Scaling**
Robust scaling because outliers and non Gaussian distribution

In [141]:
# Scale the data to be normal
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train_clean)
X_train_scaled = scaler.transform(X_train_clean)

# **Removing features with low variance**

In [58]:
# Compute variance along each feature (axis=0)
variances = np.var(X_train_scaled, axis=0)

# Define a threshold for variance
threshold = 2

# Filter out features with variance below the threshold
high_variance_indices = np.where(variances >= threshold)[0]

# Select only the high variance features from the dataset
data_high_variance = X_train_scaled[:, high_variance_indices]

# Display the selected features
print("Selected features with high variance:")
print(data_high_variance.shape)

Selected features with high variance:
(92, 58)


# **Feature selection**
Using L1 because it can better handle high demensionality, it migigates overfitting and is easy to interpret

In [135]:
from sklearn.linear_model import LogisticRegression

print("Original data shape:",X_train_scaled.shape)
clf = LogisticRegression(max_iter=1000, solver='liblinear')
lsvc = LogisticRegression(C=1.9, penalty="l1", dual=False, solver='liblinear').fit(X_train_scaled, y_train)
model = SelectFromModel(lsvc, prefit=True)
X_feature = model.transform(X_train_scaled)
print("New data shape:",X_feature.shape)

Original data shape: (92, 465)
New data shape: (92, 55)


# **PCA**

In [147]:
# Voer PCA uit
pca = decomposition.PCA(n_components=10)  #n_components is the amount of features that remain after PCA (can also be a ratio)
X_pca = pca.fit_transform(X_feature)

# Verklaarde variantie ratio's
explained_variance_ratio = pca.explained_variance_ratio_

# Cumulatieve verklaarde variantie
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# Maak een DataFrame om de resultaten mooi weer te geven
df = pd.DataFrame({
    'Principal Component': [f'PC{i+1}' for i in range(len(explained_variance_ratio))],
    'Explained Variance Ratio': explained_variance_ratio,
    'Cumulative Explained Variance': cumulative_explained_variance
})

print(df)

  Principal Component  Explained Variance Ratio  Cumulative Explained Variance
0                 PC1              9.999995e-01                       0.999999
1                 PC2              4.713020e-07                       1.000000
2                 PC3              5.717308e-08                       1.000000
3                 PC4              1.051200e-08                       1.000000
4                 PC5              8.603359e-09                       1.000000
5                 PC6              6.244737e-10                       1.000000
6                 PC7              5.060454e-10                       1.000000
7                 PC8              2.118805e-10                       1.000000
8                 PC9              1.665553e-10                       1.000000
9                PC10              1.662022e-11                       1.000000


# **Cross-validation**

In [128]:
# Initialize KFold cross-validator
kf = model_selection.StratifiedKFold(n_splits=5)

# Iterate over each fold
for train_index, val_index in kf.split(X_train_clean, y_train):
    # Split data into train and validation sets for this fold
    X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

print(len(X_train_fold))
print(len(X_val_fold))
print(len(y_train_fold))
print(len(y_val_fold))

74
18
74
18


# **Random forest**

In [156]:
# Initialize KFold cross-validator
kf = model_selection.StratifiedKFold(n_splits=5)

scores_rfc = []
# Iterate over each fold
for train_index, val_index in kf.split(X_feature, y_train):
    # Split data into train and validation sets for this fold
    X_train_fold, X_val_fold = X_feature[train_index], X_feature[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(X_train_fold,y_train_fold)
    pred_rfc = rfc.predict(X_val_fold)
    accuracy_rfc = accuracy_score(y_val_fold, pred_rfc)

    # Append accuracy to scores list
    scores_rfc.append(accuracy_rfc)

# Print the accuracy scores for each fold
print("Accuracy scores for each fold:", scores_rfc)

# Calculate and print the average accuracy across all folds
average_accuracy_rfc = sum(scores_rfc) / len(scores_rfc)
print("Average Accuracy:", average_accuracy_rfc)

Accuracy scores for each fold: [0.8421052631578947, 0.7368421052631579, 0.7777777777777778, 0.8888888888888888, 0.7777777777777778]
Average Accuracy: 0.8046783625730993


# **SVM**

In [157]:
# Initialize KFold cross-validator
kf = model_selection.StratifiedKFold(n_splits=5)

scores_clf = []
# Iterate over each fold
for train_index, val_index in kf.split(X_feature, y_train):
    # Split data into train and validation sets for this fold
    X_train_fold, X_val_fold = X_feature[train_index], X_feature[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    clf = svm.SVC()
    clf.fit(X_train_fold,y_train_fold)
    pred_clf = clf.predict(X_val_fold)
    accuracy_clf = accuracy_score(y_val_fold, pred_clf)

    # Append accuracy to scores list
    scores_clf.append(accuracy_clf)

# Print the accuracy scores for each fold
print("Accuracy scores for each fold:", scores_clf)

# Calculate and print the average accuracy across all folds
average_accuracy_clf = sum(scores_clf) / len(scores_clf)
print("Average Accuracy:", average_accuracy_clf)

Accuracy scores for each fold: [0.42105263157894735, 0.47368421052631576, 0.5, 0.5, 0.4444444444444444]
Average Accuracy: 0.4678362573099415


# **Neural network**

In [158]:
# Initialize KFold cross-validator
kf = model_selection.StratifiedKFold(n_splits=5)

scores_mlpc = []
# Iterate over each fold
for train_index, val_index in kf.split(X_feature, y_train):
    # Split data into train and validation sets for this fold
    X_train_fold, X_val_fold = X_feature[train_index], X_feature[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    mlpc = MLPClassifier(hidden_layer_sizes=(50,50),max_iter=500)
    mlpc.fit(X_train_fold, y_train_fold)
    pred_mlpc = mlpc.predict(X_val_fold)
    accuracy_mlpc = accuracy_score(y_val_fold, pred_mlpc)

    # Append accuracy to scores list
    scores_mlpc.append(accuracy_mlpc)

# Print the accuracy scores for each fold
print("Accuracy scores for each fold:", scores_mlpc)

# Calculate and print the average accuracy across all folds
average_accuracy_mlpc = sum(scores_mlpc) / len(scores_mlpc)
print("Average Accuracy:", average_accuracy_mlpc)

Accuracy scores for each fold: [0.6842105263157895, 0.5263157894736842, 0.3888888888888889, 0.6111111111111112, 0.6111111111111112]
Average Accuracy: 0.5643274853801169
