# COVID PS and RX dataset analysis

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, normalize
from typing import Any, List, Optional

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

import sys
import os

### Paths to data

In [None]:
# Directory holding the input data. Set the COVID_DATA_PATH environment
# variable to point somewhere else without editing the notebook; when it is
# not set the original local path is used.
data_path = os.environ.get(
    "COVID_DATA_PATH",
    "/Users/manuel/Desktop/BiomedDataAnalysisCourse/project/data/",
)
# filled some missing values and corrected mistakes
ps_rx_fname = os.path.join(data_path, "merged_data_processed_corrected.csv")

### Define functions used throughout the analysis

In [None]:
def compute_age(birth_year: int, visit_year: int) -> int:
    """Return the age, in whole years, of a patient at the time of a visit.

    Args:
        birth_year: Year in which the patient was born.
        visit_year: Year in which the visit took place.

    Returns:
        ``visit_year - birth_year``; 0 when the patient was born in the
        visit year.

    Raises:
        TypeError: If either argument is not an ``int``.
        ValueError: If ``birth_year`` is later than ``visit_year``.
    """
    # explicit exceptions instead of asserts: asserts vanish under ``python -O``
    if not isinstance(birth_year, int):
        raise TypeError(
            f"birth_year must be an int, got {type(birth_year).__name__}"
        )
    if not isinstance(visit_year, int):
        raise TypeError(
            f"visit_year must be an int, got {type(visit_year).__name__}"
        )
    # a patient born in the visit year is legitimately 0 years old
    if birth_year > visit_year:
        raise ValueError(
            f"birth_year ({birth_year}) is later than visit_year ({visit_year})"
        )
    return visit_year - birth_year

In [None]:
def fillna_column(column_data: pd.Series, dtype: str) -> pd.Series:
    """Return a copy of a column with its missing values filled in.

    Args:
        column_data: The column to fill. It is NOT modified; use the
            returned Series.
        dtype: ``"categorical"`` to fill with the most frequent value, or
            ``"numerical"`` to fill with the mean.

    Returns:
        A new Series with the missing values replaced. When the column has no
        usable fill value (e.g. every entry is missing) an unchanged copy is
        returned.

    Raises:
        TypeError: If ``column_data`` is not a Series or ``dtype`` not a str.
        ValueError: If ``dtype`` is neither "categorical" nor "numerical".
    """
    if not isinstance(column_data, pd.Series):
        raise TypeError("column_data must be a pandas Series")
    if not isinstance(dtype, str):
        raise TypeError("dtype must be a string")
    if dtype == "categorical":
        # value_counts() ignores NaN and sorts by frequency, so its first
        # label is the most frequent value (what describe() reports as "top")
        counts = column_data.value_counts()
        fill_value = counts.index[0] if not counts.empty else None
    elif dtype == "numerical":
        fill_value = column_data.mean()
    else:
        raise ValueError(f"Unknown data type ({dtype})")
    if fill_value is None or pd.isna(fill_value):
        # nothing to fill with: hand back the data untouched
        return column_data.copy()
    # non in-place fill: the caller's Series (often a DataFrame column) is
    # left alone, which also avoids pandas chained-assignment warnings
    return column_data.fillna(fill_value)

In [None]:
def compute_pca(X: pd.DataFrame, n_components: Optional[int]=2) -> pd.DataFrame:
    """Project a dataset onto its principal components.

    Args:
        X: Samples (rows) by variables (columns).
        n_components: Forwarded unchanged to ``PCA``. ``None`` lets ``PCA``
            choose the number of components itself.

    Returns:
        A DataFrame with one row per row of ``X`` (same index values) and one
        column per computed component, named "PC1", "PC2", ...

    Raises:
        TypeError: If ``X`` is not a DataFrame.
    """
    if not isinstance(X, pd.DataFrame):
        raise TypeError("X must be a pandas DataFrame")
    pca = PCA(n_components=n_components)
    pcs = pca.fit_transform(X)
    # name the columns after what PCA actually returned: n_components may be
    # None, in which case it cannot be used to count the components
    X_pcs = pd.DataFrame(
        data=pcs,
        columns=[f"PC{i}" for i in range(1, pcs.shape[1] + 1)],
        index=X.index.tolist()
    )
    assert X.shape[0] == X_pcs.shape[0]
    return X_pcs

In [None]:
def plot_pca(
    X_pcs: pd.DataFrame, 
    title: str,
    pc1: Optional[str]="PC1",
    pc2: Optional[str]="PC2"
) -> None:
    """Draw a scatter plot of two principal components and display it.

    Args:
        X_pcs: DataFrame holding the principal components as columns.
        title: Title of the plot.
        pc1: Name of the column shown on the x axis.
        pc2: Name of the column shown on the y axis.

    Raises:
        AssertionError: If an argument has the wrong type or a requested
            component is not a column of ``X_pcs``.
    """
    assert isinstance(X_pcs, pd.DataFrame)
    assert isinstance(title, str)
    # both requested components must be strings naming existing columns
    for component in (pc1, pc2):
        assert isinstance(component, str)
        assert component in X_pcs.columns.tolist()
    figure, axis = plt.subplots(nrows=1, ncols=1, figsize=(15, 10))
    sns.scatterplot(data=X_pcs, x=pc1, y=pc2, ax=axis)
    axis.set_xlabel(pc1, size=16)
    axis.set_ylabel(pc2, size=16)
    axis.set_title(title, size=18)
    plt.show()  # display plot
    

In [None]:
def remove_measures(column_data: pd.Series) -> List[float]:
    """Strip the unit of measure from a column and return plain numbers.

    String entries are cut at the first occurrence of "10^9"; the text before
    it is parsed as a number after turning a decimal comma into a point.

    Args:
        column_data: Column whose entries are strings such as "4,5 10^9/L",
            already-numeric values, or missing values.

    Returns:
        A list with one float per entry of ``column_data``, in the same
        order; missing or empty entries become ``nan``.

    Raises:
        TypeError: If ``column_data`` is not a Series.
        ValueError: If the numeric part of a string cannot be parsed.
    """
    if not isinstance(column_data, pd.Series):
        raise TypeError("column_data must be a pandas Series")
    corrected_data = []
    for v in column_data.tolist():
        if isinstance(v, str):
            # keep what precedes the unit, decimal comma -> decimal point
            number = v.split("10^9")[0].replace(",", ".").strip()
            corrected_data.append(float(number) if number else float("nan"))
        elif pd.isna(v):
            # covers both None and float NaN
            corrected_data.append(float("nan"))
        else:
            # value is already numeric: nothing to strip
            corrected_data.append(float(v))
    assert len(column_data) == len(corrected_data)
    return corrected_data

In [None]:
def visualize_results(X, Y, predictions, title, f1_score):
    """Plot true labels next to predicted labels on the first two PCs.

    The samples in ``X`` are projected on two principal components and drawn
    twice side by side: on the left coloured by ``Y``, on the right coloured
    by ``predictions``. The figure is then displayed.

    Args:
        X: Data the predictions were computed on.
        Y: True labels; must provide ``.index`` and ``.tolist()``.
        predictions: Predicted labels, one per sample, as a numpy array.
        title: Text shown in the title of the right-hand panel.
        f1_score: Score appended to the right-hand title (two decimals).

    Raises:
        AssertionError: If ``predictions``, ``title`` or ``f1_score`` has the
            wrong type.
    """
    assert isinstance(predictions, np.ndarray)
    assert isinstance(title, str)
    assert isinstance(f1_score, float)
    # project the samples on their first two principal components
    projection = PCA(n_components=2).fit_transform(X)
    X_pcs = pd.DataFrame(
        data=projection, columns=["PC1", "PC2"], index=Y.index.tolist()
    )
    X_pcs["Prediction"] = predictions
    X_pcs["Death"] = Y.tolist()
    figure, axes = plt.subplots(1, 2, figsize=(20, 10))
    colors = ["#D3880E", "#3B1375"]
    # (column used for colouring, panel title), left panel first
    panels = [
        ("Death", "Original data"),
        ("Prediction", " ".join([title, "(F1-score: %.2f)" % (f1_score)])),
    ]
    for axis, (hue_column, panel_title) in zip(axes, panels):
        sns.scatterplot(
            data=X_pcs, x="PC1", y="PC2", palette=colors, hue=hue_column, ax=axis
        )
        axis.set_xlabel("PC1", size=16)
        axis.set_ylabel("PC2", size=16)
        axis.set_title(panel_title, size=18)
    plt.show()

## Start analysis

Load and visualize the dataset.

In [None]:
# load the dataset: the file uses ";" as field separator and "," as decimal mark
ps_rx_df = pd.read_csv(ps_rx_fname, delimiter=";", decimal=",")
ps_rx_df.head()

Remove units of measure from some columns of the DataFrame.

In [None]:
# two cols with units of measure: replace each of them with the numeric
# values returned by remove_measures
for col in [
    "FIELDSET_PS-BLOOD_COUNT_LEUCOCITI", "FIELDSET_PS-BLOOD_COUNT_NEUTROFILI"
]:
    ps_rx_df[col] = remove_measures(ps_rx_df[col])
ps_rx_df.head()

#### Preprocessing steps

Compute patients age at ER visit time.

In [None]:
# Age at visit time, computed row by row with compute_age: the birth year is
# read from the second column and the visit year is the last "/"-separated
# field of the last column.
# .iloc makes the positional access explicit; plain integer keys on a
# label-indexed row are deprecated in recent pandas versions.
ps_rx_df["AGE"] = ps_rx_df.apply(
    lambda row: compute_age(
        int(row.iloc[1]), int(row.iloc[-1].split("/")[-1])
    ),
    axis=1,
)
ps_rx_df.head()

Drop columns not used throughout the analysis.

In [None]:
# columns removed from the dataset before the analysis
drop_cols = [
    "BIRTHDAY",
    "DEAD_DATE",
    "STOP",
    "START",
    "CODE"
]
# drop them in place (axis=1 -> columns)
ps_rx_df.drop(drop_cols, axis=1, inplace=True)
ps_rx_df.head()  # 769 visits and 89 variables

Set visit ID as DataFrame index.

In [None]:
# use the values of the ID column as row labels...
ps_rx_df.index = ps_rx_df.ID.tolist()
# ...and remove the now redundant ID column
ps_rx_df.drop(["ID"], axis=1, inplace=True)
ps_rx_df.head()  # 88 variables

Extract the training data and the response we want to predict.

In [None]:
# X: every column except DEATH (drop returns a new DataFrame)
X = ps_rx_df.drop(["DEATH"], axis=1)
# Y: the DEATH column, i.e. the response to predict
Y = ps_rx_df["DEATH"]
X.head()  # training data

In [None]:
# preview the first entries of the response
Y.head()  # response -> death or survival

Encode categorical variables in the dataset. For encoding we'll use `OrdinalEncoder` function from the `sklearn` Python package.

In [None]:
# fill NaN rows for each categorical variable using the most frequent value
# (categorical variables = columns with dtype "object")
cat_vars = X.select_dtypes(["object"]).columns.tolist()
for cat_var in cat_vars:
    X[cat_var] = fillna_column(X[cat_var], "categorical")
# categorical variables encoding: every category is replaced by a number
X_cat = X[cat_vars]
enc = OrdinalEncoder()
X[cat_vars] = enc.fit_transform(X_cat)
X.head()


We can now count how many `NaN` values there are in our dataset. 

In [None]:
# total number of missing entries: per-column counts summed over all columns
X.isna().sum().sum()  # 7452

We have 7452 `NaN` values in our data. They could create some problems when fitting models to our data:
- can change some metrics, like mean, variance, median, etc.
- `sklearn` models do not manage the presence of `NaN` values

The easiest solution is to remove rows containing `NaN` values. However, we would remove too many observations from our dataset.<br>

An alternative solution is to impute `NaN` values. Let's try imputing our values using KNN. For each sample with a missing value, KNN finds the `k` closest samples in our dataset and imputes the `NaN` value with the mean of that variable over those `k` samples. <br>

Luckily, `sklearn` provides an implementation of such method. Let's use it on our data.

In [None]:
# impute NaN values using KNN 
imputer = KNNImputer(n_neighbors=5)  # use k == 5
# rebuild a DataFrame around the imputed values, keeping the original
# column names and row labels
X = pd.DataFrame(data=imputer.fit_transform(X), columns=X.columns, index=X.index)
X.head()

We can clearly see that now `NaN` values have been replaced by the imputed data.

Let's now explore our dataset to look for potential correlations among variables.

In [None]:
# heatmap of the pairwise correlations between the columns of X
f, ax = plt.subplots(1,1,figsize=(15,15))
sns.heatmap(X.corr(), linewidths=0.5, cmap="coolwarm")
plt.show()

Unfortunately, only a few variables correlate with each other. However, as expected, blood values strongly correlate with each other.

Our dataset contains 87 variables, so it is not possible to visualize how data distribute in a human interpretable manner. <br>

Luckily, there exist dimensionality reduction techniques to visualize such multi-dimensional data. Let's use **Principal Component Analysis (PCA)**. Briefly, PCA computes the components (directions) along which the dataset's samples vary the most. Moreover, it allows us to visualize multidimensional datasets in two dimensions. <br>

Let's now visualize our dataset using dimensionality reduction via PCA.

In [None]:
# project the dataset on its first two principal components and plot them
X_pcs = compute_pca(X)
plot_pca(X_pcs, "Original dataset")

It is clear that there are some strong outliers in our dataset. We should remove them before proceeding.

In [None]:
# remove outliers: visits whose PC1 exceeds 3500 or whose PC2 exceeds 1000
# NOTE(review): thresholds presumably read off the PCA plot above — confirm
visits_to_remove = X_pcs[(X_pcs.PC1 > 3500) | (X_pcs.PC2 > 1000)].index.tolist()
X.drop(visits_to_remove, axis=0, inplace=True)
# adjust Y accordingly
Y.drop(visits_to_remove, axis=0, inplace=True)
# recompute PCs
X_pcs = compute_pca(X)
# plot PCs
plot_pca(X_pcs, "Filtered dataset")

Now our data appear to be more distributed. However, they still seem to be very close to each other, and no clear separation can be observed. <br>
Let's try to process our data through **normalization** and **standard scaling**.

Let's begin with data standardization.

In [None]:
# standardize numerical data (work on a copy so that X stays untouched)
X_std = X.copy()
# numerical columns = all columns that are not in cat_vars
numerical_columns = list(set(X_std.columns).difference(cat_vars))  # get numerical cols
X_num = X_std[numerical_columns]
scaler = StandardScaler()  # initialize scaler
scaler.fit(X_num)  # fit the scaler on the numerical columns
X_num = scaler.transform(X_num)  # apply the scaling
# write the scaled values back; categorical columns are left as they are
X_std[numerical_columns] = X_num
X_std.head()

Now, we can plot our data after standard scaling.

In [None]:
# compute PCs of the standardized data and plot them
X_pcs = compute_pca(X_std)
plot_pca(X_pcs, "Standard scaling")

Interestingly our dataset appears to be more distributed than before. Usually this is good, but let's see if there exists some separation between our two classes.

In [None]:
# add class labels to PCs
X_pcs["Death"] = Y.tolist()
# plot data, colouring each point by its class label
palette=["#D3880E", "#3B1375"]
f,ax = plt.subplots(1,1,figsize=(15,10))
sns.scatterplot(data=X_pcs, x="PC1", y="PC2", ax=ax, hue="Death", palette=palette)
ax.set_xlabel("PC1", size=16)
ax.set_ylabel("PC2", size=16)
ax.set_title("Standard scaling with labels", size=18)
plt.show()  # display plot

As it was clear at first glance, there is no clear separation between our two classes. Let's see if with normalization something changes (very unlikely!).

In [None]:
# normalize numerical data (work on a copy so that X stays untouched)
X_norm = X.copy()
# numerical columns = all columns that are not in cat_vars
numerical_columns = list(set(X_norm.columns).difference(cat_vars))  # get numerical cols
X_num = X_norm[numerical_columns]
# NOTE(review): sklearn's normalize() rescales each ROW (sample) to unit norm
# by default, not each column — confirm that per-sample scaling is intended
X_norm[numerical_columns] = normalize(X_num)  # normalize data
X_norm.head()

Let's now plot our normalized data.

In [None]:
# compute PCs of the normalized data and plot them
X_pcs = compute_pca(X_norm)
plot_pca(X_pcs, "Normalization")

The result is similar to standard scaling, so we don't expect that there will be a clear separation between our two classes.

In [None]:
# add class labels to PCs
X_pcs["Death"] = Y.tolist()
# plot data, colouring each point by its class label
palette=["#D3880E", "#3B1375"]
f,ax = plt.subplots(1,1,figsize=(15,10))
sns.scatterplot(data=X_pcs, x="PC1", y="PC2", ax=ax, hue="Death", palette=palette)
ax.set_xlabel("PC1", size=16)
ax.set_ylabel("PC2", size=16)
# these PCs come from the normalized data, so the title must say so
ax.set_title("Normalization with labels", size=18)
plt.show()  # display plot

## Models training and evaluation

Since we completed our exploratory analysis, we can now begin trying to fit some models to our data and see what happens.<br>

We will consider different ML models and methods suited for data classification:
- **K-means**
- **KNN**
- **Logistic regression**
- **SVM (linear kernel)**
- **Decision trees**

In the following we will use original, standardized, and normalized data.

Each model will be evaluated using:
- **Mean accuracy**
- **F1-score**

### K-means

Original data

In [None]:
# compute clusters with k-means
# run 10 times, each run with its own seed: re-fitting a single estimator
# whose random_state is fixed would reproduce the same clustering 10 times
f1 = []
for seed in range(10):
    kmeans = KMeans(
        n_clusters=2, init="random", max_iter=1000, random_state=seed
    )
    predictions = kmeans.fit_predict(X)
    # compute F1-score
    # NOTE(review): cluster ids are arbitrary, so cluster 1 need not match
    # DEATH == 1 — consider aligning cluster ids with the classes before scoring
    f1.append(f1_score(Y, predictions, average="binary"))
assert len(f1) == 10
# visualize results: clusters of the last run, F1-score averaged over all runs
visualize_results(X, Y, predictions, "K-means", np.mean(f1))

Standardized data

In [None]:
# compute clusters with k-means
# run 10 times, each run with its own seed: re-fitting a single estimator
# whose random_state is fixed would reproduce the same clustering 10 times
f1 = []
for seed in range(10):
    kmeans = KMeans(
        n_clusters=2, init="random", max_iter=1000, random_state=seed
    )
    predictions = kmeans.fit_predict(X_std)
    # compute F1-score
    # NOTE(review): cluster ids are arbitrary, so cluster 1 need not match
    # DEATH == 1 — consider aligning cluster ids with the classes before scoring
    f1.append(f1_score(Y, predictions, average="binary"))
assert len(f1) == 10
# visualize results: clusters of the last run, F1-score averaged over all runs
visualize_results(X_std, Y, predictions, "K-means", np.mean(f1))

Normalized data

In [None]:
# compute clusters with k-means
# run 10 times, each run with its own seed: re-fitting a single estimator
# whose random_state is fixed would reproduce the same clustering 10 times
f1 = []
for seed in range(10):
    kmeans = KMeans(
        n_clusters=2, init="random", max_iter=1000, random_state=seed
    )
    predictions = kmeans.fit_predict(X_norm)
    # compute F1-score
    # NOTE(review): cluster ids are arbitrary, so cluster 1 need not match
    # DEATH == 1 — consider aligning cluster ids with the classes before scoring
    f1.append(f1_score(Y, predictions, average="binary"))
assert len(f1) == 10
# visualize results: clusters of the last run, F1-score averaged over all runs
visualize_results(X_norm, Y, predictions, "K-means", np.mean(f1))

### KNN algorithm

Original data

In [None]:
# classify with KNN, trying every number of neighbours from 3 to 10;
# each setting is scored by the mean macro F1 over a 6-fold cross validation
scores_tot = []
for n in range(3, 11):
    neigh = KNeighborsClassifier(n_neighbors=n)
    scores = cross_val_score(
        neigh, X, Y, cv=6, scoring="f1_macro"
    )
    scores_tot.append(np.mean(scores))
# find the best mean score and its position in scores_tot
best = -1
best_idx = -1
for i,s in enumerate(scores_tot):
    if s > best:
        best = s
        best_idx = i
assert best > 0
assert best_idx >= 0
# position 0 corresponds to 3 neighbours, hence the + 3
print(f"Best F1-score obtained using {best_idx + 3} neighbours")
print("Mean F1-score: %.2f" % (best))

Standardized data

In [None]:
# classify with KNN, trying every number of neighbours from 3 to 10;
# each setting is scored by the mean macro F1 over a 6-fold cross validation
scores_tot = []
for n in range(3, 11):
    neigh = KNeighborsClassifier(n_neighbors=n)
    scores = cross_val_score(
        neigh, X_std, Y, cv=6, scoring="f1_macro"
    )
    scores_tot.append(np.mean(scores))
# find the best mean score and its position in scores_tot
best = -1
best_idx = -1
for i,s in enumerate(scores_tot):
    if s > best:
        best = s
        best_idx = i
assert best > 0
assert best_idx >= 0
# position 0 corresponds to 3 neighbours, hence the + 3
print(f"Best F1-score obtained using {best_idx + 3} neighbours")
print("Mean F1-score: %.2f" % (best))

Normalized data

In [None]:
# classify with KNN, trying every number of neighbours from 3 to 10;
# each setting is scored by the mean macro F1 over a 6-fold cross validation
scores_tot = []
for n in range(3, 11):
    neigh = KNeighborsClassifier(n_neighbors=n)
    scores = cross_val_score(
        neigh, X_norm, Y, cv=6, scoring="f1_macro"
    )
    scores_tot.append(np.mean(scores))
# find the best mean score and its position in scores_tot
best = -1
best_idx = -1
for i,s in enumerate(scores_tot):
    if s > best:
        best = s
        best_idx = i
assert best > 0
assert best_idx >= 0
# position 0 corresponds to 3 neighbours, hence the + 3
print(f"Best F1-score obtained using {best_idx + 3} neighbours")
print("Mean F1-score: %.2f" % (best))

### Logistic regression

Original data

In [None]:
# classify data using logistic regression
clf = LogisticRegression(random_state=0, max_iter=10000)
# split dataset in training (75%) and testing (25%) sets; the split is
# seeded so that the reported scores are the same on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=0
)
clf.fit(X_train, Y_train)
predictions = clf.predict(X_test)
# accuracy and F1-score on the held-out test set
acc = clf.score(X_test, Y_test)
f1 = f1_score(Y_test, predictions, average="binary")
print("Accuracy: %.2f" % (acc))
print("F1-score: %.2f" % (f1))
# visualize results
visualize_results(X_test, Y_test, predictions, "Original data", f1)

In [None]:
# evaluate the model with cross validation
# (6 folds, macro-averaged F1; the mean over the folds is printed)
clf = LogisticRegression(random_state=0, max_iter=10000)
scores = cross_val_score(
    clf, X, Y, cv=6, scoring="f1_macro"
)
print("Average F1-score: %.2f" % (np.mean(scores)))

Standardized data

In [None]:
# classify data using logistic regression
clf = LogisticRegression(random_state=0, max_iter=10000)
# split dataset in training (75%) and testing (25%) sets; the split is
# seeded so that the reported scores are the same on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    X_std, Y, test_size=0.25, random_state=0
)
clf.fit(X_train, Y_train)
predictions = clf.predict(X_test)
# accuracy and F1-score on the held-out test set
acc = clf.score(X_test, Y_test)
f1 = f1_score(Y_test, predictions, average="binary")
print("Accuracy: %.2f" % (acc))
print("F1-score: %.2f" % (f1))
# visualize results (the title names the data actually used in this cell)
visualize_results(X_test, Y_test, predictions, "Standardized data", f1)

In [None]:
# evaluate the model with cross validation
# (6 folds, macro-averaged F1; the mean over the folds is printed)
clf = LogisticRegression(random_state=0, max_iter=10000)
scores = cross_val_score(
    clf, X_std, Y, cv=6, scoring="f1_macro"
)
print("Average F1-score: %.2f" % (np.mean(scores)))

Normalized data

In [None]:
# classify data using logistic regression
clf = LogisticRegression(random_state=0, max_iter=10000)
# split dataset in training (75%) and testing (25%) sets; the split is
# seeded so that the reported scores are the same on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    X_norm, Y, test_size=0.25, random_state=0
)
clf.fit(X_train, Y_train)
predictions = clf.predict(X_test)
# accuracy and F1-score on the held-out test set
acc = clf.score(X_test, Y_test)
f1 = f1_score(Y_test, predictions, average="binary")
print("Accuracy: %.2f" % (acc))
print("F1-score: %.2f" % (f1))
# visualize results (the title names the data actually used in this cell)
visualize_results(X_test, Y_test, predictions, "Normalized data", f1)

In [None]:
# evaluate the model with cross validation
# (6 folds, macro-averaged F1; the mean over the folds is printed)
clf = LogisticRegression(random_state=0, max_iter=10000)
scores = cross_val_score(
    clf, X_norm, Y, cv=6, scoring="f1_macro"
)
print("Average F1-score: %.2f" % (np.mean(scores)))

The performance of our model is not bad, but it is not good enough either. Some variables could be introducing bias into the model. Therefore, we should remove such variables and train the model only on the features that really characterize the data.<br>

To perform this task we will use **LASSO**.

Original data

In [None]:
# try feature selection using LASSO, i.e. an L1-penalised model: the L1
# penalty shrinks the coefficients of uninformative features to zero and
# SelectFromModel keeps only the features whose coefficient is not
# (numerically) zero.
# solver="liblinear" is required because the default solver does not
# support penalty="l1"; random_state makes the fit reproducible.
clf = LogisticRegression(
    C=0.01, penalty="l1", solver="liblinear", random_state=0, max_iter=10000
).fit(X, Y)
model = SelectFromModel(clf, prefit=True)
X_feat_sel = model.transform(X)
print(
    f"Dataset shape before feature selection: {X.shape[0]} x {X.shape[1]}"
)
print(
    f"Dataset shape after feature selection: {X_feat_sel.shape[0]} x {X_feat_sel.shape[1]}"
)

In [None]:
# evaluate our model with cross validation on the selected features
# (6 folds, macro-averaged F1; the mean over the folds is printed)
# NOTE(review): X_feat_sel was selected with a model fitted on all of X and Y,
# so every test fold has already influenced the selection — the score may be
# optimistic; consider selecting features inside each fold
clf = LogisticRegression(random_state=0, max_iter=10000)
scores = cross_val_score(
    clf, X_feat_sel, Y, cv=6, scoring="f1_macro"
)
print("Average F1-score: %.2f" % (np.mean(scores)))

Interestingly, we got better performances than before. Let's repeat the procedure on the standardized and normalized data.

Standardized data

In [None]:
# try feature selection using LASSO, i.e. an L1-penalised model: the L1
# penalty shrinks the coefficients of uninformative features to zero and
# SelectFromModel keeps only the features whose coefficient is not
# (numerically) zero.
# solver="liblinear" is required because the default solver does not
# support penalty="l1"; random_state makes the fit reproducible.
clf = LogisticRegression(
    C=0.01, penalty="l1", solver="liblinear", random_state=0, max_iter=10000
).fit(X_std, Y)
model = SelectFromModel(clf, prefit=True)
X_feat_sel = model.transform(X_std)
print(
    f"Dataset shape before feature selection: {X_std.shape[0]} x {X_std.shape[1]}"
)
print(
    f"Dataset shape after feature selection: {X_feat_sel.shape[0]} x {X_feat_sel.shape[1]}"
)

In [None]:
# evaluate our model with cross validation on the selected features
# (6 folds, macro-averaged F1; the mean over the folds is printed)
# NOTE(review): X_feat_sel was selected with a model fitted on all the data,
# so every test fold has already influenced the selection — the score may be
# optimistic; consider selecting features inside each fold
clf = LogisticRegression(random_state=0, max_iter=10000)
scores = cross_val_score(
    clf, X_feat_sel, Y, cv=6, scoring="f1_macro"
)
print("Average F1-score: %.2f" % (np.mean(scores)))

Normalized data

In [None]:
# try feature selection using LASSO, i.e. an L1-penalised model: the L1
# penalty shrinks the coefficients of uninformative features to zero and
# SelectFromModel keeps only the features whose coefficient is not
# (numerically) zero.
# solver="liblinear" is required because the default solver does not
# support penalty="l1"; random_state makes the fit reproducible.
clf = LogisticRegression(
    C=0.01, penalty="l1", solver="liblinear", random_state=0, max_iter=10000
).fit(X_norm, Y)
model = SelectFromModel(clf, prefit=True)
X_feat_sel = model.transform(X_norm)
print(
    f"Dataset shape before feature selection: {X_norm.shape[0]} x {X_norm.shape[1]}"
)
print(
    f"Dataset shape after feature selection: {X_feat_sel.shape[0]} x {X_feat_sel.shape[1]}"
)

In [None]:
# evaluate our model with cross validation on the selected features
# (6 folds, macro-averaged F1; the mean over the folds is printed)
# NOTE(review): X_feat_sel was selected with a model fitted on all the data,
# so every test fold has already influenced the selection — the score may be
# optimistic; consider selecting features inside each fold
clf = LogisticRegression(random_state=0, max_iter=10000)
scores = cross_val_score(
    clf, X_feat_sel, Y, cv=6, scoring="f1_macro"
)
print("Average F1-score: %.2f" % (np.mean(scores)))

### Support Vector Machines

Original data

In [None]:
# classify data using SVM
clf = LinearSVC(random_state=0, max_iter=100000)
# split dataset in training (75%) and testing (25%) sets; the split is
# seeded so that the reported scores are the same on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=0
)
clf.fit(X_train, Y_train)
predictions = clf.predict(X_test)
# accuracy and F1-score on the held-out test set
acc = clf.score(X_test, Y_test)
f1 = f1_score(Y_test, predictions, average="binary")
print("Accuracy: %.2f" % (acc))
print("F1-score: %.2f" % (f1))
# visualize results
visualize_results(X_test, Y_test, predictions, "Original data", f1)

In [None]:
# evaluate our model with cross validation
# (6 folds, macro-averaged F1; the mean over the folds is printed)
clf = LinearSVC(random_state=0, max_iter=100000)
scores = cross_val_score(
    clf, X, Y, cv=6, scoring="f1_macro"
)
print("Average F1-score: %.2f" % (np.mean(scores)))

Standardized data

In [None]:
# classify data using SVM
clf = LinearSVC(random_state=0, max_iter=100000)
# split dataset in training (75%) and testing (25%) sets; the split is
# seeded so that the reported scores are the same on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    X_std, Y, test_size=0.25, random_state=0
)
clf.fit(X_train, Y_train)
predictions = clf.predict(X_test)
# accuracy and F1-score on the held-out test set
acc = clf.score(X_test, Y_test)
f1 = f1_score(Y_test, predictions, average="binary")
print("Accuracy: %.2f" % (acc))
print("F1-score: %.2f" % (f1))
# visualize results (the title names the data actually used in this cell)
visualize_results(X_test, Y_test, predictions, "Standardized data", f1)

In [None]:
# evaluate our model with cross validation
# (6 folds, macro-averaged F1; the mean over the folds is printed)
clf = LinearSVC(random_state=0, max_iter=100000)
scores = cross_val_score(
    clf, X_std, Y, cv=6, scoring="f1_macro"
)
print("Average F1-score: %.2f" % (np.mean(scores)))

Normalized data

In [None]:
# classify data using SVM
clf = LinearSVC(random_state=0, max_iter=100000)
# split dataset in training (75%) and testing (25%) sets; the split is
# seeded so that the reported scores are the same on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    X_norm, Y, test_size=0.25, random_state=0
)
clf.fit(X_train, Y_train)
predictions = clf.predict(X_test)
# accuracy and F1-score on the held-out test set
acc = clf.score(X_test, Y_test)
f1 = f1_score(Y_test, predictions, average="binary")
print("Accuracy: %.2f" % (acc))
print("F1-score: %.2f" % (f1))
# visualize results (the title names the data actually used in this cell)
visualize_results(X_test, Y_test, predictions, "Normalized data", f1)

In [None]:
# evaluate our model with cross validation
# (6 folds, macro-averaged F1; the mean over the folds is printed)
clf = LinearSVC(random_state=0, max_iter=100000)
scores = cross_val_score(
    clf, X_norm, Y, cv=6, scoring="f1_macro"
)
print("Average F1-score: %.2f" % (np.mean(scores)))

As previously done with logistic regression, let's try to perform some feature selection using LASSO to improve the performance of our SVM classifier.

Original data

In [None]:
# try feature selection using LASSO: fit an L1-penalised linear SVM and let
# SelectFromModel pick the features to keep based on the fitted coefficients
clf = LinearSVC(C=0.01, penalty="l1", dual=False, max_iter=100000).fit(X, Y)
model = SelectFromModel(clf, prefit=True)
X_feat_sel = model.transform(X)
print(
    f"Dataset shape before feature selection: {X.shape[0]} x {X.shape[1]}"
)
print(
    f"Dataset shape after feature selection: {X_feat_sel.shape[0]} x {X_feat_sel.shape[1]}"
)  # kept 29 vars

In [None]:
# evaluate the model with cross-validation on the selected features
# (6 folds, macro-averaged F1; the mean over the folds is printed)
# NOTE(review): X_feat_sel was selected with a model fitted on all the data,
# so every test fold has already influenced the selection — the score may be
# optimistic; consider selecting features inside each fold
clf = LinearSVC(max_iter=100000, random_state=0)
scores = cross_val_score(
    clf, X_feat_sel, Y, cv=6, scoring="f1_macro"
)
print("Average F1-score: %.2f" % (np.mean(scores)))

Standardized data

In [None]:
# try feature selection using LASSO: fit an L1-penalised linear SVM and let
# SelectFromModel pick the features to keep based on the fitted coefficients
clf = LinearSVC(C=0.01, penalty="l1", dual=False, max_iter=100000).fit(X_std, Y)
model = SelectFromModel(clf, prefit=True)
X_feat_sel = model.transform(X_std)
print(
    f"Dataset shape before feature selection: {X_std.shape[0]} x {X_std.shape[1]}"
)
print(
    f"Dataset shape after feature selection: {X_feat_sel.shape[0]} x {X_feat_sel.shape[1]}"
)  # kept 21 vars

In [None]:
# evaluate the model with cross-validation on the selected features
# (6 folds, macro-averaged F1; the mean over the folds is printed)
# NOTE(review): X_feat_sel was selected with a model fitted on all the data,
# so every test fold has already influenced the selection — the score may be
# optimistic; consider selecting features inside each fold
clf = LinearSVC(max_iter=100000, random_state=0)
scores = cross_val_score(
    clf, X_feat_sel, Y, cv=6, scoring="f1_macro"
)
print("Average F1-score: %.2f" % (np.mean(scores)))

Normalized data

In [None]:
# try feature selection using LASSO: fit an L1-penalised linear SVM and let
# SelectFromModel pick the features to keep based on the fitted coefficients
clf = LinearSVC(C=0.01, penalty="l1", dual=False, max_iter=100000).fit(X_norm, Y)
model = SelectFromModel(clf, prefit=True)
X_feat_sel = model.transform(X_norm)
print(
    f"Dataset shape before feature selection: {X_norm.shape[0]} x {X_norm.shape[1]}"
)
print(
    f"Dataset shape after feature selection: {X_feat_sel.shape[0]} x {X_feat_sel.shape[1]}"
)  # kept 6 vars

In [None]:
# evaluate the model with cross-validation on the selected features
# (6 folds, macro-averaged F1; the mean over the folds is printed)
# NOTE(review): X_feat_sel was selected with a model fitted on all the data,
# so every test fold has already influenced the selection — the score may be
# optimistic; consider selecting features inside each fold
clf = LinearSVC(max_iter=100000, random_state=0)
scores = cross_val_score(
    clf, X_feat_sel, Y, cv=6, scoring="f1_macro"
)
print("Average F1-score: %.2f" % (np.mean(scores)))

### Decision trees

Original data

In [None]:
# classify our data using decision trees
# (tree and split are both seeded so that the tree and its score are the
# same on every run)
clf = DecisionTreeClassifier(random_state=0)
# split dataset in training (75%) and testing (25%) sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=0
)
# classify data
clf.fit(X_train, Y_train)
predictions = clf.predict(X_test)
f1 = f1_score(Y_test, predictions, average="binary")
print("F1-score: %.2f" % (f1))
# visualize the tree
plt.figure(figsize=(15,10))
plot_tree(clf)
plt.show()  # display plot

In [None]:
# evaluate the model with cross-validation
# (seeded tree for reproducible scores; 6 folds like every other model in
# this analysis, so that the averages are comparable)
clf = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(
    clf, X, Y, cv=6, scoring="f1_macro"
)
print("Average F1-score: %.2f" % (np.mean(scores)))

Standardized data

In [None]:
# classify our data using decision trees
# (tree and split are both seeded so that the tree and its score are the
# same on every run)
clf = DecisionTreeClassifier(random_state=0)
# split dataset in training (75%) and testing (25%) sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X_std, Y, test_size=0.25, random_state=0
)
# classify data
clf.fit(X_train, Y_train)
predictions = clf.predict(X_test)
f1 = f1_score(Y_test, predictions, average="binary")
print("F1-score: %.2f" % (f1))
# visualize the tree
plt.figure(figsize=(15,10))
plot_tree(clf)
plt.show()  # display plot

In [None]:
# evaluate the model with cross-validation
# (seeded tree for reproducible scores; 6 folds like every other model in
# this analysis, so that the averages are comparable)
clf = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(
    clf, X_std, Y, cv=6, scoring="f1_macro"
)
print("Average F1-score: %.2f" % (np.mean(scores)))

Normalized data

In [None]:
# classify our data using decision trees
# (tree and split are both seeded so that the tree and its score are the
# same on every run)
clf = DecisionTreeClassifier(random_state=0)
# split dataset in training (75%) and testing (25%) sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X_norm, Y, test_size=0.25, random_state=0
)
# classify data
clf.fit(X_train, Y_train)
predictions = clf.predict(X_test)
f1 = f1_score(Y_test, predictions, average="binary")
print("F1-score: %.2f" % (f1))
# visualize the tree
plt.figure(figsize=(15,10))
plot_tree(clf)
plt.show()  # display plot

In [None]:
# evaluate the model with cross-validation
# (seeded tree for reproducible scores; 6 folds like every other model in
# this analysis, so that the averages are comparable)
clf = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(
    clf, X_norm, Y, cv=6, scoring="f1_macro"
)
print("Average F1-score: %.2f" % (np.mean(scores)))