<a href="https://colab.research.google.com/github/Leomutz/NIDS1/blob/main/SACAIR2024_tut.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score
from sklearn.decomposition import PCA

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

!pip install umap-learn
import umap



## Make some data

The data has already been simulated, so we will load it from GitHub. The simulation code is shown below for reference:

```python
n_samples = 1000
# Generate some data
X0, y = make_classification(n_samples=n_samples, n_features=100,
                           n_informative=10, n_redundant=80,
                           n_repeated=10, random_state=1)
X1 = []
# add in some low variance features
for i in range(0, 50):
    loc = np.random.randint(0,10)
    scale = i * 0.02
    x = np.random.normal(loc=loc, scale=scale, size=(1, 1000)).tolist()[0]
    X1.append(x)

X1 = np.array(X1).T


data = (pd.concat([pd.DataFrame(X0), pd.DataFrame(X1)], axis=1))
new_index = np.arange(0, (X0.shape[1] + X1.shape[1]) )
data.columns = new_index

data = data[np.random.choice(new_index, len(new_index))]
data.columns = new_index
data['target'] = y
data.to_csv(f'{t}.csv', index=False)
```

In [None]:
# Location of the pre-simulated dataset (a CSV file hosted on GitHub).
path_to_data = (
    "https://raw.githubusercontent.com/ArmandBester/SACAIR2024_dim_curse/refs/heads/main/candidate-2024-08-21_071931.647000.csv"
)
data = pd.read_csv(path_to_data)
data

In [None]:
# Quick overview: plot every column against the row index. The legend is
# switched off and the trailing `;` hides the matplotlib repr.
data.plot(figsize=(12,4), legend=None);

In [None]:
data.shape

## Split the data into training and testing sets

In [None]:
# Separate the label vector from the feature matrix, then hold out 33% of the
# rows for testing (fixed seed so the split is the same on every run).
y = data['target'].to_numpy()
X = data.drop(columns="target").to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

## Look at variance in the features

In [None]:
# Variance of each feature column, measured on the training split only.
# Keys are positional column indices.
feature_var_dict = {
    col_idx: np.var(X_train[:, col_idx]) for col_idx in range(X.shape[1])
}

# One row per feature, highest variance first; feature ids are stored as
# strings so they can be matched against the DataFrame's column labels.
var_df = (
    pd.Series(feature_var_dict, name="variance")
    .rename_axis("feature")
    .reset_index()
    .sort_values("variance", ascending=False)
    .astype({"feature": str})
)

px.bar(data_frame=var_df, x='feature', y='variance')

## Scale

In [None]:
# Learn the per-feature min/max from the training split only, then apply the
# same transformation to both splits.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    min_max_scaler.transform(split) for split in (X_train, X_test)
)

plt.figure(figsize=(12, 4))
plt.plot(X_train_scaled);

## First model

In [None]:
# Baseline random forest on all scaled features.
# Seeded (like the forest used for feature importance further down) so the
# reported scores are reproducible after Restart & Run All.
model1_RF = RandomForestClassifier(random_state=42)
model1_RF.fit(X_train_scaled, y_train)
model1_RF_pred = model1_RF.predict(X_test_scaled)

fig, ax = plt.subplots(figsize=(3, 3))
disp = ConfusionMatrixDisplay(
    confusion_matrix(y_test, model1_RF_pred),
    display_labels=model1_RF.classes_,
)
disp.plot(ax=ax);

print(classification_report(y_test, model1_RF_pred))

In [None]:
# KMeans is unsupervised, so the labels are not passed to fit().
model1_KM = KMeans(n_clusters=2, random_state=0, n_init="auto")
model1_KM.fit(X_train_scaled)

# Cluster ids are arbitrary (cluster 0 need not be class 0). Map every cluster
# to the majority class of its training members before scoring, otherwise a
# perfectly good clustering can be reported as almost entirely wrong.
# Assumes integer 0/1 labels.
cluster_to_class = np.array([
    np.bincount(y_train[model1_KM.labels_ == cluster], minlength=2).argmax()
    for cluster in range(model1_KM.n_clusters)
])
model1_KM_pred = cluster_to_class[model1_KM.predict(X_test_scaled)]

fig, ax = plt.subplots(figsize=(3, 3))
disp = ConfusionMatrixDisplay(
    confusion_matrix(y_test, model1_KM_pred)
)
disp.plot(ax=ax);


print(classification_report(y_test, model1_KM_pred))

In [None]:
# The classes are encoded as zeros and ones, so the mean of each label vector
# is the fraction of ones -- a quick way to look at the class balance of
# both splits.
print(y_train.mean(), y_test.mean())

## Remove columns with very low variance

In [None]:
# Names (as strings) of the features whose training-set variance is below 1.
is_low_variance = var_df["variance"] < 1
low_var_features = var_df.loc[is_low_variance, "feature"].astype(str).tolist()
len(low_var_features)

### Remove low variance features

In [None]:
# Drop the low-variance columns. `errors="ignore"` keeps the cell re-runnable:
# `data` is reassigned here, so on a second run the columns are already gone
# and a plain drop would raise a KeyError.
data = data.drop(columns=low_var_features, errors="ignore")

# Rebuild the split (same seed, so the same rows land in each split) and
# refit the scaler on the reduced feature set.
X = data.drop("target", axis=1).values
y = data['target'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

min_max_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = min_max_scaler.transform(X_train)
X_test_scaled = min_max_scaler.transform(X_test)

In [None]:
X_train_scaled.shape

In [None]:
# Random forest after removing the low-variance features.
# Seeded so that differences from the baseline come from the feature set,
# not from run-to-run randomness in the forest.
model2_RF = RandomForestClassifier(random_state=42)
model2_RF.fit(X_train_scaled, y_train)
model2_RF_pred = model2_RF.predict(X_test_scaled)

fig, ax = plt.subplots(figsize=(3, 3))
disp = ConfusionMatrixDisplay(
    confusion_matrix(y_test, model2_RF_pred),
    display_labels=model2_RF.classes_,
)
disp.plot(ax=ax);

print(classification_report(y_test, model2_RF_pred))

In [None]:
# KMeans after removing the low-variance features (unsupervised: no labels
# are passed to fit()).
model2_KM = KMeans(n_clusters=2, random_state=0, n_init="auto")
model2_KM.fit(X_train_scaled)

# Cluster ids are arbitrary, so translate each cluster into the majority class
# of its training members before comparing against the true labels.
# Assumes integer 0/1 labels.
cluster_to_class = np.array([
    np.bincount(y_train[model2_KM.labels_ == cluster], minlength=2).argmax()
    for cluster in range(model2_KM.n_clusters)
])
model2_KM_pred = cluster_to_class[model2_KM.predict(X_test_scaled)]

fig, ax = plt.subplots(figsize=(3, 3))
disp = ConfusionMatrixDisplay(
    confusion_matrix(y_test, model2_KM_pred)
)
disp.plot(ax=ax);


print(classification_report(y_test, model2_KM_pred))

## Get highly correlated features

* reduce redundancy

In [None]:
# Pairwise correlations between the remaining feature columns (the label
# column is excluded), shown as a heatmap.
feature_columns = data.drop(columns='target')
corrMat = feature_columns.corr()
sns.heatmap(corrMat, cmap='viridis');

In [None]:
corrMat

In [None]:
# Pick the redundant features to throw away.
#
# The correlation matrix is symmetric, so listing every feature that takes part
# in a strongly correlated pair would discard BOTH members of the pair and
# lose the information entirely. Instead, each feature is compared only with
# the features that come before it in the matrix: it is discarded when it is
# strongly correlated with at least one earlier feature, so the first member
# of every correlated group is kept.
corr_cutoff = 0.85   # this cutoff can vary depending on what we know about our data
abs_corr = corrMat.abs()   # the sign of the correlation does not matter here

discard = np.array(
    [
        column
        for position, column in enumerate(abs_corr.columns)
        if (abs_corr.iloc[:position, position] >= corr_cutoff).any()
    ],
    dtype=object,
)
discard

In [None]:
len(discard)

In [None]:
# Drop the redundant columns. `errors="ignore"` keeps the cell re-runnable:
# `data` is reassigned here, so on a second run the columns are already gone
# and a plain drop would raise a KeyError.
data = data.drop(columns=discard, errors="ignore")

# Rebuild the split (same seed) and refit the scaler on the reduced feature set.
X = data.drop("target", axis=1).values
y = data['target'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

min_max_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = min_max_scaler.transform(X_train)
X_test_scaled = min_max_scaler.transform(X_test)
X_train_scaled.shape

In [None]:
# Random forest after also removing the highly correlated features.
# Seeded so the comparison with the earlier forests is reproducible.
model3_RF = RandomForestClassifier(random_state=42)
model3_RF.fit(X_train_scaled, y_train)
model3_RF_pred = model3_RF.predict(X_test_scaled)

fig, ax = plt.subplots(figsize=(3, 3))
disp = ConfusionMatrixDisplay(
    confusion_matrix(y_test, model3_RF_pred),
    display_labels=model3_RF.classes_,
)
disp.plot(ax=ax);

print(classification_report(y_test, model3_RF_pred))

In [None]:
# KMeans after also removing the highly correlated features (unsupervised:
# no labels are passed to fit()).
model3_KM = KMeans(n_clusters=2, random_state=0, n_init="auto")
model3_KM.fit(X_train_scaled)

# Cluster ids are arbitrary, so translate each cluster into the majority class
# of its training members before comparing against the true labels.
# Assumes integer 0/1 labels.
cluster_to_class = np.array([
    np.bincount(y_train[model3_KM.labels_ == cluster], minlength=2).argmax()
    for cluster in range(model3_KM.n_clusters)
])
model3_KM_pred = cluster_to_class[model3_KM.predict(X_test_scaled)]

fig, ax = plt.subplots(figsize=(3, 3))
disp = ConfusionMatrixDisplay(
    confusion_matrix(y_test, model3_KM_pred),
)
disp.plot(ax=ax);


print(classification_report(y_test, model3_KM_pred))

## What if we don't discard correlated features but combine them?

In [None]:
# Long-format table of every (A, B) feature pair with its signed correlation.
# Note: no absolute value is taken here, so only strong *positive*
# correlations pass the cutoff below.
corr_pairs = (
    corrMat.stack()
    .reset_index()
    .rename(columns={'level_0': 'A', 'level_1': 'B', 0: 'corr'})
)

# Ignore the diagonal and keep the pairs at or above the cutoff (the cutoff
# can vary depending on what we know about our data), then average them.
is_off_diagonal = corr_pairs['A'] != corr_pairs['B']
is_strong = corr_pairs['corr'] >= 0.85
corr_pairs.loc[is_off_diagonal & is_strong, 'corr'].mean()

```
Although it does not make sense in our case (since the variables are so highly correlated),
one can also combine variables: multiplying them together, summing, dividing, taking the mean, ...
Better still, if you have some domain knowledge about the data, some of these ways of combining
will make more sense than others. As an example, say you had a dataset for predicting house prices and you have width
and length; multiplying these would give you the area. It then makes sense to keep area
and remove width and length.
```

## How can we find feature importance?

Let's import our data from scratch again and see what we can find

In [None]:
# Start again from the full dataset as stored in the CSV.
data = pd.read_csv(path_to_data)
y = data['target'].to_numpy()
X = data.drop(columns="target").to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Seeded forest trained on the unscaled features; its feature importances
# are read in the next cell.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)

fig, ax = plt.subplots(figsize=(3, 3))
disp = ConfusionMatrixDisplay(confusion_matrix(y_test, rfc_pred))
disp.plot(ax=ax);


print(classification_report(y_test, rfc_pred))

In [None]:
# Pair every feature name with the importance the fitted forest assigned to
# it, most important first.
importance = rfc.feature_importances_
names = data.drop(columns="target").columns
feature_importance_df = (
    pd.DataFrame({"feature": names, "importance": importance})
    .sort_values("importance", ascending=False)
)
feature_importance_df

In [None]:
# Bar chart of the features whose importance is at least 0.005; the rows are
# already sorted by importance, so the bars appear in descending order.
feature_importance_df.query("importance >= 0.005").plot(kind='bar', figsize=(16,4));

In [None]:
# Scatter two feature columns against each other, coloured by class.
# NOTE(review): '99' and '149' are hard-coded column names -- presumably the
# two top-ranked features in feature_importance_df; confirm after re-running.
plt.scatter(data=data, x='99', y='149', c='target');

In [None]:
# Interactive 3-D scatter of three feature columns, coloured by class.
# NOTE(review): '99', '149' and '142' are hard-coded column names --
# presumably the three top-ranked features; confirm against
# feature_importance_df.
fig = px.scatter_3d(data_frame=data, x='99', y='149', z='142', opacity=0.7,
                    color='target', height=800, width=1000)
fig.update_traces(marker_size = 5)
fig.show()

## Test feature importance on models

In [None]:
def transform_data(dataframe=data, top_n_features=2):
    """Build scaled train/test splits from the highest-ranked features.

    The feature ranking is read from the notebook-level
    ``feature_importance_df`` (sorted by importance, best first).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the feature columns and a ``target`` column. The default
        is whatever ``data`` referred to when this cell was executed.
    top_n_features : int
        How many features to take from the top of the ranking.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test)``. The scaler is
        fitted on the training split only.
    """
    selection = feature_importance_df.iloc[0:top_n_features, 0].values
    print("Features in order:", selection, "\n")

    # Read from the frame that was passed in, not from the global `data`;
    # previously the `dataframe` argument was silently ignored.
    X = dataframe[selection].values
    y = dataframe['target'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    min_max_scaler = MinMaxScaler().fit(X_train)
    X_train_scaled = min_max_scaler.transform(X_train)
    X_test_scaled = min_max_scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, y_train, y_test

In [None]:
# Select the number of top-ranked features to keep
X_train_scaled, X_test_scaled, y_train, y_test = transform_data(data, 11)

# KMeans is unsupervised, so the labels are not passed to fit().
model4_KM = KMeans(n_clusters=2, random_state=0, n_init="auto")
model4_KM.fit(X_train_scaled)

# Cluster ids are arbitrary, so translate each cluster into the majority class
# of its training members before comparing against the true labels.
# Assumes integer 0/1 labels.
cluster_to_class = np.array([
    np.bincount(y_train[model4_KM.labels_ == cluster], minlength=2).argmax()
    for cluster in range(model4_KM.n_clusters)
])
model4_KM_pred = cluster_to_class[model4_KM.predict(X_test_scaled)]

fig, ax = plt.subplots(figsize=(3, 3))
disp = ConfusionMatrixDisplay(
    confusion_matrix(y_test, model4_KM_pred),
)
disp.plot(ax=ax);

print(classification_report(y_test, model4_KM_pred))

### Write a for loop to look at number of top features and model performance

In [None]:
%%capture
f1_vs_n_features = {}
for n_features in range(1,150):
  X_train_scaled, X_test_scaled, y_train, y_test = transform_data(data, n_features)

  model4_KM = KMeans(n_clusters=2, random_state=0, n_init="auto")
  model4_KM.fit(X_train_scaled, y_train)
  model4_KM_pred = model4_KM.predict(X_test_scaled)

  f1 = f1_score(y_test, model4_KM_pred, average='weighted', zero_division='warn')
  f1_vs_n_features[n_features] = f1

f1Df  = pd.DataFrame.from_dict(f1_vs_n_features, orient='index').reset_index().rename({'index': 'n_features', 0: 'f1'}, axis=1)

In [None]:

# Weighted F1 of the KMeans model as a function of how many top-ranked
# features were used.
px.line(data_frame=f1Df, x='n_features', y='f1')

# Dimensionality reduction and latent space

## Principal component analysis

In [None]:
# Reload the full dataset so this section starts from all features again.
data = pd.read_csv(path_to_data)

y = data['target'].to_numpy()
X = data.drop(columns="target").to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Scale with statistics learned from the training split only.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    min_max_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Number of principal components to keep.
keep_fisrt_n = 10
pca = PCA(n_components=keep_fisrt_n)
# Fit the PCA on the scaled training split and project that split in one step.
pComponents = pca.fit_transform(X_train_scaled)
pComponents.shape

In [None]:

# Share of the total variance explained by each retained component, in
# component order.
plt.plot(pca.explained_variance_ratio_)

In [None]:
# Training-split projections with the class label attached as a last column,
# then the first two components plotted against each other, coloured by class.
pcaDf = pd.DataFrame(pComponents).assign(target=y_train)
plt.scatter(pcaDf[0], pcaDf[1], c=pcaDf['target'])

In [None]:
# Interactive 3-D scatter of the first three principal components of the
# training split, coloured by class.
fig = px.scatter_3d(data_frame=pcaDf, x=0, y=1, z=2, color='target', height=800, width=1000)
fig.update_traces(marker_size=3)
fig

In [None]:
pComponents.shape

In [None]:
# Project the held-out rows with the PCA that was fitted on the training
# split (transform only, no refit).
pComponents_test = pca.transform(X_test_scaled)
pComponents_test.shape

In [None]:
# KMeans on the principal components (unsupervised: no labels are passed to
# fit()).
PCA_KM = KMeans(n_clusters=2, random_state=0, n_init="auto")
PCA_KM.fit(pComponents)

# Cluster ids are arbitrary, so translate each cluster into the majority class
# of its training members before comparing against the true labels.
# Assumes integer 0/1 labels.
cluster_to_class = np.array([
    np.bincount(y_train[PCA_KM.labels_ == cluster], minlength=2).argmax()
    for cluster in range(PCA_KM.n_clusters)
])
PCA_KM_pred = cluster_to_class[PCA_KM.predict(pComponents_test)]

fig, ax = plt.subplots(figsize=(3, 3))
disp = ConfusionMatrixDisplay(
    confusion_matrix(y_test, PCA_KM_pred),
)
disp.plot(ax=ax);

print(classification_report(y_test, PCA_KM_pred))

In [None]:
# Random forest on the principal components.
# Seeded so the scores are reproducible and comparable with the other forests.
PCA_RFC = RandomForestClassifier(random_state=42)
PCA_RFC.fit(pComponents, y_train)
PCA_RFC_pred = PCA_RFC.predict(pComponents_test)

fig, ax = plt.subplots(figsize=(3, 3))
disp = ConfusionMatrixDisplay(
    confusion_matrix(y_test, PCA_RFC_pred),
    display_labels=PCA_RFC.classes_,
)
disp.plot(ax=ax);

print(classification_report(y_test, PCA_RFC_pred))

## Uniform Manifold Approximation and Projection

In [None]:
# Learn a 3-component UMAP embedding from the scaled training split, then push
# both splits through the fitted reducer.
reducer = umap.UMAP(
    n_neighbors=7, n_components=3, random_state=42, n_jobs=1
).fit(X_train_scaled)
embedding_train, embedding_test = (
    reducer.transform(split) for split in (X_train_scaled, X_test_scaled)
)
embedding_train.shape, embedding_test.shape

In [None]:
# Training-split embedding with the class label attached as a last column;
# the first two embedding dimensions are plotted, coloured by class.
umap_plot_df = pd.DataFrame(embedding_train).assign(target=y_train)
sns.scatterplot(data=umap_plot_df, x=0, y=1, hue='target');

In [None]:
# Interactive 3-D view of the UMAP embedding of the training split, coloured
# by class. `update_traces` returns the figure, so as the last expression of
# the cell it also renders it.
fig = px.scatter_3d(data_frame=umap_plot_df, x=0, y=1, z=2,
                    color='target', height=800, width=1000)
fig.update_traces(marker_size=3)

In [None]:
# KMeans on the UMAP embedding (unsupervised: no labels are passed to fit()).
UMAP_KM = KMeans(n_clusters=2, random_state=0, n_init="auto")
UMAP_KM.fit(embedding_train)

# Cluster ids are arbitrary, so translate each cluster into the majority class
# of its training members before comparing against the true labels.
# Assumes integer 0/1 labels.
cluster_to_class = np.array([
    np.bincount(y_train[UMAP_KM.labels_ == cluster], minlength=2).argmax()
    for cluster in range(UMAP_KM.n_clusters)
])
UMAP_KM_pred = cluster_to_class[UMAP_KM.predict(embedding_test)]

fig, ax = plt.subplots(figsize=(3, 3))
disp = ConfusionMatrixDisplay(
    confusion_matrix(y_test, UMAP_KM_pred),
)
disp.plot(ax=ax);

print(classification_report(y_test, UMAP_KM_pred))

In [None]:
# Random forest on the UMAP embedding.
# Seeded so the scores are reproducible and comparable with the other forests.
UMAP_RFC = RandomForestClassifier(random_state=42)
UMAP_RFC.fit(embedding_train, y_train)
UMAP_RFC_pred = UMAP_RFC.predict(embedding_test)

fig, ax = plt.subplots(figsize=(3, 3))
disp = ConfusionMatrixDisplay(
    confusion_matrix(y_test, UMAP_RFC_pred),
    display_labels=UMAP_RFC.classes_,
)
disp.plot(ax=ax);

print(classification_report(y_test, UMAP_RFC_pred))