# Advanced Machine Learning

## Project 2

## Feature Selection part

In [None]:
import pandas as pd
import numpy as np

### Reading files and preprocessing

In [None]:
# Both files are whitespace-separated and are read without a header row.
# `sep=r'\s+'` is the documented equivalent of `delim_whitespace=True`, which is
# deprecated in recent pandas releases.
artificial_data = pd.read_csv('./data/artificial_train.data', sep=r'\s+', header=None)
# Labels are flattened to a 1-D array so they can be used directly as `y`.
artificial_label = pd.read_csv('./data/artificial_train.labels', sep=r'\s+', header=None).values.ravel()

In [None]:
# Load the SMS corpus, then separate the raw message text from a flat label array.
spam = pd.read_csv('./data/sms_train.csv')
spam_data, spam_label = spam['message'], spam['label'].values.ravel()

Remove punctuation and lowercase

In [None]:
import string

# Translation table that deletes every character listed in string.punctuation.
punctuation_remover = str.maketrans('', '', string.punctuation)
# Strip punctuation first, then lowercase the remaining text.
spam_data = spam_data.str.translate(punctuation_remover).str.lower()

### Train-test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratified 80/20 split with a fixed seed.
# NOTE: the training part overwrites `artificial_data` / `artificial_label`, so
# re-running this cell without reloading the files splits the already reduced
# training set again.
artificial_split = train_test_split(
    artificial_data,
    artificial_label,
    test_size=0.2,
    random_state=0,
    stratify=artificial_label,
)
artificial_data, artificial_data_test, artificial_label, artificial_label_test = artificial_split

In [None]:
# Stratified 80/20 split of the cleaned messages with a fixed seed.
# NOTE: the training part overwrites `spam_data` / `spam_label`, so re-running
# this cell without reloading the data splits the training set a second time.
spam_split = train_test_split(
    spam_data,
    spam_label,
    test_size=0.2,
    random_state=0,
    stratify=spam_label,
)
spam_data, spam_data_test, spam_label, spam_label_test = spam_split

### Dataset preprocessing

I will create three versions of this dataset: one with all words, one with words that appear in at least 0.1% of the messages, and one with words that appear in at least 1% of the messages.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# One vectoriser per vocabulary variant, each fitted on the training messages only.
# 'all' uses CountVectorizer's defaults; the other keys name the min_df threshold used.
vectorisers = {'all': CountVectorizer().fit(spam_data)}
for variant, min_document_share in (('0.01', 0.01), ('0.001', 0.001)):
    vectorisers[variant] = CountVectorizer(min_df=min_document_share).fit(spam_data)

In [None]:
# Word-count matrix of the training messages for every vocabulary variant,
# wrapped in a sparse-backed DataFrame.
spam_train_data = {}
for key, vectoriser in vectorisers.items():
    train_counts = vectoriser.transform(spam_data)
    spam_train_data[key] = pd.DataFrame.sparse.from_spmatrix(train_counts)

In [None]:
# Same transformation for the held-out messages, using the vectorisers fitted on
# the training messages.
spam_test_data = {}
for key, vectoriser in vectorisers.items():
    test_counts = vectoriser.transform(spam_data_test)
    spam_test_data[key] = pd.DataFrame.sparse.from_spmatrix(test_counts)

### Testing selected methods

In [None]:
def save_preprocessed(X, y, columns, dataset_name, method_name, type, folder_name='preprocessed_raw'):
    """Write the selected feature columns plus the target to a CSV file.

    The file is saved as
    ``./{folder_name}/{dataset_name}_{type}_{method_name}_{number of columns}.csv``
    and keeps the DataFrame index as its first column.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. It is not modified; a copy of the selection is written.
    y : array-like
        Target values, stored in an additional ``TARGET`` column.
    columns : sequence
        Labels of the columns of ``X`` to keep.
    dataset_name, method_name : str
        Name fragments used to build the file name.
    type : str
        Split name used in the file name. The parameter shadows the builtin
        ``type`` but is kept unchanged for caller compatibility.
    folder_name : str, default 'preprocessed_raw'
        Output folder, relative to the working directory. Created if missing.
    """
    import os  # local import: this function is defined before the notebook's `import os` cell

    # Create the folder on demand so saving does not depend on a separate setup cell.
    os.makedirs(f'./{folder_name}', exist_ok=True)

    X1 = X[columns].copy()
    X1['TARGET'] = y
    X1.to_csv(f'./{folder_name}/{dataset_name}_{type}_{method_name}_{len(columns)}.csv')

### Preparing folders

In [None]:
import os

In [None]:
# exist_ok=True keeps this cell re-runnable: without it os.makedirs raises
# FileExistsError as soon as the folders already exist.
for output_folder in ('preprocessed_raw', 'preprocessed_preprocessed_freq_01_MI', 'preprocessed_freq_001'):
    os.makedirs(output_folder, exist_ok=True)

#### Feature Importance based on Random Forest

In [None]:
# Fix NumPy's global random state before the random-forest based selection below.
np.random.seed(0)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
def select_features_from_feature_importance(m, n=5):
    """Return the positions of the ``n`` features with the highest importance.

    Parameters
    ----------
    m : fitted estimator
        Any object exposing a ``feature_importances_`` array.
    n : int, default 5
        Number of features to pick. Values larger than the number of features
        are clamped to it; values <= 0 yield an empty selection.

    Returns
    -------
    numpy.ndarray
        Integer positions of the selected features, in no particular order.
    """
    importances = np.asarray(m.feature_importances_)
    # argpartition raises when asked for more elements than exist, so clamp first.
    n = min(n, importances.shape[0])
    if n <= 0:
        # `[-0:]` would slice the whole array, so the empty case is handled explicitly.
        return np.array([], dtype=np.intp)
    # argpartition only guarantees that the last n positions hold the n largest values.
    return np.argpartition(importances, -n)[-n:]

##### Artificial data

In [None]:
# Forest with default hyper-parameters, fitted on the training split; its
# feature_importances_ drive the selection in the following cells.
rf1 = RandomForestClassifier().fit(artificial_data, artificial_label)

Saving preprocessed dataset with 5 and 10 best features

In [None]:
# Export the 5 most important features; the same selection is used for both splits.
top5_importance_columns = select_features_from_feature_importance(rf1, 5)
for features, labels, split in (
    (artificial_data, artificial_label, 'train'),
    (artificial_data_test, artificial_label_test, 'test'),
):
    save_preprocessed(features, labels, top5_importance_columns,
                      'artificial', 'feature_importance', split)

In [None]:
# Export the 10 most important features; the same selection is used for both splits.
top10_importance_columns = select_features_from_feature_importance(rf1, 10)
for features, labels, split in (
    (artificial_data, artificial_label, 'train'),
    (artificial_data_test, artificial_label_test, 'test'),
):
    save_preprocessed(features, labels, top10_importance_columns,
                      'artificial', 'feature_importance', split)

##### Spam data

Saving preprocessed dataset with 50, 100 and 150 best features

In [None]:
# Output folder for each vocabulary variant; the keys mirror those of `vectorisers`.
# NOTE(review): 'preprocessed_preprocessed_freq_01_MI' looks like a renaming
# accident, but it matches the folder created in the setup cell - confirm before changing.
folder_names = dict([
    ('all', 'preprocessed_raw'),
    ('0.01', 'preprocessed_preprocessed_freq_01_MI'),
    ('0.001', 'preprocessed_freq_001'),
])

In [None]:
# For every vocabulary variant: fit one forest on the training matrix, then export
# the 50, 100 and 150 most important columns of the train and test matrices.
for key, df in spam_train_data.items():
    rf2 = RandomForestClassifier().fit(df, spam_label)

    for n_features in (50, 100, 150):
        # Select once per size so the train and test files share exactly the same columns.
        selected_columns = select_features_from_feature_importance(rf2, n_features)

        save_preprocessed(df, spam_label, selected_columns,
                          'spam', 'feature_importance', 'train', folder_names[key])
        save_preprocessed(spam_test_data[key], spam_label_test, selected_columns,
                          'spam', 'feature_importance', 'test', folder_names[key])

#### Boruta Algorithm

In [None]:
# Reset NumPy's global random state before the Boruta runs below.
np.random.seed(0)

In [None]:
from boruta import BorutaPy

##### Artificial data

In [None]:
# Boruta wrapped around a default random forest; BorutaPy is given a plain NumPy
# array, so the DataFrame is converted first.
boruta_selector = BorutaPy(RandomForestClassifier(), n_estimators='auto', verbose=0)
boruta1 = boruta_selector.fit(np.asarray(artificial_data), artificial_label)

In [None]:
# Column labels picked by the boolean mask boruta1.support_, reused for both splits.
boruta_columns = artificial_data.columns[boruta1.support_]
for features, labels, split in (
    (artificial_data, artificial_label, 'train'),
    (artificial_data_test, artificial_label_test, 'test'),
):
    save_preprocessed(features, labels, boruta_columns, 'artificial', 'boruta', split)

##### Spam data

In [None]:
# For every vocabulary variant: run Boruta on the training matrix and export the
# columns it marks as supported, for both the train and the test matrix.
for key, df in spam_train_data.items():
    boruta2 = BorutaPy(RandomForestClassifier(), n_estimators='auto', verbose=0).fit(np.asarray(df), spam_label)

    # The mask must index the columns of the word-count frame `df`; `spam_data`
    # is the Series of raw messages and has no `.columns` attribute.
    selected_columns = df.columns[boruta2.support_]

    save_preprocessed(df, spam_label, selected_columns,
                      'spam', 'boruta', 'train', folder_names[key])
    save_preprocessed(spam_test_data[key], spam_label_test, selected_columns,
                      'spam', 'boruta', 'test', folder_names[key])

#### MRMR (minimum Redundancy - Maximum Relevance)

In [None]:
from mrmr import mrmr_classif

In [None]:
# Reset NumPy's global random state before the mRMR section.
np.random.seed(0)

##### Artificial data

The algorithm will be used to select the 5 and 10 best features

In [None]:
# mRMR selection of 5 features on the training split; the result is passed on as
# the `columns` argument of save_preprocessed in the next cell.
mrmr1 = mrmr_classif(artificial_data, artificial_label, K=5)

In [None]:
# Write the 5-feature mRMR selection for the train and the test split.
for features, labels, split in (
    (artificial_data, artificial_label, 'train'),
    (artificial_data_test, artificial_label_test, 'test'),
):
    save_preprocessed(features, labels, mrmr1, 'artificial', 'mrmr', split)

In [None]:
# mRMR selection of 10 features on the training split; the result is passed on as
# the `columns` argument of save_preprocessed in the next cell.
mrmr2 = mrmr_classif(artificial_data, artificial_label, K=10)

In [None]:
# Write the 10-feature mRMR selection for the train and the test split.
for features, labels, split in (
    (artificial_data, artificial_label, 'train'),
    (artificial_data_test, artificial_label_test, 'test'),
):
    save_preprocessed(features, labels, mrmr2, 'artificial', 'mrmr', split)

##### Spam data

The algorithm will be used to select 50, 100 and 150 features

In [None]:
# For every vocabulary variant: run mRMR for K = 50, 100 and 150 on the training
# matrix and export each selection for both the train and the test matrix.
for key, df in spam_train_data.items():
    # Densify once per variant instead of once per K; the conversion does not depend on K.
    dense_df = df.sparse.to_dense()

    # The names mrmr3 / mrmr4 / mrmr5 are kept for the K = 50 / 100 / 150 selections.
    mrmr3, mrmr4, mrmr5 = (
        mrmr_classif(dense_df, spam_label, K=n_features)
        for n_features in (50, 100, 150)
    )

    for selected_columns in (mrmr3, mrmr4, mrmr5):
        save_preprocessed(df, spam_label, selected_columns,
                          'spam', 'mrmr', 'train', folder_names[key])
        save_preprocessed(spam_test_data[key], spam_label_test, selected_columns,
                          'spam', 'mrmr', 'test', folder_names[key])

#### Test chi^2

The selected number of features with the highest values of the chi^2 statistic will be kept

In [None]:
# Reset NumPy's global random state before the chi^2 section.
np.random.seed(0)

In [None]:
from sklearn.feature_selection import chi2

In [None]:
def indices_of_n_max(tab, n):
    """Return the positions of the ``n`` largest values of ``tab``.

    Parameters
    ----------
    tab : array-like
        One score per feature. NaN scores are treated as the lowest possible
        value so that undefined scores are never preferred over real ones.
    n : int
        Number of positions to return. Values larger than ``len(tab)`` are
        clamped to it; values <= 0 yield an empty result.

    Returns
    -------
    numpy.ndarray
        Integer positions of the selected values, in no particular order.
    """
    scores = np.asarray(tab)
    if np.issubdtype(scores.dtype, np.floating) and np.isnan(scores).any():
        # NumPy orders NaN after every number, so without this step NaN scores
        # would be reported as the maxima.
        scores = np.where(np.isnan(scores), -np.inf, scores)

    # argpartition raises when asked for more elements than exist, so clamp first.
    n = min(n, scores.shape[0])
    if n <= 0:
        # `[-0:]` would slice the whole array, so the empty case is handled explicitly.
        return np.array([], dtype=np.intp)
    return np.argpartition(scores, -n)[-n:]

##### Artificial data

The algorithm will be used to select the 5 and 10 best features

In [None]:
# chi2 returns a pair; only the first element (the per-feature statistics) is
# kept, the second one is discarded.
statistic_values1, _ = chi2(artificial_data, artificial_label)

In [None]:
# Export the 5 features with the highest chi^2 statistic; one selection for both splits.
top5_chi2_columns = indices_of_n_max(statistic_values1, 5)
for features, labels, split in (
    (artificial_data, artificial_label, 'train'),
    (artificial_data_test, artificial_label_test, 'test'),
):
    save_preprocessed(features, labels, top5_chi2_columns, 'artificial', 'chi2', split)

In [None]:
# Export the 10 features with the highest chi^2 statistic; one selection for both splits.
top10_chi2_columns = indices_of_n_max(statistic_values1, 10)
for features, labels, split in (
    (artificial_data, artificial_label, 'train'),
    (artificial_data_test, artificial_label_test, 'test'),
):
    save_preprocessed(features, labels, top10_chi2_columns, 'artificial', 'chi2', split)

##### Spam data

The algorithm will be used to select 50, 100 and 150 features

In [None]:
# For every vocabulary variant: compute the chi^2 statistic on the training matrix
# and export the 50, 100 and 150 highest-scoring columns of both matrices.
for key, df in spam_train_data.items():
    statistic_values2, _ = chi2(df, spam_label)

    for n_features in (50, 100, 150):
        # Select once per size so the train and test files share exactly the same columns.
        selected_columns = indices_of_n_max(statistic_values2, n_features)

        save_preprocessed(df, spam_label, selected_columns,
                          'spam', 'chi2', 'train', folder_names[key])
        save_preprocessed(spam_test_data[key], spam_label_test, selected_columns,
                          'spam', 'chi2', 'test', folder_names[key])

#### Mutual Information

The selected number of features with the highest mutual information values will be kept

In [None]:
# Reset NumPy's global random state before the mutual-information section.
np.random.seed(0)

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
def indices_of_n_max(tab, n):
    """Return the positions of the ``n`` largest values of ``tab``.

    Note: a helper with the same name is already defined in the chi^2 section of
    this notebook; this cell re-declares it and replaces that definition.

    Parameters
    ----------
    tab : array-like
        One score per feature. NaN scores are treated as the lowest possible
        value so that undefined scores are never preferred over real ones.
    n : int
        Number of positions to return. Values larger than ``len(tab)`` are
        clamped to it; values <= 0 yield an empty result.

    Returns
    -------
    numpy.ndarray
        Integer positions of the selected values, in no particular order.
    """
    scores = np.asarray(tab)
    if np.issubdtype(scores.dtype, np.floating) and np.isnan(scores).any():
        # NumPy orders NaN after every number, so without this step NaN scores
        # would be reported as the maxima.
        scores = np.where(np.isnan(scores), -np.inf, scores)

    # argpartition raises when asked for more elements than exist, so clamp first.
    n = min(n, scores.shape[0])
    if n <= 0:
        # `[-0:]` would slice the whole array, so the empty case is handled explicitly.
        return np.array([], dtype=np.intp)
    return np.argpartition(scores, -n)[-n:]

##### Artificial data

The algorithm will be used to select the 5 and 10 best features

In [None]:
# Mutual-information scores between the training features and the labels; the
# scores are ranked with indices_of_n_max in the following cells.
mi1 = mutual_info_classif(artificial_data, artificial_label)

In [None]:
# Export the 5 features with the highest mutual information; one selection for both splits.
# NOTE(review): every other artificial-data export uses the default output folder,
# while this one writes into 'preprocessed_preprocessed_freq_01_MI' - confirm this is intended.
top5_mi_columns = indices_of_n_max(mi1, 5)
for features, labels, split in (
    (artificial_data, artificial_label, 'train'),
    (artificial_data_test, artificial_label_test, 'test'),
):
    save_preprocessed(features, labels, top5_mi_columns,
                      'artificial', 'mutual_information', split, 'preprocessed_preprocessed_freq_01_MI')

In [None]:
# Export the 10 features with the highest mutual information; one selection for both splits.
# NOTE(review): every other artificial-data export uses the default output folder,
# while this one writes into 'preprocessed_preprocessed_freq_01_MI' - confirm this is intended.
top10_mi_columns = indices_of_n_max(mi1, 10)
for features, labels, split in (
    (artificial_data, artificial_label, 'train'),
    (artificial_data_test, artificial_label_test, 'test'),
):
    save_preprocessed(features, labels, top10_mi_columns,
                      'artificial', 'mutual_information', split, 'preprocessed_preprocessed_freq_01_MI')

##### Spam data

The algorithm will be used to select 50, 100 and 150 features

In [None]:
# For every vocabulary variant: estimate mutual information on the training matrix
# and export the 50, 100 and 150 highest-scoring columns of both matrices.
for key, df in spam_train_data.items():
    mi2 = mutual_info_classif(df, spam_label)

    for n_features in (50, 100, 150):
        # Select once per size so the train and test files share exactly the same columns.
        selected_columns = indices_of_n_max(mi2, n_features)

        save_preprocessed(df, spam_label, selected_columns,
                          'spam', 'mutual_information', 'train', folder_names[key])
        save_preprocessed(spam_test_data[key], spam_label_test, selected_columns,
                          'spam', 'mutual_information', 'test', folder_names[key])