# Topics
- Feature selection based on Mutual Information Gain

# Libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectPercentile, SelectKBest
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

# Feature selection using Mutual Information - Classification

### Data

In [2]:
data = pd.read_csv('santander.csv', nrows=20000)

data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


### Variables

In [3]:
X = data.drop('TARGET', axis=1)

y = data['TARGET']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

print('Initial number of features:', X_train.shape[1])

Initial number of features: 370


### Constant and quasi-constant features removal

In [4]:
constant_filter = VarianceThreshold(threshold=0.01)

constant_filter.fit(X_train)

X_train_filtered = constant_filter.transform(X_train)
X_test_filtered = constant_filter.transform(X_test)

print('Number of features after remove constant and quasi-constant features:', X_train_filtered.shape[1])

Number of features after remove constant and quasi-constant features: 255


### Duplicate features removal

In [5]:
X_train_filtered = X_train_filtered.T
X_test_filtered = X_test_filtered.T

X_train_filtered = pd.DataFrame(X_train_filtered)
X_test_filtered = pd.DataFrame(X_test_filtered)

duplicated_features = X_train_filtered.duplicated()

features_to_keep = [not index for index in duplicated_features]

X_train_filtered = X_train_filtered[features_to_keep].T
X_test_filtered = X_test_filtered[features_to_keep].T

print('Number of features after remove duplicate features:', X_train_filtered.shape[1])

Number of features after remove duplicate features: 235


### Mutual information filter

In [6]:
selector = SelectPercentile(mutual_info_classif, percentile=10).fit(X_train_filtered, y_train) # Defining percentile to 10 to keep 10% of features that have the highest scores.

X_train_filtered = selector.transform(X_train_filtered)
X_test_filtered = selector.transform(X_test_filtered)

print('Number of features after mutual information filter:', X_train_filtered.shape[1])

Number of features after mutual information filter: 24


### Model

In [7]:
# Function to create random forest models
def random_forest_model(X_train, X_test, y_train, y_test):
    random_forest_model = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    random_forest_model.fit(X_train, y_train)
    prediction = random_forest_model.predict(X_test)
    print('Model accuracy:', accuracy_score(y_test, prediction))

### Comparing model performance

In [8]:
%%time

print('- Model with feature selection -')

random_forest_model(X_train_filtered, X_test_filtered, y_train, y_test)

- Model with feature selection -
Model accuracy: 0.956
Wall time: 355 ms


In [9]:
%%time

print('- Model without feature selection -')

random_forest_model(X_train, X_test, y_train, y_test)

- Model without feature selection -
Model accuracy: 0.95825
Wall time: 922 ms


# Feature selection using Mutual Information - Regression

### Data

In [10]:
boston_data = load_boston()

print('Data description:')

print(boston_data.DESCR)

Data description:
.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
   

### Variables

In [11]:
X = pd.DataFrame(boston_data.data, columns=boston_data.feature_names)

y = boston_data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print('Initial number of features:', X_train.shape[1])

Initial number of features: 13


### Mutual information filter

In [12]:
selector = SelectKBest(mutual_info_regression, k=9).fit(X_train, y_train) # Defining k to 9 to keep 9 of features that have the highest scores.

X_train_filtered = selector.transform(X_train)
X_test_filtered = selector.transform(X_test)

print('Number of features after mutual information filter:', X_train_filtered.shape[1])

Number of features after mutual information filter: 9


### Model

In [13]:
# Function to create linear regression models
def linear_regression_model(X_train, X_test, y_train, y_test):
    linear_regression_model = LinearRegression()
    linear_regression_model.fit(X_train, y_train)
    prediction = linear_regression_model.predict(X_test)
    print('R^2:', r2_score(y_test, prediction))
    print('RMSE:', np.sqrt(mean_squared_error(y_test, prediction)))
    print('Std of price:', np.std(y))

### Comparing models performance

In [16]:
%%time

print('- Model with feature selection -')

linear_regression_model(X_train_filtered, X_test_filtered, y_train, y_test)

- Model with feature selection -
R^2: 0.6399004317021795
RMSE: 4.503978641858521
Std of price: 9.188011545278203
Wall time: 1e+03 µs


In [17]:
%%time

print('- Model without feature selection -')

linear_regression_model(X_train, X_test, y_train, y_test)

- Model without feature selection -
R^2: 0.6284182823571604
RMSE: 4.575222201735078
Std of price: 9.188011545278203
Wall time: 4 ms
