In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Read the train and test datasets

In [2]:
# Load the four pre-split CSV files (features and targets for train and test)
# from the mounted Drive folder, in the same order as the names on the left.
x_train, x_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Project/Mercedes-Greener-Manufact/Datasets/x_train.csv',
        '/content/drive/MyDrive/Project/Mercedes-Greener-Manufact/Datasets/x_test.csv',
        '/content/drive/MyDrive/Project/Mercedes-Greener-Manufact/Datasets/y_train.csv',
        '/content/drive/MyDrive/Project/Mercedes-Greener-Manufact/Datasets/y_test.csv',
    )
)

In [3]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2946 entries, 0 to 2945
Columns: 401 entries, ID to X8_y
dtypes: int64(401)
memory usage: 9.0 MB


In [4]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1263 entries, 0 to 1262
Columns: 401 entries, ID to X8_y
dtypes: int64(401)
memory usage: 3.9 MB


In [5]:
y_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2946 entries, 0 to 2945
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   y       2946 non-null   float64
dtypes: float64(1)
memory usage: 23.1 KB


In [6]:
y_test.head(n = 5)

Unnamed: 0,y
0,97.94
1,96.41
2,105.83
3,79.09
4,108.69


## Split into numerical and categorical variables

In [7]:
# 'ID' is the only column handled as a numerical variable.
num_cols = ['ID']
x_train_numerical = x_train[num_cols]

# Get the names of the categorical columns: everything that is not numerical.
# Iterating over x_train.columns (instead of taking a set difference) keeps the
# original column order, so the result is identical on every run.
cat_cols = [col for col in x_train.columns if col not in num_cols]
x_train_cat = x_train[cat_cols]

## Feature Selection

Helper function that computes the mutual information score between each feature and the target

In [8]:
from sklearn.feature_selection import mutual_info_regression

def mutual_info_df(x_train_variable, y_train, variable_type, discrete_features='auto'):
  """Score every feature by its mutual information with the target.

  Parameters
  ----------
  x_train_variable : feature matrix handed to ``mutual_info_regression``.
  y_train : target values. Either one-dimensional (list, Series, array) or a
    single-column DataFrame / 2-D array, which is flattened before use.
  variable_type : the feature names, one per column of ``x_train_variable``
    (despite its name this is the list of column labels, not a type).
  discrete_features : forwarded unchanged to ``mutual_info_regression``.
    The default ``'auto'`` is the library default, so existing calls behave
    as before. NOTE(review): for dense 0/1 indicator columns, passing ``True``
    may be more appropriate — confirm against the scikit-learn documentation.

  Returns
  -------
  DataFrame with the columns ``feature`` and ``mi_score``, sorted by
  ``mi_score`` in descending order. The index still holds each feature's
  original column position.
  """
  # A column-vector target (shape (n, 1)) makes scikit-learn emit a
  # DataConversionWarning; flatten it up front so callers may pass either form.
  target = np.asarray(y_train)
  if target.ndim == 2 and target.shape[1] == 1:
    target = target.ravel()

  # Compute mutual information between each feature in X and y
  mutual_info = mutual_info_regression(
      x_train_variable, target,
      discrete_features=discrete_features, random_state=42)

  # Create a DataFrame to store the mutual information scores for each feature
  mi_df = pd.DataFrame({'feature': list(variable_type), 'mi_score': mutual_info})

  # Return the features sorted by their mutual information scores, best first
  return mi_df.sort_values(by='mi_score', ascending=False)

Helper function that iteratively removes features whose mutual information score is 0

In [9]:
def iterative_mi_final_df(x_train, y_train):
  """Repeatedly drop the features whose mutual information with the target is 0.

  On every pass the mutual information of the remaining columns is recomputed
  with ``mutual_info_df``; columns scoring exactly 0 are removed and the
  process repeats until a pass finds no such column.

  Parameters
  ----------
  x_train : DataFrame of candidate features. It is not modified; columns are
    dropped from a derived frame.
  y_train : target values, forwarded to ``mutual_info_df``.

  Returns
  -------
  (final_mi_df, cols_to_drop)
    ``final_mi_df`` holds the scores from the last pass, i.e. for the surviving
    features only (empty if every column was dropped). ``cols_to_drop`` lists
    all removed column names in the order they were removed.
  """
  remaining = x_train
  cols_to_drop = []
  first_pass = True

  # Stopping criterion: a pass that finds no feature with 0 mutual information.
  while True:
    mi_df = mutual_info_df(remaining, y_train, remaining.columns)
    zero_cat = list(mi_df.loc[mi_df["mi_score"] == 0, "feature"])

    if first_pass:
      # Progress output of the first pass only, kept as in the original cell.
      print(zero_cat)
      print("Ok")
      first_pass = False

    if not zero_cat:
      # Also covers the case where the very first pass finds nothing to drop,
      # which previously failed because the result was only bound in the loop.
      return mi_df, cols_to_drop

    cols_to_drop.extend(zero_cat)
    remaining = remaining.drop(zero_cat, axis = 1)

    if remaining.shape[1] == 0:
      # Nothing left to score: return an empty frame with the same columns
      # instead of calling the estimator on a feature-less matrix.
      return mi_df.iloc[0:0], cols_to_drop

#### Numerical variables

Mutual information gain

In [10]:
# Pass the target as a 1-D Series (y_train["y"]) rather than the whole
# single-column DataFrame, which makes scikit-learn warn about a column vector.
mf1 = mutual_info_df(x_train_numerical, y_train["y"], num_cols)

  y = column_or_1d(y, warn=True)


In [11]:
mf1.tail(n = 10)

Unnamed: 0,feature,mi_score
0,ID,0.014409


#### Categorical variables

###### Mutual information

In [12]:
# Iteratively remove the features whose mutual information with y is 0.
# NOTE(review): the full x_train (which still contains the 'ID' column) is
# scored here, not x_train_cat, although this section is headed
# "Categorical variables" — confirm that this is intended.
final_mf, cols_to_drop = iterative_mi_final_df(x_train, y_train["y"])

['X6_d', 'X6_k', 'X6_g', 'X6_j', 'X152', 'X8_h', 'X361', 'X8_c', 'X342', 'X6_a', 'X326', 'X203', 'X8_m', 'X8_p', 'X226', 'X294', 'X8_u', 'X291', 'X284', 'X8_x', 'X141', 'X82', 'X5_w', 'X2_z', 'X2_x', 'X1_h', 'X32', 'X2_q', 'X2_o', 'X41', 'X2_l', 'X1_r', 'X2_j', 'X2_h', 'X2_a', 'X2_av', 'X2_au', 'X2_at', 'X2_ar', 'X2_y', 'X3_b', 'X140', 'X1_f', 'X139', 'X5_n', 'X5_m', 'X5_l', 'X0_k', 'X5_d', 'X5_c', 'X5_ah', 'X77', 'X5_ae', 'X1_a', 'X5_ac', 'X5_Others', 'X3_g', 'X1_c', 'X1_s']
Ok


In [13]:
final_mf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 316 entries, 63 to 286
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   feature   316 non-null    object 
 1   mi_score  316 non-null    float64
dtypes: float64(1), object(1)
memory usage: 7.4+ KB


In [14]:
final_mf.head(n = 5)

Unnamed: 0,feature,mi_score
63,X127,0.4096
161,X314,0.365214
141,X261,0.345083
72,X136,0.158658
142,X263,0.158498


In [21]:
# final_mf is sorted by descending mi_score, so the first 20 rows are the
# 20 highest-scoring features.
top_20_features = list(final_mf['feature'].iloc[:20])

In [None]:
top_20_features

Check the number of columns to drop

In [15]:
len(cols_to_drop)

85

For the first dataset, drop these 85 columns because their mutual information score is 0.

In [16]:
# Remove the zero-MI columns from both splits so they keep the same features.
# Both names are rebound, so this cell is meant to be run once per kernel session.
x_train = x_train.drop(columns=cols_to_drop)
x_test = x_test.drop(columns=cols_to_drop)

In [17]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2946 entries, 0 to 2945
Columns: 316 entries, ID to X8_y
dtypes: int64(316)
memory usage: 7.1 MB


In [18]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1263 entries, 0 to 1262
Columns: 316 entries, ID to X8_y
dtypes: int64(316)
memory usage: 3.0 MB


For the second dataset, select the top 20 features

In [25]:
# Second dataset: keep only the 20 highest-scoring features, in ranking order.
x_train1 = x_train.loc[:, top_20_features]
x_test1 = x_test.loc[:, top_20_features]

In [27]:
x_train1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2946 entries, 0 to 2945
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   X127    2946 non-null   int64
 1   X314    2946 non-null   int64
 2   X261    2946 non-null   int64
 3   X136    2946 non-null   int64
 4   X263    2946 non-null   int64
 5   X29     2946 non-null   int64
 6   X279    2946 non-null   int64
 7   X76     2946 non-null   int64
 8   X54     2946 non-null   int64
 9   X232    2946 non-null   int64
 10  X313    2946 non-null   int64
 11  X0_az   2946 non-null   int64
 12  X119    2946 non-null   int64
 13  X118    2946 non-null   int64
 14  X328    2946 non-null   int64
 15  X162    2946 non-null   int64
 16  X238    2946 non-null   int64
 17  X276    2946 non-null   int64
 18  X189    2946 non-null   int64
 19  X265    2946 non-null   int64
dtypes: int64(20)
memory usage: 460.4 KB


In [26]:
x_test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1263 entries, 0 to 1262
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   X127    1263 non-null   int64
 1   X314    1263 non-null   int64
 2   X261    1263 non-null   int64
 3   X136    1263 non-null   int64
 4   X263    1263 non-null   int64
 5   X29     1263 non-null   int64
 6   X279    1263 non-null   int64
 7   X76     1263 non-null   int64
 8   X54     1263 non-null   int64
 9   X232    1263 non-null   int64
 10  X313    1263 non-null   int64
 11  X0_az   1263 non-null   int64
 12  X119    1263 non-null   int64
 13  X118    1263 non-null   int64
 14  X328    1263 non-null   int64
 15  X162    1263 non-null   int64
 16  X238    1263 non-null   int64
 17  X276    1263 non-null   int64
 18  X189    1263 non-null   int64
 19  X265    1263 non-null   int64
dtypes: int64(20)
memory usage: 197.5 KB


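For the 3rd dataset, choose the top 16 features

The number of features is guided by the rule of thumb described in this article:
https://towardsdatascience.com/curse-of-dimensionality-a-curse-to-machine-learning-c122ee33bfeb

By that rule, 9 features call for 2^9 * 10 = 5120 samples, while our training dataset has only 2946 rows. Note that the code below keeps the top 16 features, which is more than this rule would suggest; keeping the number of features small is what limits the curse of dimensionality.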

In [45]:
# Take the 16 highest-scoring features from the ranking in final_mf.
# NOTE(review): the markdown for this dataset mentions 9 features, while 16
# are selected here — confirm which number is intended.
top_16_features = list(final_mf['feature'].iloc[:16])

In [46]:
# Third dataset: restrict both splits to the selected top-16 feature columns.
x_train2 = x_train.loc[:, top_16_features]
x_test2 = x_test.loc[:, top_16_features]

## Export the dataset

In [19]:
import os

In [20]:
# First dataset (zero-MI columns removed): write both splits without the index.
x_train.to_csv(path_or_buf='/content/drive/MyDrive/Project/Mercedes-Greener-Manufact/Datasets/x_train_clean.csv', index=False)
x_test.to_csv(path_or_buf='/content/drive/MyDrive/Project/Mercedes-Greener-Manufact/Datasets/x_test_clean.csv', index=False)

In [28]:
# Second dataset (top 20 features): write both splits without the index.
x_train1.to_csv(path_or_buf='/content/drive/MyDrive/Project/Mercedes-Greener-Manufact/Datasets/x_train_clean1.csv', index=False)
x_test1.to_csv(path_or_buf='/content/drive/MyDrive/Project/Mercedes-Greener-Manufact/Datasets/x_test_clean1.csv', index=False)

In [47]:
# Third dataset (top 16 features): write both splits without the index.
x_train2.to_csv(path_or_buf='/content/drive/MyDrive/Project/Mercedes-Greener-Manufact/Datasets/x_train_clean2.csv', index=False)
x_test2.to_csv(path_or_buf='/content/drive/MyDrive/Project/Mercedes-Greener-Manufact/Datasets/x_test_clean2.csv', index=False)