In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Read the dataset

In [None]:
x_train = pd.read_csv('/content/drive/MyDrive/Project/Santander-value-prediction/Datasets/x_train.csv')
x_test = pd.read_csv('/content/drive/MyDrive/Project/Santander-value-prediction/Datasets/x_test.csv')
y_train = pd.read_csv('/content/drive/MyDrive/Project/Santander-value-prediction/Datasets/y_train.csv')
y_test = pd.read_csv('/content/drive/MyDrive/Project/Santander-value-prediction/Datasets/y_test.csv')

In [None]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3121 entries, 0 to 3120
Columns: 4730 entries, 48df886f9 to 9fc776466
dtypes: float64(1844), int64(2886)
memory usage: 112.6 MB


In [None]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Columns: 4730 entries, 48df886f9 to 9fc776466
dtypes: float64(1844), int64(2886)
memory usage: 48.3 MB


In [None]:
y_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3121 entries, 0 to 3120
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   target  3121 non-null   float64
dtypes: float64(1)
memory usage: 24.5 KB


In [None]:
y_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   target  1338 non-null   float64
dtypes: float64(1)
memory usage: 10.6 KB


## Feature Selection

#### Getter function for mutual information regression

In [None]:
from sklearn.feature_selection import mutual_info_regression

def mutual_info_df(x_train_variable, y_train, variable_type):
  # Compute mutual information between each feature in X and y
  mutual_info = mutual_info_regression(x_train_variable, y_train, random_state=42)

  # Create a DataFrame to store the mutual information scores for each feature
  mi_df = pd.DataFrame({'feature': variable_type, 'mi_score': mutual_info})

  # Sort the features by their mutual information scores in descending order
  mi_df = mi_df.sort_values(by='mi_score', ascending=False)

  # REturn the mutual information scores for each feature
  return mi_df

#### Getter function for iterative mutual information regression

In [None]:
def iterative_mi_final_df(x_train, y_train):
  archive_train = x_train
  master_cols = x_train.columns
  cols_to_drop = []
  final_mf = mutual_info_df(archive_train, y_train, archive_train.columns)
  # Filter out for 0 mutual information feature
  zero_mi_df = final_mf[final_mf["mi_score"] == 0]
  zero_cat = list(zero_mi_df["feature"])
  print(zero_cat)
  cols_to_drop.extend(zero_cat)
  archive_train = archive_train.drop(zero_cat, axis = 1)
  print("Ok")
  # Stopping criteria is there should not be any more 0 mutual information feature
  while len(zero_cat) !=0:
    temp_mf = mutual_info_df(archive_train, y_train, archive_train.columns)
    zero_mi_df = temp_mf[temp_mf["mi_score"] == 0]
    zero_cat = list(zero_mi_df["feature"])
    cols_to_drop.extend(zero_cat)
    archive_train = archive_train.drop(zero_cat, axis = 1)

  return temp_mf, cols_to_drop

#### log x_train and log y_train

In [None]:
# Apply logarithmic transformation (base e)
y_train1 = np.log1p(y_train)
y_test1 = np.log1p(y_test)
x_train_transformed = np.log1p(x_train)
x_test_transformed = np.log1p(x_test)

In [None]:
x_train_transformed

Unnamed: 0,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,dc5a8f1d8,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.948012,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,16.108045,0.0,0.0,0.000000,0.0,0.0,12.793862
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.201805,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.341567,0.0,...,0.000000,0.000000,15.392425,13.910822,0.0,0.0,0.000000,0.0,0.0,0.000000
3117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.993564,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
3118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000
3119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.201805,0.0,...,17.994196,13.617061,0.000000,0.000000,0.0,0.0,15.045151,0.0,0.0,0.000000


In [None]:
final_mf2, cols_to_drop2 = iterative_mi_final_df(x_train_transformed, y_train1)

  y = column_or_1d(y, warn=True)


['f950ac768', 'ed2f00d46', 'b0d770462', '235163358', 'a7e708ce5', 'f51378159', '8e48aed7b', 'aebe1ea16', '9abaeaeba', '8ca717e6d', 'c8ebd62ea', 'bea06dade', 'bfab69d22', '28b21c1d2', '920a04ee2', '955028f58', 'dee843499', '504e4b156', '1d802b493', 'db45da8ac', 'a3aaa5247', '3027b873d', '3694b34c5', 'ba9a5776d', '6f44294b2', 'c18b41ac3', '932b61d77', '3305c8063', 'ef6e31d6b', '2ca23426b', '7e7bf15be', '02dd79dd2', '69e1143e2', '5985f4c31', '295408598', '77da28065', 'cc03b5217', '7bde71e2f', 'bb37ae1db', '66e323ce9', 'c5fa49de1', '4366865e6', '538df95cd', '9ca45f3c8', '4a3c29696', '28a5ad41a', '49131c9e6', '0c9516742', '211314d56', '34d3974de', '87380f99d', '0ce078942', 'fed5f4046', '13f7f9c70', '43ef60caa', 'bbfff6091', 'b5c9b4e39', 'b9e8c09d1', 'a246962f5', '14a5969a6', '7b922ea8b', 'e926d89d3', 'f2af9300f', '18b4fa3f5', '82ba7a053', 'a2d1008bb', '084031585', '9e45b15cd', '36c3157a7', '2a984552e', 'aabb9253a', 'bacadce94', '2b89a6658', '3377a30e2', '96fec9b38', 'b22288a77', 'dc528471e'

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [13]:
len(final_mf2)

688

In [14]:
x_train2 = x_train_transformed.drop(cols_to_drop2, axis = 1)
x_test2 = x_test_transformed.drop(cols_to_drop2, axis = 1)

Export the datasets

In [16]:
final_mf2.to_csv('/content/drive/MyDrive/Project/Santander-value-prediction/Datasets/mf2.csv', index = False)

Export y log instead

In [17]:
# Assuming you have a NumPy array named y_train1 that you want to save
y_train1.to_csv('/content/drive/MyDrive/Project/Santander-value-prediction/Datasets/y_train_log.csv', index = False)
y_test1.to_csv('/content/drive/MyDrive/Project/Santander-value-prediction/Datasets/y_test_log.csv', index = False)

First dataset normal

Last dataset x_train log transformed

In [18]:
x_train2.to_csv('/content/drive/MyDrive/Project/Santander-value-prediction/Datasets/x_train_log.csv', index = False)
x_test2.to_csv('/content/drive/MyDrive/Project/Santander-value-prediction/Datasets/x_test_log.csv', index = False)

In [20]:
train_full = pd.read_csv('/content/drive/MyDrive/Project/Santander-value-prediction/Datasets/train.csv')
y_full = np.log1p(train_full["target"])
train_full_final1 = train_full.drop(["ID","target"], axis = 1)
train_full_final2 = train_full_final1.drop(cols_to_drop2, axis = 1)
train_full_final3 = np.log1p(train_full_final2)

In [21]:
train_full_final3.to_csv('/content/drive/MyDrive/Project/Santander-value-prediction/Datasets/train_log.csv', index = False)
y_full.to_csv('/content/drive/MyDrive/Project/Santander-value-prediction/Datasets/y_full_log.csv', index = False)

In [None]:
test_full = pd.read_csv('/content/drive/MyDrive/Project/Santander-value-prediction/Datasets/test.csv')
test_ID = test_full["ID"]
test_full_final1 = test_full.drop(["ID"], aixs = 1)
test_full_final2 = train_full_final1.drop(cols_to_drop2, axis = 1)
test_full_final3 = np.log1p(test_full_final2)

In [None]:
test_full_final3.to_csv('/content/drive/MyDrive/Project/Santander-value-prediction/Datasets/test_log.csv', index = False)
test_ID.to_csv('/content/drive/MyDrive/Project/Santander-value-prediction/Datasets/test_ID.csv')