In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Colab-specific helper used to access Google Drive from the notebook.
from google.colab import drive
# Mount Drive at /content/drive; the dataset paths used later start there.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Read the dataset

In [2]:
# Load the training CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/Project/Santander-value-prediction/Datasets/train.csv')

In [3]:
# (rows, columns) of the freshly loaded frame.
df.shape

(4459, 4993)

This is a high-dimensional dataset: the number of columns (4,993) exceeds the number of rows (4,459).

In [4]:
# Print the index range, a dtype breakdown of the columns and the memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Columns: 4993 entries, ID to 9fc776466
dtypes: float64(1845), int64(3147), object(1)
memory usage: 169.9+ MB


## 1) Remove columns with 0 mean and 0 variance.

Helper function that returns the list of columns to prune.

In [5]:
# Returns the list of columns to drop because they carry no variation.
def prune_1(df, exclude=('ID', 'target')):
  """Return the names of columns that hold at most one distinct value.

  A column is flagged when all of its observed (non-NaN) values are
  identical; this covers all-zero columns and any other zero-variance
  column. Counting distinct values is used instead of testing
  ``mean() == 0`` / ``std() == 0`` for two reasons:

  * a column whose values cancel out (e.g. -1, 1, -2, 2) has a mean of
    zero but still varies, so it must not be dropped;
  * ``std()`` is a floating-point computation, so an exact comparison with
    zero can miss a constant column because of rounding error.

  Parameters
  ----------
  df : pandas.DataFrame
    Frame to inspect. It is not modified.
  exclude : iterable of str, optional
    Columns that are never reported. Names that are not present in ``df``
    are ignored. Defaults to ``('ID', 'target')``.

  Returns
  -------
  list
    Names of the flagged columns, in the order they appear in ``df``.
  """
  # errors='ignore' keeps the function usable on frames that lack the
  # identifier/target columns (e.g. an unlabeled test set).
  candidates = df.drop(columns=list(exclude), errors='ignore')

  # nunique() skips NaN by default, so a count <= 1 means the observed
  # values never change (a count of 0 means the column is entirely NaN).
  distinct_counts = candidates.nunique()

  return [col for col, count in distinct_counts.items() if count <= 1]

In [6]:
# Collect the columns flagged by prune_1, then show how many were found.
cols_to_drop1 = prune_1(df)
len(cols_to_drop1)

256

Drop the 256 columns listed in `cols_to_drop1`.

In [7]:
# Build a new frame without the flagged columns; `df` itself is left as loaded.
df1 = df.drop(columns=cols_to_drop1)

In [8]:
# Shape after removing the columns in cols_to_drop1.
df1.shape

(4459, 4737)

## 2) Check for duplicate columns

Helper function that returns the duplicate columns.

In [9]:
def prune_2(df):
    """Return the names of columns whose values repeat an earlier column.

    Columns are visited in order; the first column with a given sequence of
    values is kept and every later column with the same sequence is
    reported. Values are compared with ordinary Python equality, so an
    integer column and a float column holding the same numbers count as
    duplicates.

    Missing entries (NaN/None) are replaced by a single sentinel before the
    comparison. Without this, two identical columns that contain NaN would
    never match, because NaN does not compare equal to NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    list
        Names of the duplicate columns, in the order they appear in ``df``.
    """
    duplicate_cols = []
    seen = set()

    for col in df.columns:
        column = df[col]
        missing = column.isna().to_numpy()

        if missing.any():
            # Normalise every missing entry to None so that equal columns
            # produce equal (and equally hashed) keys.
            col_values = tuple(
                None if is_missing else value
                for value, is_missing in zip(column, missing)
            )
        else:
            # Fast path: nothing to normalise.
            col_values = tuple(column)

        if col_values in seen:
            duplicate_cols.append(col)
        else:
            seen.add(col_values)

    return duplicate_cols

In [10]:
# Find the columns of df1 whose values repeat an earlier column.
duplicate_cols = prune_2(df1)

In [11]:
# Display the names of the duplicate columns.
duplicate_cols

['d60ddde1b', '912836770', 'acc5b709d', 'f8d75792f', 'f333a5f60']

Drop the 5 duplicate columns.

In [12]:
# Build a new frame without the duplicate columns; `df1` is left unchanged.
df2 = df1.drop(columns=duplicate_cols)

In [13]:
# Shape after removing the duplicate columns.
df2.shape

(4459, 4732)

## 3) Split the datasets

Separate the target `y` from the remaining columns.

In [14]:
# Keep the "target" column as y, then build df3 from df2 without the
# "target" and "ID" columns.
y = df2["target"]
df3 = df2.drop(columns=["target", "ID"])

In [15]:
from sklearn.model_selection import train_test_split
# Split the dataset into training and testing sets.
# test_size=0.3 reserves 30% of the rows for the test split, and the fixed
# random_state makes the shuffle (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(df3, y, test_size=0.3,
                                                    random_state=42)

## 4) Export the datasets

In [16]:
import os

In [17]:
# Write the four splits as CSV files into the Drive dataset folder.
# The folder is named once so the four destinations cannot drift apart
# through a copy-paste typo.
datasets_dir = '/content/drive/MyDrive/Project/Santander-value-prediction/Datasets'
splits = {
    'x_train': X_train,
    'x_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for split_name, split in splits.items():
  # index=False: the row index is not written to the file.
  split.to_csv(f'{datasets_dir}/{split_name}.csv', index=False)

In [18]:
# Also write the complete (unsplit) columns and target to the same folder.
# The folder is named once instead of being repeated in every path.
full_dir = '/content/drive/MyDrive/Project/Santander-value-prediction/Datasets'
for file_name, data in (('train_full', df3), ('y_full', y)):
  # index=False: the row index is not written to the file.
  data.to_csv(f'{full_dir}/{file_name}.csv', index=False)