## 1) Importing the Libraries

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

## 2) Data Analysis

In [None]:
# Now, we will practice the methods of Data Preprocessing

# Load the dataset into a DataFrame. The path is an empty placeholder here;
# na_values='?' makes pandas read '?' cells as missing values (NaN).
data = pd.read_csv("", na_values='?')

In [None]:
# Preview the first rows of the loaded dataset
data.head()

In [None]:
# (rows, columns) of the dataset as loaded
data.shape

In [None]:
# Discard exact duplicate rows (the first occurrence of each is kept)
data = data.drop_duplicates()

In [None]:
# (rows, columns) of the dataset at this point, for comparison with the earlier shape
data.shape

In [None]:
# For every column, report the number of distinct values and list them
for col, values in data.items():
  print(f"Column: {col} - {values.nunique()} unique values")
  print(values.unique())
  print("-----------------")

## 3) Checking for missing data

In [None]:
# Count the missing (NaN) entries in each column
data.isnull().sum()

## 4) Before preprocessing the dataset, split the dataset

In [None]:
# The label column is the target; every other column is a feature
y = data['Label_Class_name']
X = data.drop(columns='Label_Class_name')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for testing before any preprocessing is fitted.
# test_size is the fraction of rows placed in the test set (0.25 is
# scikit-learn's default; adjust as needed); random_state=0 makes the
# shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

## 5) Handle missing data

### a) Drop the rows containing '?'

In [None]:
# Put the labels back next to their features, for the training split and for
# the test split, so that rows with missing values can be dropped from
# features and labels together before separating them again
train = pd.concat([X_train, y_train], axis='columns')
test = pd.concat([X_test, y_test], axis='columns')

In [None]:
# Shapes of the merged train and test frames
train.shape, test.shape

In [None]:
# Keep only the rows that have no missing value, in both splits
train, test = train.dropna(), test.dropna()

In [None]:
# Shapes of the train and test frames at this point
train.shape, test.shape

In [None]:
# Split the row-dropped frames back into feature matrices and label vectors

drop_y_train = train['Label_Class_name']
drop_X_train = train.drop(columns='Label_Class_name')

drop_y_test = test['Label_Class_name']
drop_X_test = test.drop(columns='Label_Class_name')

### b) Imputation: Simple Imputation by Sklearn - Mean/Median/Most Frequent

In [None]:
# Categorical and continuous features are handled differently. Separate them

# Choosing categorical features and continuous features 

# 1) Choose manually
# NOTE(review): the empty strings look like placeholders to be replaced with
# the actual column names of the dataset - confirm before running.
categorical_features = ['','','']
continuous_features = ['','','']  

In [None]:
# 2) Choose based on str and int - not foolproof

from pandas.api.types import is_string_dtype

# Treat every string-typed column as categorical, leaving out the label column
# 'Label_Class_name' (the same column that is split off as the target `y`).
# Looking at the dtype alone is only a heuristic - categories stored as
# numbers are not detected - so review the printed list.
categorical_features = [
  col for col in data.columns
  if col != 'Label_Class_name' and is_string_dtype(data[col].dtype)
]

print(categorical_features)

In [None]:
# mean/median/most_frequent/constant
from sklearn.impute import SimpleImputer

In [None]:
# Imputed copies of the train and test features, filled in column by column
si_X_train, si_X_test = pd.DataFrame(), pd.DataFrame()

for col in X_train.columns:
  # Categorical columns get their most frequent value, all other columns the
  # mean (median or constant are the alternatives)
  strategy = 'most_frequent' if col in categorical_features else 'mean'
  si = SimpleImputer(strategy=strategy)
  # The fill value is learned from the training column only and then reused
  # for the test column
  si_X_train[col] = si.fit_transform(X_train[[col]]).flatten()
  si_X_test[col] = si.transform(X_test[[col]]).flatten()

## 6) Conversion of data - after Imputation

### Conversion of Continuous Features

#### a) Binarization 

In [None]:
from sklearn.preprocessing import Binarizer

In [None]:
b_si_X_train = pd.DataFrame()
b_si_X_test = pd.DataFrame()

# applied on imputed dataset

for col in X_train.columns:
  if col not in categorical_features: 
    # selecting the continuous features only
    # Values above the training-set mean of the column become 1, the rest 0.
    # The threshold comes from the training data and is reused for the test
    # data. (Named `binarizer` so the builtin `bin()` is not shadowed.)
    binarizer = Binarizer(threshold=si_X_train[col].mean())
    b_si_X_train[col] = binarizer.fit_transform(si_X_train[[col]]).flatten()
    b_si_X_test[col] = binarizer.transform(si_X_test[[col]]).flatten()
  else:
    # else, just copy the values - no change
    b_si_X_train[col] = si_X_train[col].copy() 
    b_si_X_test[col] = si_X_test[col].copy() 

#### b) KBinsDiscretizer - Fixed Width Binning

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

In [None]:
kb_si_X_train = pd.DataFrame()
kb_si_X_test = pd.DataFrame()

# applied on imputed dataset

for col in X_train.columns:
  if col not in categorical_features:
    # selecting the continuous features only
    # Four equal-width bins, encoded as ordinal bin indices. The bin edges are
    # learned from the training column and reused for the test column.
    # (Named `discretizer` so the builtin `bin()` is not shadowed.)
    discretizer = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform')
    kb_si_X_train[col] = discretizer.fit_transform(si_X_train[[col]]).flatten()
    kb_si_X_test[col] = discretizer.transform(si_X_test[[col]]).flatten()
  else:
    # else, just copy the values - no change
    kb_si_X_train[col] = si_X_train[col].copy()
    kb_si_X_test[col] = si_X_test[col].copy()

### Handle Text features - Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder instance used by the next cell to encode the class labels
le = LabelEncoder()

In [None]:
# Convert the Label Class if needed

# Learn the label -> integer mapping from the training labels only, then
# apply that same mapping to both splits
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [None]:
# NOTE(review): this name differs from `continuous_features` defined in the
# manual-selection cell, and nothing visible here reads it - confirm which
# name is intended. The empty strings look like placeholders.
continuous_feature = ['','','']

# Empty frames that the next cell fills column by column
l_X_train = pd.DataFrame() # Final dataset --> before scaling
l_X_test = pd.DataFrame() # Final dataset --> before scaling

In [None]:
# dataset will be either imputed dataset OR continuous feature converted dataset

# One fitted encoder per categorical column, kept so that the integer codes
# can be mapped back to the original categories later
feature_encoders = {}

for col in X_train.columns:
  if col in categorical_features:
    # or if col not in continuous_features
    # Use a fresh encoder for each column: refitting the shared `le` would
    # overwrite the class-label mapping it learned from y_train.
    feature_le = LabelEncoder()
    l_X_train[col] = feature_le.fit_transform(si_X_train[col])
    # NOTE: transform raises ValueError if the test column contains a
    # category that never occurs in the training column
    l_X_test[col] = feature_le.transform(si_X_test[col])
    feature_encoders[col] = feature_le
  else:
    # else, just copy the value, no change
    l_X_train[col] = si_X_train[col].copy()
    l_X_test[col] = si_X_test[col].copy() 

## 7) Feature Scaling, Then Train, Test and Evaluation --> Template 1