# Preprocessing
As we saw in the previous notebook, the dataset is already fairly clean. The only cleanup needed is to drop the same three columns (`RowNumber`, `CustomerId`, `Surname`) that we removed before analyzing the data.

In [1]:
from pathlib import Path

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
# Read the raw churn data and discard the columns flagged as unnecessary.
columns_to_drop = ["RowNumber", "CustomerId", "Surname"]
df = pd.read_csv("../data/raw/churn_modelling.csv").drop(columns=columns_to_drop)

df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Convert every text (object-dtype) column to numbers so the model can consume it.
#   - exactly two distinct values -> a single integer-coded column (same name)
#   - anything else               -> one 0/1 indicator column per category

categorical_features = df.select_dtypes(object)
for name in categorical_features.columns:
    if df[name].nunique() == 2:
        # Binary column: overwrite it in place with its integer codes.
        df[name] = LabelEncoder().fit_transform(df[name])
        continue

    # Multi-valued column: build "<name>_<category>" indicators as ints
    # (0/1 rather than False/True), then swap them in for the original column.
    indicators = pd.get_dummies(df[name], prefix=name, dtype=int)
    df = pd.concat([df.drop(columns=name), indicators], axis=1)

df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1,0,0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1,0,0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1,0,0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0,0,1


Bye characters! Every categorical column is now numeric.

In [4]:
# Rescale the "large" numeric columns (any int64/float64 column whose maximum is
# above 1) towards the [0, 1] range. Columns that are already 0/1 are left alone.

numerical_features = df.select_dtypes(include = ["int64", "float64"])
columns_to_scale = [
    column for column in numerical_features.columns if df[column].max() > 1
]

# Learn each column's min/max from the TRAINING rows only. Fitting the scaler on
# the full frame would let statistics of the held-out rows leak into the
# training features. The test_size and random_state below must stay identical
# to the train/test split performed later in this notebook: with the same row
# count and seed, both calls select exactly the same rows.
train_rows, _ = train_test_split(df, test_size=0.2, random_state=42)

for column in columns_to_scale:
    mms = MinMaxScaler()
    mms.fit(train_rows[[column]])
    # Apply the training min/max to every row. Held-out rows whose value lies
    # beyond the training range can land marginally outside [0, 1]; that is the
    # expected behaviour of a scaler fitted on training data only.
    df[column] = mms.transform(df[[column]])

df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,0.538,0,0.324324,0.2,0.0,0.0,1,1,0.506735,1,1,0,0
1,0.516,0,0.310811,0.1,0.334031,0.0,0,1,0.562709,0,0,0,1
2,0.304,0,0.324324,0.8,0.636357,0.666667,1,0,0.569654,1,1,0,0
3,0.698,0,0.283784,0.1,0.0,0.333333,0,0,0.46912,0,1,0,0
4,1.0,0,0.337838,0.2,0.500246,0.0,1,1,0.3954,0,0,0,1


Perfect, now our values are all between 0 and 1.

In [5]:
# Split the frame into the model inputs (every column except "Exited")
# and the label to predict (the "Exited" column itself).

X, y = df.drop(columns=["Exited"]), df["Exited"]

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible across runs.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Persist the four splits as CSV files (without the index column).
# to_csv does not create missing folders, so make sure the output directory
# exists first; otherwise this cell fails on a fresh checkout.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

X_train.to_csv(processed_dir / "X_train.csv", index=False)
X_test.to_csv(processed_dir / "X_test.csv", index=False)
y_train.to_csv(processed_dir / "y_train.csv", index=False)
y_test.to_csv(processed_dir / "y_test.csv", index=False)