In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [42]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../data/raw_data.csv')

In [None]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

In [43]:
# Remove the 'name' and 'city' columns; every other column is kept.
# NOTE: `data` is reassigned here, so re-running this cell without reloading
# the CSV raises KeyError (the columns are already gone).
data = data.drop(['name', 'city'], axis=1)

In [44]:
# Features: every remaining column except the target.
X = data.drop('loan_approved', axis=1)
# Target: selected with a list of labels so the result is a one-column
# DataFrame (a bare label would give a pandas Series instead).
y = data.loc[:, ['loan_approved']]

In [45]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
# X and y are both DataFrames at this point, so all four outputs are DataFrames.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [None]:
# Encoder for the target: dense (non-sparse) 2-D output, and a feature with exactly
# two categories is reduced to a single 0/1 column.
onehot = OneHotEncoder(drop='if_binary', sparse_output=False)
# Standardizer for the feature columns.
scaler = StandardScaler()

In [47]:
# Make sure both target splits are DataFrames (2-D) before they reach the encoder.
y_train, y_test = map(pd.DataFrame, (y_train, y_test))

In [48]:
# Fit each transformer on the training split only; both calls return NumPy arrays.
X_train = scaler.fit_transform(X_train)
y_train = onehot.fit_transform(y_train)  # encoder needs 2-D input: DataFrame, not Series

# Apply the already-fitted transformers to the held-out split.
X_test = scaler.transform(X_test)
y_test = onehot.transform(y_test)  # encoder needs 2-D input: DataFrame, not Series

In [49]:
# Sanity check: after scaling/encoding, each of the four splits should be a numpy.ndarray.
# The last entry inspects y_test so that every split is actually covered.
type(X_train), type(X_test), type(y_train), type(y_test)

(numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray)

In [50]:
# Display the encoded training target (a 2-D array with a single 0./1. column).
y_train

array([[1.],
       [0.],
       [1.],
       ...,
       [0.],
       [1.],
       [0.]], shape=(1340, 1))

In [51]:
# Wrap the transformed NumPy arrays back into DataFrames.
# Column labels are taken from the original feature frame (X) and target frame (y)
# rather than from a hand-typed list, so they always match the columns that were
# actually scaled/encoded, even if the feature set changes upstream.
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)
y_train = pd.DataFrame(y_train, columns=y.columns)
y_test = pd.DataFrame(y_test, columns=y.columns)

In [52]:
# Sanity check: each of the four splits should now be a pandas DataFrame.
# The last entry inspects y_test so that every split is actually covered.
type(X_train), type(X_test), type(y_train), type(y_test)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame)

In [53]:
# axis=1 concatenates column-wise: feature columns first, then the target column.
# ignore_index is intentionally not passed: combined with axis=1 it would replace
# the column names with 0..n-1 and discard the labels the frames already carry.
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [59]:
# Restore the original column names on the combined frames.
# The labels are built in the same order the frames were concatenated (features,
# then target). Assigning data.columns directly would only be correct if the
# target happened to be the last column of the raw file.
train.columns = [*X.columns, *y.columns]
test.columns = [*X.columns, *y.columns]

In [60]:
# Preview the first five rows of the processed training set.
train.head(n=5)

Unnamed: 0,income,credit_score,loan_amount,years_employed,points,loan_approved
0,-0.868786,1.272426,-0.398622,-1.558591,0.453156,1.0
1,0.630376,-0.139933,0.692489,0.038773,-0.344081,0.0
2,0.416243,-0.016583,-1.200687,0.459133,0.18741,1.0
3,-0.21058,-0.670339,0.908299,-0.633801,-0.875573,0.0
4,-0.437968,1.642477,1.059245,0.206917,0.984648,1.0


In [61]:
# Preview the first five rows of the processed test set.
test.head(n=5)

Unnamed: 0,income,credit_score,loan_amount,years_employed,points,loan_approved
0,1.173612,-0.787522,-0.653463,-0.886017,-0.078336,0.0
1,-1.037615,1.438949,-1.099063,0.038773,1.51614,1.0
2,0.455456,0.760523,0.458796,1.215779,0.984648,1.0
3,-0.457342,0.452148,-1.353833,-0.045298,0.453156,1.0
4,-1.463706,1.401944,-0.944994,0.79542,0.718902,1.0


In [62]:
# Persist both processed splits next to the raw data.
# index=False keeps the DataFrame index out of the CSV, so no extra unnamed column is written.
train.to_csv(index=False, path_or_buf='../data/train.csv')
test.to_csv(index=False, path_or_buf='../data/test.csv')