In [1]:
import pandas as pd

In [2]:
# Load the raw training and test tables from CSV files in the working directory.
credit_train, credit_test = (
    pd.read_csv(csv_name) for csv_name in ('credit_train.csv', 'credit_test.csv')
)

In [3]:
# Work on an independent deep copy so `credit_train` itself is never modified.
credit_train_new = credit_train.copy(deep=True)

 ## Remoção das variáveis de identificação (below).

In [4]:
# Remove the two identifier columns; a list is the conventional container for labels.
credit_train_new = credit_train_new.drop(columns=['Loan ID', 'Customer ID'])

  ## Remoção dos registros nulos [NaN] (below)

In [5]:
# Keep only fully populated rows: any row holding at least one NaN is discarded.
credit_train_new = credit_train_new.dropna(axis='index', how='any')

In [6]:
# Count the remaining rows per "Loan Status" value to inspect the class balance.
credit_train_new["Loan Status"].value_counts()

Loan Status
Fully Paid     28972
Charged Off     7451
Name: count, dtype: int64

 ## Convertendo variáveis categóricas em variáveis Dummy (below)

In [7]:
# One indicator column per distinct "Term" value, named after the raw value.
df_term = credit_train_new["Term"].pipe(pd.get_dummies)

In [8]:
# One indicator column per distinct "Years in current job" value, named after the raw value.
df_job = credit_train_new["Years in current job"].pipe(pd.get_dummies)

In [9]:
# One indicator column per distinct "Home Ownership" value, named after the raw value.
df_home = credit_train_new["Home Ownership"].pipe(pd.get_dummies)

In [10]:
# One indicator column per distinct "Purpose" value, named after the raw value.
df_purpose = credit_train_new["Purpose"].pipe(pd.get_dummies)

In [11]:
# Append the four indicator frames column-wise; pandas aligns them on the row index.
credit_train_new = pd.concat(
    objs=[credit_train_new, df_term, df_job, df_home, df_purpose],
    axis='columns',
)

In [12]:
# The original categorical columns are dropped now that their indicator columns exist.
credit_train_new = credit_train_new.drop(
    columns=['Term', 'Years in current job', 'Home Ownership', 'Purpose'],
)

# Separação da Classe (below)

In [13]:
# Expand the target into one indicator column per "Loan Status" value.
df_class = credit_train_new['Loan Status'].pipe(pd.get_dummies)

In [14]:
# Drop the 'Charged Off' indicator so a single target column is left.
df_class = df_class.drop(columns='Charged Off')

In [15]:
# Name the target column 'Status' (True means the loan was 'Fully Paid').
df_class = df_class.rename({'Fully Paid': 'Status'}, axis='columns')

In [16]:
# Attach the boolean 'Status' target as the last column of the feature frame.
credit_train_new = pd.concat(objs=[credit_train_new, df_class], axis='columns')

In [17]:
# The textual target is superseded by the 'Status' column, so remove it.
credit_train_new = credit_train_new.drop('Loan Status', axis='columns')

In [18]:
# Preview the encoded frame: numeric features, indicator columns and the 'Status' target.
credit_train_new

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,...,Other,Take a Trip,major_purchase,moving,other,renewable_energy,small_business,vacation,wedding,Status
2,99999999.0,741.0,2231892.0,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,...,False,False,False,False,False,False,False,False,False,True
6,217646.0,730.0,1184194.0,10855.08,19.6,10.0,13.0,1.0,122170.0,272052.0,...,False,False,False,False,False,False,False,False,False,True
8,548746.0,678.0,2559110.0,18660.28,22.6,33.0,4.0,0.0,437171.0,555038.0,...,False,False,False,False,False,False,False,False,False,True
10,99999999.0,728.0,714628.0,11851.06,16.0,76.0,16.0,0.0,203965.0,289784.0,...,False,False,False,False,False,False,False,False,False,True
12,99999999.0,740.0,776188.0,11578.22,8.5,25.0,6.0,0.0,134083.0,220220.0,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99990,99999999.0,742.0,1190046.0,11969.81,20.1,16.0,9.0,0.0,37392.0,134442.0,...,False,False,False,False,True,False,False,False,False,True
99993,44484.0,717.0,1152426.0,6280.64,21.0,12.0,6.0,0.0,961932.0,0.0,...,False,False,False,False,False,False,True,False,False,True
99994,210584.0,719.0,783389.0,3727.61,17.4,18.0,6.0,0.0,456.0,259160.0,...,True,False,False,False,False,False,False,False,False,True
99996,99999999.0,732.0,1289416.0,13109.05,9.4,21.0,22.0,0.0,153045.0,509234.0,...,False,False,False,False,False,False,False,False,False,True


# Separação de dados de treinamento e dados de teste (below)

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    credit_train_new.drop(columns='Status'),  # every column except the target
    credit_train_new[['Status']],             # target kept as a one-column DataFrame
    test_size=0.3,
    random_state=39,
)

# Normalização dos dados (below)

In [21]:
from sklearn.preprocessing import MinMaxScaler

In [22]:
# Fit the min-max scaler on the training features only, then rescale them.
# The rebuilt DataFrame gets positional integer column labels and a fresh
# 0..n-1 row index, because the feature names are not passed along.
scalar_min_max = MinMaxScaler()

X_train_norm = pd.DataFrame(data=scalar_min_max.fit(X_train).transform(X_train))

In [23]:
# Discard the original row labels so y_train is indexed 0..n-1, like X_train_norm.
y_train = y_train.reset_index(drop=True)

In [24]:
# Preview the training target after its index was reset.
y_train

Unnamed: 0,Status
0,False
1,True
2,True
3,False
4,True
...,...
25491,True
25492,True
25493,True
25494,True


In [25]:
# Preview the min-max scaled training features (columns are labelled by position).
X_train_norm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
0,0.007566,0.016318,0.055721,0.186408,0.197901,0.301136,0.446809,0.0,0.072972,0.001263,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.005768,0.017762,0.048678,0.167208,0.182909,0.250000,0.255319,0.0,0.061358,0.000927,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.002009,0.004332,0.041094,0.065482,0.137931,0.079545,0.276596,0.0,0.005297,0.000186,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.001517,0.022527,0.051123,0.056485,0.245877,0.375000,0.170213,0.0,0.024482,0.000502,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.001124,0.023105,0.049355,0.107783,0.392804,0.039773,0.106383,0.0,0.018544,0.000363,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25491,0.007344,0.013141,0.075268,0.028661,0.268366,0.107955,0.212766,0.0,0.253601,0.006500,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25492,0.003040,0.020072,0.026074,0.070374,0.376312,0.113636,0.340426,0.0,0.043083,0.001042,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25493,0.004134,0.023538,0.025707,0.089502,0.205397,0.289773,0.255319,0.0,0.062353,0.000988,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25494,0.007616,0.020939,0.122112,0.062548,0.283358,0.181818,0.212766,0.0,0.110102,0.001501,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# OVERSAMPLING (Below)

In [26]:
from imblearn.over_sampling import RandomOverSampler

# Balance the training set by randomly duplicating rows of the minority class.
# random_state pins the random draw so a fresh "Restart & Run All" produces the
# same resampled rows; it reuses the seed of the train/test split.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=39)
X_over, y_over = oversample.fit_resample(X_train_norm, y_train)

In [27]:
# Preview the oversampled training features.
X_over

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
0,0.007566,0.016318,0.055721,0.186408,0.197901,0.301136,0.446809,0.000000,0.072972,0.001263,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.005768,0.017762,0.048678,0.167208,0.182909,0.250000,0.255319,0.000000,0.061358,0.000927,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.002009,0.004332,0.041094,0.065482,0.137931,0.079545,0.276596,0.000000,0.005297,0.000186,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.001517,0.022527,0.051123,0.056485,0.245877,0.375000,0.170213,0.000000,0.024482,0.000502,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.001124,0.023105,0.049355,0.107783,0.392804,0.039773,0.106383,0.000000,0.018544,0.000363,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40627,0.000224,0.018051,0.035187,0.090669,0.736132,0.096591,0.106383,0.066667,0.008127,0.000102,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40628,0.003007,0.018484,0.018858,0.045809,0.155922,0.255682,0.127660,0.000000,0.009603,0.000209,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40629,0.002640,0.004332,0.038923,0.043184,0.287856,0.079545,0.276596,0.000000,0.051039,0.000631,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40630,0.004081,0.018051,0.046475,0.131044,0.124438,0.357955,0.361702,0.066667,0.037740,0.000511,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Check that both classes have the same number of rows after oversampling.
y_over.value_counts()

Status
False     20316
True      20316
Name: count, dtype: int64

In [30]:
# Persist the oversampled training features; the row index is written as the first column.
X_over.to_csv('X_train.csv', index=True)

In [34]:
# Persist the oversampled training target; the row index is written as the first column.
y_over.to_csv('y_train.csv', index=True)

In [35]:
# Apply the scaler that was fitted on the training features to the held-out
# features, so both exported feature files share the same scale. The frame is
# rebuilt with positional column labels (as in X_train.csv) and keeps the
# original row labels so it stays aligned with y_test.
X_test_norm = pd.DataFrame(scalar_min_max.transform(X_test), index=X_test.index)

# File name corrected from 'x_test.cvs' so it matches the other exports;
# downstream readers must load 'X_test.csv'.
X_test_norm.to_csv('X_test.csv')
y_test.to_csv('y_test.csv')