**TO-DO**
* verificar se para todos os valores nulos na operation, todos os k_symbol são interested credited
* verificar se a k_symbol acrescenta informação relevante à operation (sem ser os valores nulos corresponderem ao interested credited), visto que esta apenas especifica mais algumas operações
* agrupar withdrawal in cash e withdrawal do type (testar juntos e separados para ver os resultados)
* retirar coluna account - tem muitos valores a zero (verificar, pois poderá ser devido a um erro) (testar primeiro sem valores a zero e depois sem a coluna)
* retirar coluna bank
* provavelmente retirar a coluna type, pois poderá não acrescentar valor aos dados nada de relevante que já não esteja na coluna operation (testar). Coluna type é uma generalização da operation logo apenas simplifica a informação e não acrescenta nada.

## Table Of Contents <a id="index"></a>


- [Dataset managing](#dataset)  
- [District Data](#district-data)
- [Matrix](#matrix)
  

#### Models
- [**Decision Tree**](#decision-tree)
    - [**Parameter Tuning**](#parameter-tunning)
- [**K-Nearest Neighbor**](#k-nearest-neighbor)
    - [**Parameter Tuning**](#parameter-tunning-2)  
- [**Support-Vector Machines**](#support-vector-machines)
    - [**Parameter Tuning**](#parameter-tunning-3)
- [**Neural Networks**](#neural-networks)
    - [**Parameter Tuning**](#parameter-tunning-4)


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import (
    GridSearchCV,
    GroupShuffleSplit,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

In [None]:
# Markers parsed as NaN when the caller does not say otherwise. A tuple, so
# the default can never be mutated between calls.
DEFAULT_NA_VALUES = ("?", "NA", "")


def dataset(x, missing=DEFAULT_NA_VALUES):
    """Load ``Dataset/<x>.csv`` (semicolon-separated) into a DataFrame.

    Args:
        x: File stem inside the ``Dataset`` directory, e.g. ``"loan_train"``.
        missing: A string, or an iterable of strings, that should be parsed
            as NaN in addition to pandas' built-in markers.

    Returns:
        The parsed frame, with surrounding whitespace stripped from every
        column name.

    Raises:
        FileNotFoundError: If ``Dataset/<x>.csv`` does not exist.
    """
    # `list("NA")` would split a bare string into characters, so wrap it.
    na_values = [missing] if isinstance(missing, str) else list(missing)
    frame = pd.read_csv(
        f"Dataset/{x}.csv",
        sep=";",
        low_memory=False,
        na_values=na_values,
    )
    return frame.rename(columns=str.strip)

In [None]:
# One NaN vocabulary for every file. The train and test halves of a table
# must be parsed identically, so the markers are passed on every call instead
# of only on the train ones.
missing_values = ["?", "NA", ""]

# Tables shared by the train and test sets.
account_data = dataset("account", missing_values)
client_data = dataset("client", missing_values)
disp_data = dataset("disp", missing_values)
district_data = dataset("district", missing_values)

# Tables that come split into a train and a test file.
card_train = dataset("card_train", missing_values)
card_test = dataset("card_test", missing_values)
loan_train = dataset("loan_train", missing_values)
loan_test = dataset("loan_test", missing_values)
trans_train = dataset("trans_train", missing_values)
trans_test = dataset("trans_test", missing_values)

In [None]:
# Drop the district name and region columns.
district_data = district_data.drop(columns=["name", "region"])

# Only the account owner can issue permanent orders and ask for a loan, so
# every other disposition type is discarded. Filtering and dropping happen in
# one expression: an in-place `drop` on a boolean-filtered selection is the
# pattern pandas reports with SettingWithCopyWarning.
# The column is read with [] because `type` is an easy name to shadow when
# accessed as an attribute.
disp_owners = disp_data.loc[disp_data["type"].eq("OWNER")].drop(columns=["type"])

In [None]:
# Derive `age` and `genre` from `birth_number`, then drop the raw column.
#
# `birth_number` is read as YYMMDD. Rows whose month field is above 12 are the
# ones stored with 50 added to the month; they get genre 0, all others genre 1.

# Year against which ages are computed.
REFERENCE_YEAR = 2021

# Left-pad to six characters: a numeric parse drops the leading zero of years
# 00-09, which would shift both slices below by one position.
birth_digits = client_data["birth_number"].astype(str).str.zfill(6)

# Only the size of the month field matters here; the month itself is unused.
month_field = birth_digits.str[2:4].astype("int64")
genre = (month_field <= 12).astype("int64")

# Every two-digit year is interpreted as 19YY.
ages = REFERENCE_YEAR - (1900 + birth_digits.str[:2].astype("int64"))

client_data = client_data.drop(columns=["birth_number"])
client_data["age"] = ages
client_data["genre"] = genre
client_data.head()

In [None]:
# Build the training frame: one row per (loan, transaction of the loan's
# account), enriched with account, district, owner, card and client columns.
# Columns that clash with an earlier table get the suffix of the table that
# is being joined in.
#
# The card join is a LEFT join: cards are optional, so every loan/transaction
# row is kept and card columns are NaN where the owner has no card. An outer
# join additionally creates a row for each card whose disposition has no loan
# row; those rows are NaN in every loan column, which turns the integer id
# columns into floats, and they only disappear again in the client join
# because their `client_id` is NaN.
# NOTE: a left join keeps the row order of `loan_train`, whereas an outer join
# sorts by `disp_id`, so seeded random splits downstream select different rows.
train_data = (
    loan_train
    .merge(trans_train, on="account_id", suffixes=("", "_trans"))
    .merge(account_data, on="account_id", suffixes=("", "_account"))
    .merge(
        district_data.set_index("code"),
        left_on="district_id",
        right_index=True,
        suffixes=("", "_district"),
    )
    .merge(disp_owners, on="account_id", suffixes=("", "_disp"))
    .merge(card_train, on="disp_id", how="left", suffixes=("", "_card"))
    .merge(client_data, on="client_id", suffixes=("", "_client"))
    # The account's district is already present as `district_id`.
    .drop(columns=["district_id_client"])
)
train_data.info()

In [None]:
# Preview the first rows of the merged training frame.
train_data.head()

In [None]:
# Build the competition (test) frame with the same joins as the training
# frame, using the test halves of the loan, transaction and card tables.
#
# The card join is a LEFT join: cards are optional, so every loan/transaction
# row is kept and card columns are NaN where the owner has no card. An outer
# join additionally creates a row for each card whose disposition has no loan
# row; those rows are NaN in every loan column, which turns the integer id
# columns (including `loan_id`) into floats, and they only disappear again in
# the client join because their `client_id` is NaN.
# NOTE: a left join keeps the row order of `loan_test`, whereas an outer join
# sorts by `disp_id`.
test_data = (
    loan_test
    .merge(trans_test, on="account_id", suffixes=("", "_trans"))
    .merge(account_data, on="account_id", suffixes=("", "_account"))
    .merge(
        district_data.set_index("code"),
        left_on="district_id",
        right_index=True,
        suffixes=("", "_district"),
    )
    .merge(disp_owners, on="account_id", suffixes=("", "_disp"))
    .merge(card_test, on="disp_id", how="left", suffixes=("", "_card"))
    .merge(client_data, on="client_id", suffixes=("", "_client"))
    # The account's district is already present as `district_id`.
    .drop(columns=["district_id_client"])
)
test_data.info()


In [None]:
# Preview the first rows of the merged test frame.
test_data.head()

In [None]:
# The loan, transaction and account dates are bare numbers that carry no
# meaning on their own, so they are removed from both frames.
# TODO: consider turning them into an age-like feature instead.
for frame in (train_data, test_data):
    frame.drop(columns=["date", "date_trans", "date_account"], inplace=True)

In [None]:
# Bar chart with the number of null values in each training column.
null_counts = train_data.isnull().sum()
null_counts.plot(kind="bar", figsize=(18, 8), fontsize=14)
plt.ylabel("Null values");

In [None]:
# Null counts of the two '95 district indicators, each followed by a blank line.
for label, column in (
    ("Null values unemploymant rate in '95:", "unemploymant rate '95"),
    ("Null values no. of commited crimes '95 :", "no. of commited crimes '95"),
):
    print(f"{label}{district_data[column].isnull().sum()}")
    print()

[back](#index)
#### District Data <a id="district-data"></a>

In [None]:
# Scatter plot of every pair of district columns. `PairGrid.map` returns the
# grid itself, so the grid can be built and filled in one expression.
district_scatter_plot = sb.PairGrid(district_data).map(plt.scatter)
district_scatter_plot

In [None]:
# Histogram of the districts' '95 unemployment rate on a wide single-axes figure.
fig, ax = plt.subplots(figsize=(40, 6))
ax.set_title(
    "Distribution of district's unemploymant rate in '95",
    fontdict={"fontsize": 14, "fontweight": "bold"},
)
ax.hist(district_data["unemploymant rate '95"])

In [None]:
# Histogram of the districts' '95 number of committed crimes on a wide single-axes figure.
fig, ax = plt.subplots(figsize=(40, 6))
ax.set_title(
    "Distribution of district's no. of commited crimes '95",
    fontdict={"fontsize": 14, "fontweight": "bold"},
)
ax.hist(district_data["no. of commited crimes '95"])

In [None]:
# Fill the nulls of the two '95 district indicators: the unemployment rate
# with the median and the number of crimes with the mean.
# TODO: check whether median or mean works better for each column.
#
# Both fill values are computed on the training frame only and then applied
# to the training AND the test frame, so the two are imputed identically and
# nothing from the test set leaks into the statistic.
unemployment_fill = train_data["unemploymant rate '95"].median()
crimes_fill = train_data["no. of commited crimes '95"].mean()

# The filled column is assigned back explicitly. `fillna(..., inplace=True)`
# on `frame[column]` fills the Series returned by the column lookup; pandas
# warns about that chained form and, with Copy-on-Write enabled, the parent
# frame is left unchanged.
for frame in (train_data, test_data):
    frame["unemploymant rate '95"] = frame["unemploymant rate '95"].fillna(unemployment_fill)
    frame["no. of commited crimes '95"] = frame["no. of commited crimes '95"].fillna(crimes_fill)

In [None]:
# For each categorical column: value frequencies, null count, blank line.
for column in ("operation", "k_symbol", "bank", "type_card", "type"):
    print(train_data[column].value_counts())
    print(f"Null values: {train_data[column].isnull().sum()}")
    print()

In [None]:
# Training rows whose `operation` is missing, and a preview of them.
operations_null = train_data.loc[train_data["operation"].isna()]
operations_null.head()

In [None]:
# Frequencies of `k_symbol` among the rows whose `operation` is null.
operations_null.k_symbol.value_counts()

All rows with a null `operation` have 'interested credited' in the `k_symbol` column.

In [None]:
# Column dtypes and non-null counts of the training frame at this point.
train_data.info()

[back](#index)
#### Dataset Managing <a id="dataset"></a>

In [None]:
# Experiment switch for how the transaction descriptors are prepared:
#   1: drop `operation`
#   2: as 1, then replace 'withdrawal in cash' with 'withdrawal' everywhere
#   3: fill missing `operation` values from `k_symbol`
#   4: merge `operation` and `k_symbol` into one de-duplicated phrase
#   5: as 4, and also drop `type`
# Every option drops `bank`, `k_symbol`, `account` and `issued`.
# TODO: the choice of option still needs to be checked.
option = 1

_VALID_OPTIONS = (1, 2, 3, 4, 5)


def _unique_words(text):
    """Remove repeated words from *text*, keeping first occurrences in order."""
    if not isinstance(text, str):
        # NaN survives when both source columns were empty for the row.
        return text
    return " ".join(dict.fromkeys(text.split()))


def _prepare_for_option(frame, option):
    """Return a copy of *frame* prepared for the given experiment option.

    Args:
        frame: Merged loan/transaction frame that still has the `bank`,
            `k_symbol`, `account`, `operation`, `issued` and `type` columns.
        option: One of the experiment numbers listed above.

    Returns:
        A new DataFrame; *frame* itself is not modified.

    Raises:
        ValueError: If *option* is not a known experiment number.
    """
    if option not in _VALID_OPTIONS:
        raise ValueError(f"option must be one of {_VALID_OPTIONS}, got {option!r}")

    frame = frame.copy()
    discarded = ["bank", "k_symbol", "account", "issued"]

    if option in (1, 2):
        discarded.append("operation")
    else:
        # Assign the result back: an in-place fill on `frame[column]` only
        # touches the Series returned by the lookup.
        frame["operation"] = frame["operation"].fillna(frame["k_symbol"])
        if option in (4, 5):
            descriptor = frame["k_symbol"].fillna(frame["operation"])
            combined = frame["operation"] + " " + descriptor
            frame["operation"] = combined.map(_unique_words)
        if option == 5:
            discarded.append("type")

    frame = frame.drop(columns=discarded)
    if option == 2:
        frame = frame.replace("withdrawal in cash", "withdrawal")
    return frame


# The same preparation is applied to both frames so they cannot drift apart.
train_data = _prepare_for_option(train_data, option)
test_data = _prepare_for_option(test_data, option)
    

In [None]:
# One-hot encode the categorical columns that are still present.
categorical_columns = ["frequency", "type_card"]
if option != 5:
    # Option 5 has already dropped `type`.
    categorical_columns.append("type")
if option not in (1, 2):
    # Options 1 and 2 dropped `operation`; encoding it for them raises
    # KeyError. Options 3-5 still carry it as text, so it is encoded here.
    categorical_columns.append("operation")

# Dummy columns are appended in the order of `categorical_columns`.
train_data = pd.get_dummies(train_data, columns=categorical_columns, dtype=bool)
test_data = pd.get_dummies(test_data, columns=categorical_columns, dtype=bool)

# Keep a single row (the first) per loan in the competition frame.
# NOTE(review): only `test_data` is collapsed to one row per loan; `train_data`
# is not de-duplicated here — confirm that this asymmetry is intended.
test_data = test_data.drop_duplicates(subset=["loan_id"], keep="first")

train_data.info()

In [None]:
# Preview the training frame after one-hot encoding.
train_data.head()

In [None]:
# The test frame's `status` column is removed. The loan ids are kept aside
# and the competition inputs are the remaining columns without `loan_id`.
test_data = test_data.drop(columns="status")
all_ids_comp = test_data["loan_id"].to_numpy()
competition_inputs = test_data.drop(columns="loan_id")

[back](#index)
#### Matrix <a id="matrix"></a>

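`train_data_w` / `test_data_w`: the train/test data in which 'withdrawal in cash' is replaced by 'withdrawal' (in this notebook that variant is produced in place on `train_data` / `test_data` by `option = 2`).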

In [None]:
# Remove the identifier columns; `loan_id` stays in the frame at this point.
id_columns = ["client_id", "account_id", "district_id", "disp_id", "card_id", "trans_id"]
train_data_no_ids = train_data.drop(columns=id_columns)

# Absolute pairwise correlations, shown as an annotated heatmap.
corr_matrix = train_data_no_ids.corr().abs()
plt.figure(figsize=(20, 6))
sb.heatmap(corr_matrix, annot=True)

# Keep only the cells strictly above the diagonal so that every pair is
# looked at once; all other cells become NaN.
above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(above_diagonal)

# A column is dropped when it correlates above 0.95 with a column to its left.
to_drop = upper.columns[(upper > 0.95).any()].tolist()
train_data_no_ids = train_data_no_ids.drop(columns=to_drop)

print(f"{len(to_drop)} Dropped columns: {to_drop}")

In [None]:
# Preview the training frame after the id and correlated columns were removed.
train_data_no_ids.head()

In [None]:
# Number of training rows per `status` value, before any resampling.
status_counts = train_data_no_ids["status"].value_counts()
print(status_counts)

In [None]:
# Column dtypes and non-null counts of the pruned training frame.
train_data_no_ids.info()

In [None]:
# Preview the pruned training frame.
train_data_no_ids.head()

In [None]:
# Balance the two `status` classes. Rows with any NaN are removed first; the
# status 1 rows are then re-drawn with replacement until there are as many of
# them as status -1 rows, and both groups are stacked again.
# NOTE(review): the names assume status 1 is the smaller class. If it is the
# larger one, this call down-samples it with replacement — confirm against
# the printed status counts.
# NOTE(review): the resampling runs on the whole frame, before any train/test
# split; whatever splits the data afterwards has to keep the copies of a row
# on the same side.
train_data_no_ids = train_data_no_ids.dropna()
default_ind_no = train_data_no_ids[train_data_no_ids["status"].eq(-1)]
default_ind_yes = train_data_no_ids[train_data_no_ids["status"].eq(1)]

df_minority_upsampled = resample(
    default_ind_yes,
    replace=True,
    n_samples=default_ind_no.shape[0],
    random_state=123,
)

train_data_no_ids = pd.concat([default_ind_no, df_minority_upsampled])

train_data_no_ids["status"].value_counts()


In [None]:
# Training inputs: every remaining column except the loan identifier.
# NOTE: `all_inputs` still contains the `status` target at this point; it has
# to be removed before the frame is handed to an estimator.
all_inputs = train_data_no_ids.drop(columns=["loan_id"])
all_labels = train_data_no_ids["status"].to_numpy()

# The competition frame must expose exactly the feature columns used for
# training, in the same order. Dropping only `loan_id` from `test_data` keeps
# the other id columns and the correlation-pruned columns, none of which the
# training features have. The test frame goes through the same joins and
# drops as the training frame, so the only columns that can be absent from it
# are one-hot indicators of categories the test set never shows; those are
# filled with False.
feature_columns = all_inputs.columns.drop("status")
competition_inputs = test_data.reindex(columns=feature_columns, fill_value=False)
all_ids_comp = test_data["loan_id"].to_numpy()

In [None]:
# Class balance of the rows in `all_inputs`; this relies on `all_inputs` still
# containing the `status` column.
all_inputs.status.value_counts()

In [None]:
# Hold out 25% of the loans for evaluation.
#
# Two guards against leakage:
#   * `all_inputs` may still contain the `status` target, so it is removed
#     from the features here (a no-op if it is already gone).
#   * The split is done per loan, not per row. A loan contributes one row per
#     transaction of its account, plus any resampled copies, and a row-wise
#     split would put rows of the same loan on both sides.
# `train_data_no_ids` is the frame `all_inputs` was taken from, so its
# `loan_id` column lines up row by row with the features and labels.
model_inputs = all_inputs.drop(columns=["status"], errors="ignore")
loan_groups = train_data_no_ids["loan_id"].to_numpy()

splitter = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=1)
train_rows, test_rows = next(
    splitter.split(model_inputs, all_labels, groups=loan_groups)
)

X_train, X_test = model_inputs.iloc[train_rows], model_inputs.iloc[test_rows]
y_train, y_test = all_labels[train_rows], all_labels[test_rows]

In [None]:
# Columns and dtypes of the training partition that the models are fitted on.
X_train.info()

#### [back](#index)
## Decision Tree <a class="anchor" id="decision-tree"></a>

In [None]:
# Baseline decision tree with scikit-learn's default hyper-parameters,
# fitted on the training partition and evaluated on the held-out one.
decision_tree_classifier = DecisionTreeClassifier().fit(X_train, y_train)
dtc_prediction = decision_tree_classifier.predict(X_test)

# Keep the report as a dict and print its text form.
dtc_classification_report = classification_report(y_test, dtc_prediction, output_dict=True)
print("Classification report:", classification_report(y_test, dtc_prediction), "", sep="\n")

# Annotated confusion-matrix heatmap.
sb.set(font_scale=1.0)
confusion_matrix_dtc = confusion_matrix(y_test, dtc_prediction)
ax = sb.heatmap(confusion_matrix_dtc, annot=True, fmt="g", ax=plt.subplot())
ax.set(xlabel="Predicted Grades", ylabel="Observed Grades", title="Confusion Matrix")
plt.show()


### Parameter Tuning <a class="anchor" id="parameter-tunning"></a>

In [None]:
# Exhaustive search over the decision-tree hyper-parameters with 10-fold
# cross-validation on the training partition, using all available cores.
# NOTE(review): `max_features` goes up to 19 — confirm that X_train has at
# least that many columns.
parameter_grid = dict(
    criterion=["gini", "entropy"],
    splitter=["best", "random"],
    max_depth=range(10, 20),
    max_features=range(10, 20),
)

grid_search = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid=parameter_grid,
    cv=10,
    verbose=4,
    n_jobs=-1,
).fit(X_train, y_train)

for label, value in (
    ("score", grid_search.best_score_),
    ("parameters", grid_search.best_params_),
    ("estimator", grid_search.best_estimator_),
):
    print(f"Best {label}: {value}")

In [None]:
# Evaluate the best decision tree found by the grid search on the held-out
# partition.
decision_tree_classifier = grid_search.best_estimator_
dtc_prediction = decision_tree_classifier.predict(X_test)

# Keep the report as a dict and print its text form.
best_dtc_classification_report = classification_report(y_test, dtc_prediction, output_dict=True)

print("--- Improved model ---\n")
print("Classification report:", classification_report(y_test, dtc_prediction), "", sep="\n")

# Annotated confusion-matrix heatmap.
sb.set(font_scale=1.0)
confusion_matrix_dtc = confusion_matrix(y_test, dtc_prediction)
ax = sb.heatmap(confusion_matrix_dtc, annot=True, fmt="g", ax=plt.subplot())
ax.set(xlabel="Predicted Grades", ylabel="Observed Grades", title="Confusion Matrix")
plt.show()

#### [back](#index)
## K-Nearest Neighbor <a class="anchor" id="k-nearest-neighbor"></a>

In [None]:
# Baseline k-nearest-neighbours classifier with default hyper-parameters.
knn = KNeighborsClassifier().fit(X_train, y_train)
knn_prediction = knn.predict(X_test)

# Keep the report as a dict; the printed report is restricted to the labels
# that occur in the training partition.
knn_classification_report = classification_report(y_test, knn_prediction, output_dict=True)
print(
    "Classification report:",
    classification_report(y_test, knn_prediction, labels=np.unique(y_train)),
    "",
    sep="\n",
)

# Annotated confusion-matrix heatmap.
sb.set(font_scale=1.0)
confusion_matrix_knn = confusion_matrix(y_test, knn_prediction)
ax = sb.heatmap(confusion_matrix_knn, annot=True, fmt="g", ax=plt.subplot())
ax.set(xlabel="Predicted Grades", ylabel="Observed Grades", title="Confusion Matrix")
plt.show()

### Parameter Tuning <a class="anchor" id="parameter-tunning-2"></a>

In [None]:
# Exhaustive search over the KNN hyper-parameters with 10-fold
# cross-validation, ranked by weighted precision and run on three workers.
parameter_grid = dict(
    n_neighbors=[5, 10, 15, 20],
    weights=["uniform", "distance"],
    algorithm=["ball_tree", "kd_tree", "brute"],
)

grid_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid=parameter_grid,
    scoring="precision_weighted",
    cv=10,
    n_jobs=3,
    verbose=4,
).fit(X_train, y_train)

for label, value in (
    ("score", grid_search.best_score_),
    ("parameters", grid_search.best_params_),
    ("estimator", grid_search.best_estimator_),
):
    print(f"Best {label}: {value}")

In [None]:
# Evaluate the best KNN model found by the grid search on the held-out
# partition.
knn = grid_search.best_estimator_
yk_pred = knn.predict(X_test)

# Keep the report as a dict and print its text form.
best_knn_classification_report = classification_report(y_test, yk_pred, output_dict=True)

print("--- Improved model ---\n")
print("Classification report:", classification_report(y_test, yk_pred), "", sep="\n")

# Annotated confusion-matrix heatmap.
sb.set(font_scale=1.0)
confusion_matrix_knn = confusion_matrix(y_test, yk_pred)
ax = sb.heatmap(confusion_matrix_knn, annot=True, fmt="g", ax=plt.subplot())
ax.set(xlabel="Predicted Grades", ylabel="Observed Grades", title="Confusion Matrix")
plt.show()

#### [back](#index)
## Support-Vector Machines <a class="anchor" id="support-vector-machines"></a>

In [None]:
# Baseline support-vector classifier with default hyper-parameters.
svc = SVC().fit(X_train, y_train)
svc_prediction = svc.predict(X_test)

# Keep the report as a dict and print its text form.
svm_classification_report = classification_report(y_test, svc_prediction, output_dict=True)
print("Classification report:", classification_report(y_test, svc_prediction), "", sep="\n")

# Annotated confusion-matrix heatmap.
sb.set(font_scale=1.0)
confusion_matrix_svm = confusion_matrix(y_test, svc_prediction)
ax = sb.heatmap(confusion_matrix_svm, annot=True, fmt="g", ax=plt.subplot())
ax.set(xlabel="Predicted Grades", ylabel="Observed Grades", title="Confusion Matrix")
plt.show()

# The baseline report also serves as the "best" report until a tuned model
# overwrites it.
best_svm_classification_report = svm_classification_report

### Parameter Tuning <a class="anchor" id="parameter-tunning-3"></a>

In [None]:
# Exhaustive search over the SVM regularisation strength and kernel with
# 10-fold cross-validation, run on four workers.
parameter_grid = dict(
    C=[0.1, 1, 10],
    kernel=["linear", "poly", "rbf", "sigmoid"],
)

grid_search = GridSearchCV(
    SVC(),
    param_grid=parameter_grid,
    cv=10,
    verbose=4,
    n_jobs=4,
).fit(X_train, y_train)

for label, value in (
    ("score", grid_search.best_score_),
    ("parameters", grid_search.best_params_),
    ("estimator", grid_search.best_estimator_),
):
    print(f"Best {label}: {value}")

In [None]:
# Evaluate the best SVM found by the grid search on the held-out partition.
svc = grid_search.best_estimator_
yk_pred = svc.predict(X_test)

# Keep the report as a dict.
best_svm_classification_report = classification_report(y_test, yk_pred, output_dict=True)

print("--- Improved model ---\n")
# The text report comes from `classification_report` itself. The stored
# `best_*_classification_report` objects are dicts, and calling one of them
# raises TypeError.
print(f"Classification report:\n{classification_report(y_test, yk_pred)}\n")

# Annotated confusion-matrix heatmap.
sb.set(font_scale=1.0)

ax = plt.subplot()

confusion_matrix_svm = confusion_matrix(y_test, yk_pred)

sb.heatmap(confusion_matrix_svm, annot=True, ax=ax, fmt="g")

ax.set_xlabel("Predicted Grades")
ax.set_ylabel("Observed Grades")
ax.set_title("Confusion Matrix")
plt.show()

#### [back](#index)
## Neural Networks <a class="anchor" id="neural-networks"></a>

In [None]:
# Standardise the features with statistics taken from the training partition
# only, then apply the same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_nn = scaler.transform(X_train)
X_test_nn = scaler.transform(X_test)

# Baseline multi-layer perceptron, fitted and evaluated on the scaled data.
ANNClassifier = MLPClassifier(random_state=1, max_iter=500).fit(X_train_nn, y_train)
predictions = ANNClassifier.predict(X_test_nn)

# Keep the report as a dict and print its text form.
nn_classification_report = classification_report(y_test, predictions, output_dict=True)
print(classification_report(y_test, predictions))

# Annotated confusion-matrix heatmap.
sb.set(font_scale=1.0)
confusion_matrix_ann = confusion_matrix(y_test, predictions)
ax = sb.heatmap(confusion_matrix_ann, annot=True, fmt="g", ax=plt.subplot())
ax.set(xlabel="Predicted Grades", ylabel="Observed Grades", title="Confusion Matrix")
plt.show()

# The baseline report also serves as the "best" report until a tuned model
# overwrites it.
best_nn_classification_report = nn_classification_report

### Parameter Tuning <a class="anchor" id="parameter-tunning-4"></a>

In [None]:
# Exhaustive search over the MLP activation, solver and hidden-layer size.
#
# The scaler is part of the searched estimator: every cross-validation fold is
# standardised with the statistics of its own training part, and the winning
# estimator accepts the same unscaled frames as the other models. Fitting a
# bare MLP on the unscaled `X_train` would disagree with the baseline network,
# which is trained on standardised inputs.
ann_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    # Same settings as the baseline network. `verbose` is a fixed setting, so
    # it lives on the estimator rather than in the grid.
    ("mlp", MLPClassifier(random_state=1, max_iter=500, verbose=True)),
])

# Pipeline parameters are addressed as `<step name>__<parameter>`.
parameter_grid = {
    "mlp__activation": ["tanh", "identity", "logistic", "relu"],
    "mlp__solver": ["adam", "lbfgs", "sgd"],
    "mlp__hidden_layer_sizes": [3, 5, 8, 13, 21, 34],
}

# Seeded so that the shuffled folds, and therefore the scores, are
# reproducible between runs.
cross_validation = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

grid_search = GridSearchCV(
    ann_pipeline,
    param_grid=parameter_grid,
    cv=cross_validation,
)

grid_search.fit(X_train, y_train)

print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

In [None]:
# Evaluate the best network found by the grid search on the held-out
# partition. `X_test` is passed in the same form as the `X_train` that was
# given to the grid search.
ANNClassifier = grid_search.best_estimator_
yk_pred = ANNClassifier.predict(X_test)

# Keep the report as a dict.
best_nn_classification_report = classification_report(y_test, yk_pred, output_dict=True)

print("--- Improved model ---\n")
# The text report comes from `classification_report` itself. The dict stored
# just above is not callable, and calling it raises TypeError.
print(f"Classification report:\n{classification_report(y_test, yk_pred)}\n")

# Annotated confusion-matrix heatmap.
sb.set(font_scale=1.0)

ax = plt.subplot()

confusion_matrix_ann = confusion_matrix(y_test, yk_pred)

sb.heatmap(confusion_matrix_ann, annot=True, ax=ax, fmt="g")

ax.set_xlabel("Predicted Grades")
ax.set_ylabel("Observed Grades")
ax.set_title("Confusion Matrix")
plt.show()