# Binary classification with Logistic regression 

## 1. Getting data

### 1.1. Import libraries 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sqlalchemy import create_engine
import pymysql
import getpass

In [None]:
# Ask for the MySQL password interactively so it is never stored in the notebook.
password = getpass.getpass(prompt='Password: ')



### 1.2. Connection to sql 

In [None]:
# Connection string for the local MySQL `bank` database (user `root`).
# The password is percent-encoded before being placed in the URL: characters
# such as '@', '/', ':' or '#' would otherwise be parsed as URL delimiters
# and produce a broken connection string.
from urllib.parse import quote_plus

connection_string = 'mysql+pymysql://root:' + quote_plus(password) + '@localhost/bank'
engine = create_engine(connection_string)




### 1.3. Query as a dataframe

In [None]:
# Load the modelling dataset into a DataFrame.
#
# The query keeps loans whose status is 'A' or 'B' and whose disposition type
# is 'OWNER', and returns for each group:
#   - nooftrans           : number of distinct trans_id values joined on account_id
#   - ageindays           : DATEDIFF between 1998-12-31 and the account's `date`
#   - 95unemp / 96unemp   : district columns A12 / A13
#   - crime95 / crime96   : district columns A15 / A16
#   - loanamount          : l.amount
#   - birth_number        : taken from the client table
#   - ratiopaid           : round((amount - payments) / amount, 2)
#
# NOTE: because `dp.type='OWNER'` is tested in the WHERE clause, rows with no
# matching `disp` record are filtered out, so that LEFT JOIN behaves like an
# inner join.
# NOTE(review): confirm that (amount - payments) / amount is the intended
# definition of "ratio paid" -- it depends on what `payments` holds.
df=pd.read_sql_query('''
select l.loan_id, l.status, count(distinct t.trans_id) as nooftrans,
DATEDIFF(19981231, convert(a.date,date)) as ageindays, 
d.A12 as 95unemp, d.A13 as 96unemp, dp.type,
l.amount as loanamount, c.birth_number, d.A15 as crime95, d.A16 as crime96,
round((l.amount-l.payments)/l.amount,2) as ratiopaid
from loan l
left join trans t
using(account_id)
left join account a
using(account_id)
left join district d
on a.district_id = d.A1
left join disp dp
on a.account_id= dp.account_id 
left join client c
using(client_id)
where l.status in('A','B') and dp.type='OWNER'
group by loan_id, l.amount, status, d.A12, d.A13, c.birth_number, d.A15, d.A16, DATEDIFF(19981231, convert(a.date,date)),
 dp.type, round((l.amount-l.payments)/l.amount,2)
''', engine)



#### - verify the dataframe works as expected

In [None]:
# Peek at the first five rows to confirm the query returned what we expect.
df.head(n=5)




## 2. EDA - exploratory data analysis - get to know the data 

### 2.1. Check what columns look like

In [None]:
# Print the column names, non-null counts and dtypes of the DataFrame.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the columns.
df.describe()

In [None]:
df.dtypes
# After inspecting the dtypes we can decide on follow-up steps, e.g. dropping
# loan_id or changing the data type of the birth data.

In [None]:
# List the distinct birth_number values to see how the field is encoded.
df['birth_number'].unique()



### 2.2. Histograms or Boxplots

In [None]:
# Distribution of the number of transactions, coloured by loan status.
sns.displot(data=df, x='nooftrans', hue='status')

In [None]:
# Horizontal bars: loan amount aggregated per loan status.
sns.barplot(data=df, y='status', x='loanamount')

In [None]:
# How many loans fall into each status -- shows the class balance.
df.status.value_counts()

In [None]:
# Distribution of the account age in days.
sns.displot(df.ageindays)

In [None]:
# Overlay the density estimates of the two unemployment columns.
# `fill=True` replaces the deprecated `shade=True`. The labels feed the
# legend, without which the red and blue curves cannot be told apart.
sns.kdeplot(df['95unemp'], fill=True, color="r", label='95unemp')
sns.kdeplot(df['96unemp'], fill=True, color="b", label='96unemp')
plt.xlabel('unemployment rate')
plt.legend()
plt.show()


### 2.3. Check for multicollinearity 

In [None]:
# Multicollinearity does not stop a logistic regression from being fitted,
# but strongly correlated features make its coefficients unstable and hard
# to interpret, so it is good practice to check for it.
# Only numeric columns are correlated: `status` and `type` hold strings, and
# pandas >= 2.0 raises on non-numeric columns instead of silently dropping them.
df.select_dtypes(include=np.number).corr()


In [None]:
# Heatmap of the pairwise Pearson correlations between the numeric columns.
# Non-numeric columns (`status`, `type`) are excluded first because
# pandas >= 2.0 raises on them inside `corr()`.
corr_matrix = df.select_dtypes(include=np.number).corr(method='pearson')
fig, ax = plt.subplots(figsize=(10, 8))
# Mask the upper triangle (diagonal included): it only mirrors the lower one.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
# Draw on the axes created above rather than relying on the implicit current axes.
ax = sns.heatmap(corr_matrix, mask=mask, annot=True, ax=ax)

plt.show()


### 2.4. Clean and wrangling steps 

- Suggested cleaning / wrangling steps:
- bucket any suitable fields into categories
- should we drop any columns? (iterative process)
- extract gender from birth_number
- data type changes
- drop highly correlated features
- create an average of the criminality / unemployment rates
- crime - divide by population
- bring in any missing fields
- change unemployment into HML categories
- change crime numbers into one column - sum the two columns
- bring in the C and D statuses, then use a function to map each status to good or bad
- OR multi-class regression?
- k symbol might be worth including
- loan duration

- feature engineering - take the columns and make them more useful



### 2.5. Split the data into numerical and categorical columns (different cleaning / scaling options for each)


In [None]:
# Remove the identifier column and the constant `type` column in place.
df.drop(columns=['loan_id', 'type'], inplace=True)

In [None]:
# Confirm the two columns are gone.
df.head(n=5)



## 3. Pre processing     

### 3.1. Label / encode categorical columns 

In [None]:
# Isolate the object-dtype columns: these are the categorical features.
cat = df.select_dtypes(include='object')
cat.head(n=5)
 



- we can select the categorical columns, this time assigning a number to each category

In [None]:

# One-hot encode `status`. With drop_first=True the first level is dropped,
# leaving a single indicator column (`status_B`) for the two statuses.
categorical = pd.get_dummies(data=cat, columns=['status'], drop_first=True)
categorical.head(n=5)

In [None]:
from sklearn.preprocessing import Normalizer

In [None]:
# Feature matrix: every numeric column of the DataFrame.
X = df.select_dtypes(include='number')

In [None]:
# NOTE: Normalizer rescales each *row* (sample) to unit norm; it does not put
# the columns on a common scale the way StandardScaler / MinMaxScaler do.
scaler = Normalizer().fit(X)
scaled = scaler.transform(X)
# Carry the column names and the index over to the scaled frame so the
# features stay identifiable and remain aligned with the target by label.
scaled_X = pd.DataFrame(scaled, columns=X.columns, index=X.index)
scaled_X.head()

### 3.2. Split off the dependent variable (label)

In [None]:
# Features (independent variables): the scaled numeric columns.
X = scaled_X

# Target (dependent variable): the `status_B` indicator built from `status`.
y = categorical['status_B']

In [None]:
# Display the target vector to check how it was encoded.
y

### 3.3. Train test split, get LOG REG model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state makes the split
# reproducible. `stratify=y` keeps the status proportions equal in the train
# and test sets, which matters because the classes are highly imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=40, stratify=y
)

## 4. Apply model and train model 

In [None]:
# Fit a logistic regression on the training split.
# `multi_class='ovr'` is dropped: the parameter is deprecated in recent
# scikit-learn releases, and for a binary target with the liblinear solver
# the default already resolves to one-vs-rest, so the model is unchanged.
classification = LogisticRegression(solver='liblinear').fit(X_train, y_train)

### 4.1. Evaluate accuracy and test 

In [None]:
import sklearn.metrics as metrics

# Score the test set. Column 1 of predict_proba holds the probability of the
# second entry of classification.classes_, i.e. the positive class.
probabilities = classification.predict_proba(X_test)
preds = probabilities[:, 1]

# ROC points over all decision thresholds, and the area under that curve.
fpr, tpr, treshold = metrics.roc_curve(y_true=y_test, y_score=preds)
roc_auc = metrics.auc(x=fpr, y=tpr)
print(roc_auc)

#### next steps

+ Visualise the accuracy of the predictions in some ways 

+ also think about - is there something I could do to improve my model accuracy?? 

### 4.2. Visualising accuracy - ROC / AUC 

In [None]:
# ROC curve of the classifier on the test set, with the AUC in the legend.
# The label needs a format specifier: the bare `'AUC' % roc_auc` raises
# "TypeError: not all arguments converted during string formatting".
# The plot was previously pasted twice in this cell; one copy is enough.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, label='AUC = %0.2f' % roc_auc)
# Diagonal reference line: the performance of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.legend(loc='lower right')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.show()


### 4.3. Visualising accuracy - Confusion Matrix

##### definitions 
+ tpr = true positive rate 
+ fpr = false positive rate

In [None]:
from sklearn.metrics import accuracy_score
predictions = classification.predict(X_test)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

In [None]:
accuracy_score(y_test, predictions)
confusion_matrix(y_test, predictions)
plot_confusion_matrix(classification, X_test, y_test)
plt.show()

### 4.4. Data is highly imbalanced

This is affecting the accuracy of our predictions.
- What can be done to resolve that?


+ option 1 - SMOTE 

+ option 2 - TOMEK LINKS 

