**Run the following two cells before you begin.**

In [28]:
%autosave 10

Autosaving every 10 seconds


In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import math
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


______________________________________________________________________
**First, import your data set and define the sigmoid function.**
<details>
    <summary>Hint:</summary>
    The definition of the sigmoid is $f(x) = \frac{1}{1 + e^{-X}}$.
</details>

In [30]:
# Import the data set
cleaned_data=pd.read_csv(r"C:\Users\ekta hinduja\Downloads\Data_Exploration_and_Cleaning\cleaned_data.csv")



In [31]:
# Define the sigmoid function
def sig(x):
    s=1/(1+np.exp(-x))
    return s


**Now, create a train/test split (80/20) with `PAY_1` and `LIMIT_BAL` as features and `default payment next month` as values. Use a random state of 24.**

In [32]:
# Create a train/test split
features=['PAY_1','LIMIT_BAL']
X=cleaned_data[features]
y=cleaned_data['default payment next month']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=24)


______________________________________________________________________
**Next, import LogisticRegression, with the default options, but set the solver to `'liblinear'`.**

In [33]:
logreg = LogisticRegression(solver='liblinear')


______________________________________________________________________
**Now, train on the training data and obtain predicted classes, as well as class probabilities, using the testing data.**

In [34]:
# Fit the logistic regression model on training data
logreg.fit(X_train, y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [35]:
# Make predictions using `.predict()`
y_pred = logreg.predict(X_test)



In [36]:
# Find class probabilities using `.predict_proba()`
y_pred_proba = logreg.predict_proba(X_test)[::,1]
y_pred_proba


array([0.25173076, 0.415703  , 0.20395547, ..., 0.415703  , 0.17278502,
       0.33606565])

______________________________________________________________________
**Then, pull out the coefficients and intercept from the trained model and manually calculate predicted probabilities. You'll need to add a column of 1s to your features, to multiply by the intercept.**

In [37]:

cleaned_data.shape


(26664, 30)

In [44]:
#Add column of ones
ones=[]
i=0
for i in range(26664):
    ones.append(1)
cleaned_data['ones']=ones
cleaned_data.ones


0        1
1        1
2        1
3        1
4        1
        ..
26659    1
26660    1
26661    1
26662    1
26663    1
Name: ones, Length: 26664, dtype: int64

In [45]:
# Get coefficients and intercepts from trained model
print(logreg.coef_)
print(logreg.intercept_)



[[ 8.27451187e-11 -6.80876727e-06]]
[-6.57647457e-11]


In [46]:
# Manually calculate predicted probabilities
p=(8.27451187e-11)*cleaned_data['PAY_1']+(-6.80876727e-06)*cleaned_data['LIMIT_BAL']+(-6.57647457e-11)*cleaned_data['ones']
sig(p)


0        0.466009
1        0.306390
2        0.351423
3        0.415703
4        0.415703
           ...   
26659    0.182735
26660    0.264771
26661    0.449111
26662    0.367095
26663    0.415703
Length: 26664, dtype: float64

______________________________________________________________________
**Next, using a threshold of `0.5`, manually calculate predicted classes. Compare this to the class predictions output by scikit-learn.**

In [47]:
# Manually calculate predicted classes
m_pred=[]
for i in sig(p):
    if(i >= 0.5):
        m_pred.append(1)
    else:
        m_pred.append(0)
m_pred



[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [48]:
# Compare to scikit-learn's predicted classes
y_pred


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

______________________________________________________________________
**Finally, calculate ROC AUC using both scikit-learn's predicted probabilities, and your manually predicted probabilities, and compare.**

In [49]:
# Use scikit-learn's predicted probabilities to calculate ROC AUC
from sklearn import metrics
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
auc


0.627207450280691

In [50]:
# Use manually calculated predicted probabilities to calculate ROC AUC
from sklearn import metrics
try:
    auc2=metrics.roc_auc_score(m_pred,sig(p))
    auc2
except ValueError:
    pass