In [4]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# import logistic_regression_util

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

#### Logistic Regression
- Fundamentals: 
 https://docs.google.com/presentation/d/1AzgB6opDhEuAdBHZS8GRbBV6BtQCqb9JSAElM4-H6nk/edit?usp=sharing
- logistic regression in sklearn

Pros and Cons

In [1]:
from pydataset import data

# Load the iris dataset through pydataset and preview the first rows
df = data('iris')
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [2]:
# Normalize column names: lowercase, with '.' replaced by '_'
df.columns = df.columns.str.lower().str.replace('.', '_', regex=False)
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [5]:
# One-hot encode species. With drop_first=True the first category (setosa)
# becomes the baseline, leaving two indicator columns that can each serve
# as a binary target variable.
# dtype is pinned so the indicators stay 0/1 integers on every pandas
# version (pandas >= 2.0 would otherwise return booleans, which changes the
# class labels shown by predict() and classification_report()).
dummies = pd.get_dummies(df['species'], drop_first=True, dtype='uint8')
dummies.head()

Unnamed: 0,versicolor,virginica
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0


In [6]:
# Replace the 'species' column with its dummy-encoded columns
df = pd.concat([df.drop(columns=['species']), dummies], axis=1)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,versicolor,virginica
1,5.1,3.5,1.4,0.2,0,0
2,4.9,3.0,1.4,0.2,0,0
3,4.7,3.2,1.3,0.2,0,0
4,4.6,3.1,1.5,0.2,0,0
5,5.0,3.6,1.4,0.2,0,0


## Predict if species is versicolor or not

In [7]:
def train_validate_test_split(df, target, seed=123, test_size=0.2, validate_size=0.3):
    '''
    Split a dataframe into train, validate and test subsets, stratifying
    both splits on the target column.

    Parameters
    ----------
    df : the dataframe to split.
    target : name of the column in df used for stratification.
    seed : value passed as random_state to both splits (default 123).
    test_size : share of df held out as the test set (default 0.2).
    validate_size : share of the remaining (non-test) rows held out as the
        validate set (default 0.3). Note this is a fraction of what is left
        after the test split, not of the full dataframe.

    Returns
    -------
    (train, validate, test)

    With the defaults, 150 rows become 84 train / 36 validate / 30 test.
    '''
    # First cut: carve the test set off the full data.
    train_validate, test = train_test_split(df, test_size=test_size,
                                            random_state=seed,
                                            stratify=df[target])
    # Second cut: divide what is left into train and validate.
    train, validate = train_test_split(train_validate, test_size=validate_size,
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [8]:
# Split the data, stratifying on 'versicolor' (the target for this section)
train, validate, test = train_validate_test_split(df,
                                                  target = 'versicolor',
                                                  seed=123)

In [9]:
# Sanity check: (rows, columns) of each split
train.shape, validate.shape, test.shape

((84, 6), (36, 6), (30, 6))

In [10]:
# Preview the training split
train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,versicolor,virginica
97,5.7,2.9,4.2,1.3,1,0
125,6.7,3.3,5.7,2.1,0,1
87,6.7,3.1,4.7,1.5,1,0
13,4.8,3.0,1.4,0.1,0,0
122,5.6,2.8,4.9,2.0,0,1


In [11]:
# Class balance of the target in the training split
train.versicolor.value_counts()

0    56
1    28
Name: versicolor, dtype: int64

In [12]:
# Separate the features (X) from the target (y) for each split.
# Both dummy columns are removed from X so only the four measurements remain.
X_train, y_train = train.drop(columns=['versicolor', 'virginica']), train['versicolor']

X_validate, y_validate = validate.drop(columns=['versicolor', 'virginica']), validate['versicolor']

X_test, y_test = test.drop(columns=['versicolor', 'virginica']), test['versicolor']

In [13]:
# Preview the feature matrix used for training
X_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
97,5.7,2.9,4.2,1.3
125,6.7,3.3,5.7,2.1
87,6.7,3.1,4.7,1.5
13,4.8,3.0,1.4,0.1
122,5.6,2.8,4.9,2.0


# Model 1

In [14]:
# Define the logistic regression model (C=1, fixed random_state)
logit = LogisticRegression(C=1, random_state=123)


In [15]:
# Fit model 1 on the training data only
logit.fit(X_train, y_train)

LogisticRegression(C=1, random_state=123)

In [16]:
# Use the fitted model to predict the training rows (in-sample predictions)
y_pred = logit.predict(X_train)

In [17]:
# Take a look at the predicted class labels
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0], dtype=uint8)

In [28]:
# Raw class probabilities from the model for each training row,
# wrapped in a DataFrame with one labelled column per class.
y_pred_proba = pd.DataFrame(
    logit.predict_proba(X_train),
    columns=['non-versicolor', 'versicolor'],
)
y_pred_proba

Unnamed: 0,non-versicolor,versicolor
0,0.587445,0.412555
1,0.791497,0.208503
2,0.680186,0.319814
3,0.701358,0.298642
4,0.630256,0.369744
5,0.941644,0.058356
6,0.948056,0.051944
7,0.747642,0.252358
8,0.679891,0.320109
9,0.709062,0.290938


In [27]:
# Classification report for model 1 on the training data
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.89      0.83        56
           1       0.70      0.50      0.58        28

    accuracy                           0.76        84
   macro avg       0.74      0.70      0.71        84
weighted avg       0.75      0.76      0.75        84



## Model 2

In [29]:
# Model 2: same estimator, but with the hyperparameter C lowered to 0.1
logit2 = LogisticRegression(random_state=123, C=0.1)

In [30]:
# Fit model 2 on the same training data as model 1
logit2.fit(X_train, y_train)

LogisticRegression(C=0.1, random_state=123)

In [31]:
# Predict the training rows with model 2
y_pred2 = logit2.predict(X_train)

In [32]:
# Classification report for model 2 on the training data
print(classification_report(y_train, y_pred2))

              precision    recall  f1-score   support

           0       0.67      0.98      0.80        56
           1       0.50      0.04      0.07        28

    accuracy                           0.67        84
   macro avg       0.59      0.51      0.43        84
weighted avg       0.61      0.67      0.55        84



## Evaluate Model 1 and 2 performance on 'Validate'

In [33]:
# Predict the validate split with both fitted models

y_pred_validate = logit.predict(X_validate)
y_pred_validate2 = logit2.predict(X_validate)

In [39]:
# Print accuracy and the classification report on the validate split
# for each model, in the same order and wording as before.
for label, model, preds in (
    ("Model 1: solver = lbfgs, c = 1", logit, y_pred_validate),
    ("Model 2: solver = lbfgs, c = .1", logit2, y_pred_validate2),
):
    print(label)
    print(f'Accuracy: {model.score(X_validate, y_validate):.2f}')
    print(classification_report(y_validate, preds))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.78      0.88      0.82        24
           1       0.67      0.50      0.57        12

    accuracy                           0.75        36
   macro avg       0.72      0.69      0.70        36
weighted avg       0.74      0.75      0.74        36

Model 2: solver = lbfgs, c = .1
Accuracy: 0.61
              precision    recall  f1-score   support

           0       0.65      0.92      0.76        24
           1       0.00      0.00      0.00        12

    accuracy                           0.61        36
   macro avg       0.32      0.46      0.38        36
weighted avg       0.43      0.61      0.51        36



## Select Model for evaluation on  'test'

- Model 1 does not seem overfitted/underfitted.
- Select Model 1 for evaluation on 'test' dataset


In [40]:
# Predict the held-out test split using the selected model (model 1)
y_pred_test = logit.predict(X_test)

In [41]:
# Classification report for model 1 on the test split
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.68      0.75      0.71        20
           1       0.38      0.30      0.33        10

    accuracy                           0.60        30
   macro avg       0.53      0.53      0.52        30
weighted avg       0.58      0.60      0.59        30



### Hyperparameters
#### Regularization:
- Keep model simple
- Constrains the coefficients
- Discourages learning more complex model
- Minimizes overfitting
- avoid overfitting
- L1 - Lasso
- L2 - Ridge

#### C = Inverse of regularization strength:

- Lower C - higher regularization
- Lower C discourages learning more complex model
- minimizes overfitting

## Interpreting model coefficients

In [42]:
# look at model 1 coefficients

print('Coefficient: \n', logit.coef_)


Coefficient: 
 [[ 0.00257421 -2.34030054  0.60822925 -1.1991925 ]]


In [43]:
# look at model 1 coefficients only (first row of coef_)
logit.coef_[0]

array([ 0.00257421, -2.34030054,  0.60822925, -1.1991925 ])

#### Logistic Regression basics:

log(odds) = log(p/(1-p)) = $intercept$ + ($\beta_1$ * variable1) + ($\beta_2$ * variable2) + ($\beta_3$ * variable3)

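**The coefficients above are on the 'log odds' scale: each one is the change in log(odds) for a one-unit increase in that feature, holding the other features constant**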

In [44]:
# Pair each coefficient with its feature name, then order from the most
# negative coefficient to the most positive.
log_coeffs = pd.DataFrame({'coeffs': logit.coef_[0]}, index=X_train.columns)
log_coeffs = log_coeffs.sort_values('coeffs')
log_coeffs

Unnamed: 0,coeffs
sepal_width,-2.340301
petal_width,-1.199193
sepal_length,0.002574
petal_length,0.608229


**It would be helpful to convert 'log odds' to 'odds'**

In [45]:
# Exponentiate every coefficient to move from the log-odds scale to odds
odds = log_coeffs.apply(np.exp)
odds

Unnamed: 0,coeffs
sepal_width,0.096299
petal_width,0.301438
sepal_length,1.002578
petal_length,1.837175


What are odds?

odds = P(occurring) / P(not occurring)  = p / (1-p)

Toss a fair coin
odds = 0.5 / (1-0.5) = 1   i.e. the odds of landing tails vs. heads are 1:1 for a fair coin


#### Coefficient Interpretation (odds):


- **Example: petal_length: For every one-unit increase in petal_length, the odds that an observation is versicolor ('1') are multiplied by about 1.8 (i.e. they become roughly 84% higher than before the increase), assuming all other features remain the same**
- **If the exponentiated coefficient (odds ratio) is 1 or close to 1, a one-unit increase in that feature leaves the odds of being in class '1' (positive class) essentially unchanged. This means the feature with this coefficient is not a big driver for the target variable in this particular model**
- **If the exponentiated coefficient is < 1 (i.e. a fraction between 0 and 1), an increase in the value of that feature decreases the odds that the target variable is in the positive class**