In [11]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import logistic_regression_util

import prepare

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Grab Iris Dataset
from pydataset import data

# Load iris into the DataFrame that every later cell refers to as `df`
df = data('iris')


In [7]:
# Normalize the column names: lower-case, with '.' replaced by '_'
df.columns = df.columns.map(lambda name: name.lower().replace('.', '_'))
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [9]:
# One-hot encode species; drop_first=True drops the first level, leaving
# 'versicolor' and 'virginica' as two candidate 0/1 target variables.
# dtype is pinned to uint8 so the columns are 0/1 integers on every pandas
# version (pandas >= 2.0 would otherwise return True/False booleans).
dummies = pd.get_dummies(df['species'], drop_first=True, dtype='uint8')
dummies.head()

Unnamed: 0,versicolor,virginica
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0


In [10]:
# Append the dummy columns to the original frame, then remove the
# now-redundant 'species' column
df = pd.concat([df, dummies], axis='columns').drop(columns='species')
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,versicolor,virginica
1,5.1,3.5,1.4,0.2,0,0
2,4.9,3.0,1.4,0.2,0,0
3,4.7,3.2,1.3,0.2,0,0
4,4.6,3.1,1.5,0.2,0,0
5,5.0,3.6,1.4,0.2,0,0


## Predict if species is versicolor or not

In [12]:
def train_validate_test_split(df, target, seed=123, test_size=0.2, validate_size=0.3):
    '''
    Split a dataframe into train, validate and test dataframes, stratifying
    both splits on the target column.

    Parameters
    ----------
    df : DataFrame to split.
    target : name of the target column (used for stratification only).
    seed : integer random_state passed to both splits for reproducibility.
    test_size : fraction of `df` held out as test (default 0.2).
    validate_size : fraction of the remaining (non-test) rows held out as
        validate (default 0.3).

    With the defaults, test is 20% of the original dataset, validate is
    .30*.80 = 24% of the original dataset, and train is .70*.80 = 56% of
    the original dataset.

    Returns
    -------
    train, validate, test dataframes, in this order.
    '''
    # First carve off the test set ...
    train_validate, test = train_test_split(df, test_size=test_size,
                                            random_state=seed,
                                            stratify=df[target])
    # ... then split what is left into train and validate.
    train, validate = train_test_split(train_validate, test_size=validate_size,
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [13]:
# Split once, stratifying on the target that will be modeled below
train, validate, test = train_validate_test_split(df, 'versicolor', seed=123)

In [14]:
# Separate features (X) from the 'versicolor' target (y) for each split.
# NOTE(review): 'virginica' stays in the feature set. It is another dummy of
# the same species label (virginica == 1 implies versicolor == 0), so it
# leaks target information - consider dropping it from X as well.
X_train, y_train = train.drop(columns='versicolor'), train['versicolor']

X_validate, y_validate = validate.drop(columns='versicolor'), validate['versicolor']

X_test, y_test = test.drop(columns='versicolor'), test['versicolor']

In [15]:
# Preview the first rows of the training features
X_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,virginica
97,5.7,2.9,4.2,1.3,0
125,6.7,3.3,5.7,2.1,1
87,6.7,3.1,4.7,1.5,0
13,4.8,3.0,1.4,0.1,0
122,5.6,2.8,4.9,2.0,1


In [16]:
# Sanity check: features and target should have the same number of rows
X_train.shape, y_train.shape

((84, 5), (84,))

### Hyperparameters
#### Regularization:
- Keeps the model simple
- Constrains the coefficients
- Discourages learning a more complex model
- Minimizes overfitting
- Helps avoid overfitting to the training data
- L1 - Lasso
- L2 - Ridge

#### C = Inverse of regularization strength:

- Lower C - higher regularization
- As C decreases, coefficients shrink toward 0 (with an L1 penalty, more of them become exactly 0).
- Lower C discourages learning a more complex model
- Minimizes overfitting

## Model 1

In [21]:
# Define the logistic regression model: C=1, with class 1 (versicolor)
# weighted 99x relative to class 0
logit = LogisticRegression(
    C=1,
    class_weight={0: 1, 1: 99},
    random_state=123,
)

In [22]:
# Fit the model on the train split only
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight={0: 1, 1: 99}, random_state=123)

In [24]:
# Use the fitted model to predict a class for each training row
y_pred = logit.predict(X_train)

In [25]:
# Take a look at the predictions (1 = versicolor, 0 = not versicolor)
y_pred

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0], dtype=uint8)

In [28]:
# View raw probabilities (output from the model).
# predict_proba returns a plain ndarray with one column per class, ordered as
# in logit.classes_ (0 = not versicolor, 1 = versicolor). Wrap it in a
# DataFrame that shares X_train's index so later cells can call .head() and
# select the 'versicolor' column by name.
y_pred_proba = pd.DataFrame(logit.predict_proba(X_train),
                            columns=['non_versicolor', 'versicolor'],
                            index=X_train.index)
y_pred_proba.head()

In [None]:
# classification report for Model 1 on the training data
print(classification_report(y_train, y_pred))


## Model 2

In [None]:
# from sklearn.linear_model import LogisticRegression model(2)
# Change hyperparameter C = 0.1



In [None]:
# fit the model


In [None]:
# make prediction


In [None]:
#classification report


## Evaluate Model 1 and 2 performance on 'Validate'

In [None]:
# Make prediction for validate dataset


In [None]:
# Shared template for the accuracy line of both models
accuracy_line = 'Accuracy: {:.2f}'

# --- Model 1 on validate ---
print("Model 1: solver = lbfgs, c = 1")
print(accuracy_line.format(logit.score(X_validate, y_validate)))
print(confusion_matrix(y_validate, y_pred_validate))
print('-------------------------------')
print(classification_report(y_validate, y_pred_validate))

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

# --- Model 2 on validate ---
print("Model 2: solver = lbfgs, c = .1")
print(accuracy_line.format(logit2.score(X_validate, y_validate)))
print(confusion_matrix(y_validate, y_pred_validate2))
print(classification_report(y_validate, y_pred_validate2))

## Select Model for evaluation on  'test'

- Model 1 does not seem overfitted/underfitted.
- Select Model 1 for evaluation on 'test' dataset


In [None]:
# Make prediction on X_test using model 1


In [None]:
# print classification report


## Interpreting model coefficients

In [None]:
# look at model 1 coefficients and intercept
for label, values in (('Coefficient: \n', logit.coef_),
                      ('Intercept: \n', logit.intercept_)):
    print(label, values)

In [None]:
# look at model 1 coefficients only (row 0 of the coef_ array)
logit.coef_[0]

#### Logistic Regression basics:

log(odds) = log(p/(1-p)) = $intercept$ + ($\beta_1$ * variable1) + ($\beta_2$ * variable2) + ($\beta_3$ * variable3)

**The coefficients above are on the 'log odds' scale: each one is the change in log odds for a one-unit increase in its feature**

In [None]:
# Build a one-column frame of coefficients, labelled by feature name and
# sorted from smallest to largest
log_coeffs = (
    pd.DataFrame({'coeffs': logit.coef_[0]}, index=X_train.columns)
    .sort_values('coeffs', ascending=True)
)
log_coeffs

**It would be helpful to convert 'log odds' to 'odds'**

In [None]:
# convert from log odds to odds (exponentiate)
# np.exp is applied element-wise, so `odds` keeps the feature-name index and
# the 'coeffs' column of log_coeffs
odds = np.exp(log_coeffs)
odds

#### Coefficient Interpretation (odds):
- **Example: petal_length: For every one unit increase in petal_length, the odds that the observation is versicolor ('1') are multiplied by about 7.1 (i.e. the odds ratio is 7.1), assuming all other features stay the same**
- **If the exponentiated coefficient (odds ratio) is 1 or close to 1 (e.g. for petal_width), a one-unit increase in that feature leaves the odds of being in class '1' (positive class) essentially unchanged. This means the feature is not a big driver of the target variable in this particular model**
- **If the exponentiated coefficient is < 1 (i.e. it is a fraction), an increase in the value of that feature decreases the odds that the target variable is in the positive class**

### Choosing different probability threshold:



Default threshold value is 0.5   
We choose a **threshold t** such that if $P(y = 1) > t$, we predict 1, else we predict 0.

In [None]:
# Custom decision threshold, lower than the default of 0.5
t = 0.3
# Mark every probability above t as 1 and everything else as 0
y_pred1 = (y_pred_proba > t).astype(int)

In [None]:
# Preview the thresholded predictions
y_pred1.head()

In [None]:
# classification report for threshold = t, using the positive-class column
print(classification_report(y_train, y_pred1['versicolor']))

In [None]:
# plot metrics vs thresholds
# NOTE(review): plot_metrics_by_thresholds lives in the project-local
# logistic_regression_util module, which is not shown here; presumably it takes
# the true labels and the positive-class probabilities - confirm against it.
logistic_regression_util.plot_metrics_by_thresholds(y_train, y_pred_proba.versicolor)