<a href="https://colab.research.google.com/github/austiezr/DS-Unit-2-Linear-Models/blob/master/module4-logistic-regression/LS_DS_214_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Lambda School Data Science

*Unit 2, Sprint 1, Module 4*

---

# Logistic Regression


## Assignment 🌯

You'll use a [**dataset of 400+ burrito reviews**](https://srcole.github.io/100burritos/). How accurately can you predict whether a burrito is rated 'Great'?

> We have developed a 10-dimensional system for rating the burritos in San Diego. ... Generate models for what makes a burrito great and investigate correlations in its dimensions.

- [X] Do train/validate/test split. Train on reviews from 2016 & earlier. Validate on 2017. Test on 2018 & later.
- [X] Begin with baselines for classification.
- [X] Use scikit-learn for logistic regression.
- [X] Get your model's validation accuracy. (Multiple times if you try multiple iterations.)
- [X] Get your model's test accuracy. (One time, at the end.)
- [X] Commit your notebook to your fork of the GitHub repo.


## Stretch Goals

- [ ] Add your own stretch goal(s) !
- [ ] Make exploratory visualizations.
- [X] Do one-hot encoding.
- [X] Do [feature scaling](https://scikit-learn.org/stable/modules/preprocessing.html).
- [ ] Get and plot your coefficients.
- [X] Try [scikit-learn pipelines](https://scikit-learn.org/stable/modules/compose.html).

In [0]:
import category_encoders as ce
import numpy as np
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

## Loading / Cleaning

### Given Code

In [0]:
%%capture
# %%capture suppresses this cell's output, including anything pip prints.
# The cell sets DATA_PATH, the prefix used when reading the CSV files.
import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    # Read the data over HTTP from the raw GitHub URL.
    DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/'
    # Shell escape: install any 2.x release of category_encoders.
    !pip install category_encoders==2.*

# If you're working locally:
else:
    # Relative path to the local data directory.
    DATA_PATH = '../data/'

In [0]:
# Load data downloaded from https://srcole.github.io/100burritos/
import pandas as pd

# Build the full location first so the read call stays short.
burritos_csv = DATA_PATH + 'burritos/burritos.csv'
df = pd.read_csv(burritos_csv)

In [0]:
# Derive binary classification target:
# a 'Great' burrito has an overall rating of 4 or higher, on a 5 point scale.
# Unrated burritos are dropped first, so the comparison never sees a missing rating.
df = (
    df
    .dropna(subset=['overall'])
    .assign(Great=lambda rated: rated['overall'] >= 4)
)

In [0]:
# Clean/combine the Burrito categories
burrito_lower = df['Burrito'].str.lower()
df['Burrito'] = burrito_lower

# Keyword masks are taken from the lower-cased names before any relabelling.
california = burrito_lower.str.contains('california')
asada = burrito_lower.str.contains('asada')
surf = burrito_lower.str.contains('surf')
carnitas = burrito_lower.str.contains('carnitas')

# Labels are applied in this order, so a name matching several keywords
# ends up with the last matching label.
for label, mask in [
    ('California', california),
    ('Asada', asada),
    ('Surf & Turf', surf),
    ('Carnitas', carnitas),
]:
    df.loc[mask, 'Burrito'] = label

# Anything matching none of the keywords is lumped into 'Other'.
df.loc[~(california | asada | surf | carnitas), 'Burrito'] = 'Other'

In [0]:
# Drop some high cardinality categoricals
high_cardinality_cols = ['Notes', 'Location', 'Reviewer', 'Address', 'URL', 'Neighborhood']
df = df.drop(columns=high_cardinality_cols)

In [0]:
# Drop some columns to prevent "leakage"
# ('overall' is the rating the 'Great' target was derived from).
leakage_cols = ['Rec', 'overall']
df = df.drop(columns=leakage_cols)

### My Code

In [0]:
# Parse the review date so the frame can be split by year later on.
df = df.assign(Date=pd.to_datetime(df['Date']))

In [0]:
# Map marker values to numbers across the whole frame:
#   NaN, 'No'        -> 0
#   'x', 'X', 'Yes'  -> 1
# The replacement is applied to every column, so missing values in numeric
# columns also become 0.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df = df.replace([np.nan, 'No', 'x', 'X', 'Yes'], [0, 0, 1, 1, 1])

## Train/Validate/Test Split & One-Hot Encoding

In [0]:
target = 'Great'

# Time-based split on the review date: 2016 and earlier / 2017 / 2018 and later.
review_date = df['Date']
before_2017 = review_date < '01/01/2017'
during_2017 = (review_date >= '01/01/2017') & (review_date < '01/01/2018')
from_2018 = review_date >= '01/01/2018'

# The encoder is fitted on the training rows only and then reused unchanged
# for the validation and test rows.
encoder = ce.OneHotEncoder(cols=['Burrito'])
train = encoder.fit_transform(df[before_2017])
validate = encoder.transform(df[during_2017])
test = encoder.transform(df[from_2018])

# Model inputs: every encoded column except the ones listed here
# (which include the split key 'Date' and the target 'Great').
excluded_cols = ['Mass (g)', 'Density (g/mL)', 'Length', 'Circum', 'Volume', 'Date', 'Great']
features = train.columns.drop(excluded_cols)

In [0]:
# Feature matrix and target vector for each split.
X_train, y_train = train[features], train[target]
X_validate, y_validate = validate[features], validate[target]
X_test, y_test = test[features], test[target]

# Baseline Measures

In [141]:
# Majority-class baseline: always predict the most common training label.
majority_class = y_train.mode()[0]

# (label, true labels, trailing text); the last line is printed without an
# extra blank line after it.
baseline_splits = [
    ('Train', y_train, '\n'),
    ('Validation', y_validate, '\n'),
    ('Test', y_test, ''),
]
for split_name, y_true, trailer in baseline_splits:
    y_pred = [majority_class] * len(y_true)
    print(f'{split_name} Baseline Accuracy: {accuracy_score(y_true, y_pred)}{trailer}')

Train Baseline Accuracy: 0.5906040268456376

Validation Baseline Accuracy: 0.5529411764705883

Test Baseline Accuracy: 0.42105263157894735


# Regression W/ Pipeline

## Testing

In [0]:
# Scale -> keep the 10 best-scoring features -> logistic regression.
scaler = StandardScaler()
selector = SelectKBest(score_func=f_regression, k=10)
classifier = LogisticRegression()
pipe = make_pipeline(scaler, selector, classifier)

In [13]:
# Fit every pipeline step on the training split only; as the cell's last
# expression, the fitted pipeline is also what gets displayed.
pipe.fit(X_train, y_train)

  corr /= X_norms
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selectkbest',
                 SelectKBest(k=10,
                             score_func=<function f_regression at 0x7f6443a22598>)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [14]:
# Score the fitted k=10 pipeline on the validation split with F1.
y_pred = pipe.predict(X_validate)
f1_score(y_true=y_validate, y_pred=y_pred)

0.8

## Logistic Regression

In [0]:
def LogRegression(X_train=X_train, y_train=y_train,
                  X_test=X_validate, y_test=y_validate,
                  acc_ref=0):
    """Sweep the SelectKBest feature count and report the most accurate model.

    For every k from 1 to the number of columns in ``X_train``, a pipeline of
    StandardScaler -> SelectKBest(f_regression, k) -> LogisticRegression is
    fitted on ``(X_train, y_train)`` and scored for accuracy on
    ``(X_test, y_test)``. The smallest k reaching the highest accuracy that is
    strictly above ``acc_ref`` is printed together with that accuracy.

    Parameters
    ----------
    X_train, y_train : training features and labels (default: training split).
    X_test, y_test : data the candidate models are scored on
        (default: validation split).
    acc_ref : accuracy a candidate has to beat to be reported (default 0).

    Returns
    -------
    None. The result is printed.

    Notes
    -----
    * k is chosen on the very data passed as ``X_test``/``y_test``, so the
      printed accuracy is an optimistic estimate for that data.
    * The default arguments are bound when this ``def`` is executed; re-run
      the definition after rebuilding the splits.
    """
    best_k = None
    best_accuracy = acc_ref

    # Silence NumPy divide/invalid floating-point warnings while fitting
    # (presumably the ones the feature selector emits on this data).
    with np.errstate(divide='ignore', invalid='ignore'):
        for k in range(1, len(X_train.columns) + 1):
            pipe = make_pipeline(
                StandardScaler(),
                SelectKBest(score_func=f_regression, k=k),
                LogisticRegression(),
            )
            pipe.fit(X_train, y_train)
            accuracy = pipe.score(X_test, y_test)

            # Strict comparison: on ties the smaller k is kept.
            if accuracy > best_accuracy:
                best_k, best_accuracy = k, accuracy

    # Report once, after the sweep. best_k stays None when no candidate beat
    # acc_ref (or there were no columns to sweep over).
    if best_k is None:
        print(f'No feature count scored above the reference accuracy of {acc_ref}\n')
        return

    print(f'{best_k} Features Used\n')
    print(f'Accuracy: {best_accuracy}\n')

### Validation

In [68]:
# With the default arguments each k is scored on the validation split.
LogRegression()

4 Features Used

Accuracy: 0.8823529411764706



### Test

In [69]:
# NOTE(review): LogRegression reports the k with the highest accuracy on the
# data it is scored on, so this call selects k on the test split itself; the
# printed accuracy is an optimistic estimate, not a held-out one.
LogRegression(X_test=X_test, y_test=y_test)

5 Features Used

Accuracy: 0.7631578947368421



## Logistic Regression CV


In [0]:
def LogCVRegression(X_train=X_train, y_train=y_train,
                    X_test=X_validate, y_test=y_validate,
                    acc_ref=0):
    """Sweep the SelectKBest feature count using LogisticRegressionCV.

    For every k from 1 to the number of columns in ``X_train``, a pipeline of
    StandardScaler -> SelectKBest(f_regression, k) ->
    LogisticRegressionCV(Cs=100, max_iter=10000) is fitted on
    ``(X_train, y_train)`` and scored for accuracy on ``(X_test, y_test)``.
    The smallest k reaching the highest accuracy that is strictly above
    ``acc_ref`` is printed with that accuracy, followed by the mean accuracy
    over all k.

    Parameters
    ----------
    X_train, y_train : training features and labels (default: training split).
    X_test, y_test : data the candidate models are scored on
        (default: validation split).
    acc_ref : accuracy a candidate has to beat to be reported (default 0).

    Returns
    -------
    None. The results are printed.

    Notes
    -----
    * k is chosen on the very data passed as ``X_test``/``y_test``, so the
      printed best accuracy is an optimistic estimate for that data.
    * The default arguments are bound when this ``def`` is executed; re-run
      the definition after rebuilding the splits.
    """
    best_k = None
    best_accuracy = acc_ref
    acc_list = []

    # Silence NumPy divide/invalid floating-point warnings while fitting
    # (presumably the ones the feature selector emits on this data).
    with np.errstate(divide='ignore', invalid='ignore'):
        for k in range(1, len(X_train.columns) + 1):
            pipe = make_pipeline(
                StandardScaler(),
                SelectKBest(score_func=f_regression, k=k),
                LogisticRegressionCV(Cs=100, max_iter=10000),
            )
            pipe.fit(X_train, y_train)
            accuracy = pipe.score(X_test, y_test)
            acc_list.append(accuracy)

            # Strict comparison: on ties the smaller k is kept.
            if accuracy > best_accuracy:
                best_k, best_accuracy = k, accuracy

    # Report once, after the sweep. best_k stays None when no candidate beat
    # acc_ref (or there were no columns to sweep over).
    if best_k is None:
        print(f'No feature count scored above the reference accuracy of {acc_ref}\n')
    else:
        print(f'{best_k} Features Used\n')
        print(f'Accuracy: {best_accuracy}\n')

    # The mean is only defined when at least one model was scored.
    if acc_list:
        print(f'Mean Accuracy: {np.mean(acc_list)}\n')

### Validation

In [72]:
# With the default arguments each k is scored on the validation split.
LogCVRegression()

4 Features Used

Accuracy: 0.8941176470588236

Mean Accuracy: 0.7894957983193277



### Test

In [73]:
# NOTE(review): LogCVRegression reports the k with the highest accuracy on the
# data it is scored on, so this call selects k on the test split itself; the
# printed best accuracy is an optimistic estimate, not a held-out one.
LogCVRegression(X_test=X_test, y_test=y_test)

8 Features Used

Accuracy: 0.7894736842105263

Mean Accuracy: 0.7396616541353384

