<a href="https://colab.research.google.com/github/trevorwjames/DS-Unit-2-Linear-Models/blob/master/module4-logistic-regression/Trevor_James_LS_DS_214_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Lambda School Data Science

*Unit 2, Sprint 1, Module 4*

---

# Logistic Regression


## Assignment 🌯

You'll use a [**dataset of 400+ burrito reviews**](https://srcole.github.io/100burritos/). How accurately can you predict whether a burrito is rated 'Great'?

> We have developed a 10-dimensional system for rating the burritos in San Diego. ... Generate models for what makes a burrito great and investigate correlations in its dimensions.

- [ ] Do train/validate/test split. Train on reviews from 2016 & earlier. Validate on 2017. Test on 2018 & later.
- [ ] Begin with baselines for classification.
- [ ] Use scikit-learn for logistic regression.
- [ ] Get your model's validation accuracy. (Multiple times if you try multiple iterations.)
- [ ] Get your model's test accuracy. (One time, at the end.)
- [ ] Commit your notebook to your fork of the GitHub repo.


## Stretch Goals

- [ ] Add your own stretch goal(s) !
- [ ] Make exploratory visualizations.
- [ ] Do one-hot encoding.
- [ ] Do [feature scaling](https://scikit-learn.org/stable/modules/preprocessing.html).
- [ ] Get and plot your coefficients.
- [ ] Try [scikit-learn pipelines](https://scikit-learn.org/stable/modules/compose.html).

In [259]:
%%capture
import sys

# Choose where the data is read from, depending on the runtime.
# If you're on Colab: read from the course repo on GitHub and install the
# category_encoders package (imported further down for one-hot encoding).
if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/'
    !pip install category_encoders==2.*

# If you're working locally: use the repo's relative data folder.
# Nothing is installed on this branch, so category_encoders must already be available.
else:
    DATA_PATH = '../data/'

In [260]:
# Load data downloaded from https://srcole.github.io/100burritos/
import pandas as pd

# DATA_PATH already ends with a slash, so the file path is appended directly.
burritos_csv = f'{DATA_PATH}burritos/burritos.csv'
df = pd.read_csv(burritos_csv)

In [261]:
# Derive the binary classification target.
# A burrito is 'Great' when its overall rating is 4 or higher on the
# 5 point scale. Unrated burritos cannot be labelled, so they are dropped
# before the comparison.
df = (
    df.dropna(subset=['overall'])
      .assign(Great=lambda rated: rated['overall'] >= 4)
)

In [262]:
# Clean/combine the Burrito categories
df['Burrito'] = df['Burrito'].str.lower()

# All masks are computed from the lower-cased names before any relabelling.
# na=False: a missing burrito name counts as "no match" instead of producing
# NaN, which keeps the masks strictly boolean (a NaN inside a mask makes
# .loc raise). regex=False: the keywords are plain substrings.
california = df['Burrito'].str.contains('california', na=False, regex=False)
asada = df['Burrito'].str.contains('asada', na=False, regex=False)
surf = df['Burrito'].str.contains('surf', na=False, regex=False)
carnitas = df['Burrito'].str.contains('carnitas', na=False, regex=False)

# Later assignments overwrite earlier ones, so a name that matches several
# keywords ends up with the last matching label in this list.
df.loc[california, 'Burrito'] = 'California'
df.loc[asada, 'Burrito'] = 'Asada'
df.loc[surf, 'Burrito'] = 'Surf & Turf'
df.loc[carnitas, 'Burrito'] = 'Carnitas'

# Everything that matched none of the keywords (including missing names).
df.loc[~california & ~asada & ~surf & ~carnitas, 'Burrito'] = 'Other'

In [263]:
# Drop some high cardinality categoricals
high_cardinality = [
    'Notes',
    'Location',
    'Reviewer',
    'Address',
    'URL',
    'Neighborhood',
]
df = df.drop(columns=high_cardinality)

In [264]:
# Drop some columns to prevent "leakage":
# 'Great' was derived directly from 'overall', so keeping it would hand the
# answer to the model. 'Rec' is dropped alongside it (presumably the
# reviewer's recommendation flag - confirm against the dataset docs).
leaky_columns = ['Rec', 'overall']
df = df.drop(columns=leaky_columns)

In [None]:
# Column dtypes and non-null counts after the drops above
df.info()

In [None]:
# Number of missing values per column
df.isna().sum()

In [267]:
# Peek at the first rows of the cleaned frame
df.head()

Unnamed: 0,Burrito,Date,Yelp,Google,Chips,Cost,Hunger,Mass (g),Density (g/mL),Length,Circum,Volume,Tortilla,Temp,Meat,Fillings,Meat:filling,Uniformity,Salsa,Synergy,Wrap,Unreliable,NonSD,Beef,Pico,Guac,Cheese,Fries,Sour cream,Pork,Chicken,Shrimp,Fish,Rice,Beans,Lettuce,Tomato,Bell peper,Carrots,Cabbage,Sauce,Salsa.1,Cilantro,Onion,Taquito,Pineapple,Ham,Chile relleno,Nopales,Lobster,Queso,Egg,Mushroom,Bacon,Sushi,Avocado,Corn,Zucchini,Great
0,California,1/18/2016,3.5,4.2,,6.49,3.0,,,,,,3.0,5.0,3.0,3.5,4.0,4.0,4.0,4.0,4.0,,,x,x,x,x,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False
1,California,1/24/2016,3.5,3.3,,5.45,3.5,,,,,,2.0,3.5,2.5,2.5,2.0,4.0,3.5,2.5,5.0,,,x,x,x,x,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False
2,Carnitas,1/24/2016,,,,4.85,1.5,,,,,,3.0,2.0,2.5,3.0,4.5,4.0,3.0,3.0,5.0,,,,x,x,,,,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False
3,Asada,1/24/2016,,,,5.25,2.0,,,,,,3.0,2.0,3.5,3.0,4.0,5.0,4.0,4.0,5.0,,,x,x,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False
4,California,1/27/2016,4.0,3.8,x,6.59,4.0,,,,,,4.0,5.0,4.0,3.5,4.5,5.0,2.5,4.5,4.0,,,x,x,,x,x,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True


In [268]:
# Function for wrangling - gets rid of the messy columns for now
def wrangle(df):
    """Return a trimmed copy of ``df`` with ``Date`` parsed as datetimes.

    The measurement, flag and per-ingredient marker columns listed in
    ``messy_columns`` are removed and ``Date`` is converted with
    ``pd.to_datetime``. The input frame is left untouched, so the cell that
    calls this function can be re-run without hitting a ``KeyError`` for
    columns that an earlier run already dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Date`` and every column named in
        ``messy_columns``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns, with ``Date`` as
        datetimes.

    Raises
    ------
    KeyError
        If ``Date`` or any column in ``messy_columns`` is missing.
    """
    messy_columns = [
        'Chips', 'Mass (g)', 'Density (g/mL)', 'Length', 'Circum', 'Volume',
        'Unreliable', 'NonSD',
        'Beef', 'Pico', 'Guac', 'Cheese', 'Fries', 'Sour cream', 'Pork',
        'Chicken', 'Shrimp', 'Fish', 'Rice',
        'Beans', 'Lettuce', 'Tomato', 'Bell peper', 'Carrots', 'Cabbage',
        'Sauce', 'Salsa.1', 'Cilantro',
        'Onion', 'Taquito', 'Pineapple', 'Ham', 'Chile relleno', 'Nopales',
        'Lobster', 'Queso', 'Egg', 'Mushroom',
        'Bacon', 'Sushi', 'Avocado', 'Corn', 'Zucchini',
    ]
    # drop() without inplace=True returns a new frame; the caller's df is not
    # modified.
    trimmed = df.drop(columns=messy_columns)
    # infer_datetime_format is deprecated since pandas 2.0, so it is no
    # longer passed; to_datetime parses the date strings on its own.
    trimmed['Date'] = pd.to_datetime(trimmed['Date'])
    return trimmed

In [269]:
# Apply the wrangle step. The X / y separation itself happens further down,
# after the date-based train / validation / test split.
X = wrangle(df)

In [270]:
# Chronological split keyed on the calendar year of the review:
# train = 2016 and earlier, validation = 2017, test = 2018 and later.
# Comparing years instead of date strings keeps the three subsets disjoint
# even if a timestamp carries a time-of-day, and .copy() gives each subset
# its own frame so the later .pop('Great') calls do not act on a slice of X.
review_year = X['Date'].dt.year
train = X[review_year <= 2016].copy()
val = X[review_year == 2017].copy()
test = X[review_year >= 2018].copy()

In [271]:
# Last training rows - all of them should be dated 2016 or earlier
train.tail()

Unnamed: 0,Burrito,Date,Yelp,Google,Cost,Hunger,Tortilla,Temp,Meat,Fillings,Meat:filling,Uniformity,Salsa,Synergy,Wrap,Great
296,California,2016-12-02,4.0,4.3,5.65,3.0,4.0,1.5,2.0,3.0,4.2,4.0,3.0,2.0,4.5,False
297,Other,2016-12-02,,,5.49,3.0,4.5,5.0,2.0,2.0,2.5,3.5,3.0,2.5,3.0,False
298,California,2016-12-10,3.5,3.7,7.75,4.0,3.5,2.5,3.0,3.3,1.4,2.3,2.2,3.3,4.5,False
299,Asada,2016-12-10,,,7.75,4.0,4.0,4.5,2.0,2.0,3.5,3.5,2.0,2.0,4.0,False
300,Other,2016-12-15,4.5,4.6,6.99,3.7,3.6,4.0,4.0,3.0,3.8,4.3,,3.8,2.0,False


In [272]:
# First test rows - all of them should be dated 2018 or later.
# NOTE(review): the first row in the recorded output is dated 2026-04-25,
# presumably a typo in the source data that lands it in the test split -
# confirm and correct upstream if so.
test.head()

Unnamed: 0,Burrito,Date,Yelp,Google,Cost,Hunger,Tortilla,Temp,Meat,Fillings,Meat:filling,Uniformity,Salsa,Synergy,Wrap,Great
77,California,2026-04-25,,,8.0,4.0,4.5,5.0,5.0,5.0,4.5,5.0,3.0,5.0,5.0,True
386,California,2018-01-02,,,7.25,4.0,4.0,5.0,4.0,5.0,5.0,3.0,3.0,4.0,5.0,False
387,Other,2018-01-09,4.5,3.8,4.19,3.0,3.0,5.0,2.0,2.0,4.0,1.0,4.0,3.0,4.0,False
388,California,2018-01-12,3.5,4.3,7.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,True
389,Other,2018-01-12,,,8.5,4.0,4.0,4.0,3.0,3.5,1.0,2.0,3.0,3.0,1.0,False


In [273]:
# Separate the target from the training frame (pop also removes 'Great' from train)
y_train = train.pop('Great')

In [274]:
# Same separation for the validation and test frames: pop returns the
# 'Great' column and removes it from the frame it was taken from.
y_val, y_test = val.pop('Great'), test.pop('Great')

In [275]:
# 'Date' was only needed for the chronological split, so it is removed
# from each feature matrix.
X_train, X_val, X_test = (
    subset.drop(columns='Date') for subset in (train, val, test)
)

In [276]:
# Checking shapes - each X / y pair must agree on the number of rows
# before it is handed to a model.
for part in (y_train, X_train, y_val, X_val, X_test, y_test):
    print(part.shape)

(298,)
(298, 14)
(85,)
(85, 14)
(38, 14)
(38,)


In [277]:
# First rows of the training features ('Great' and 'Date' already removed)
X_train.head()

Unnamed: 0,Burrito,Yelp,Google,Cost,Hunger,Tortilla,Temp,Meat,Fillings,Meat:filling,Uniformity,Salsa,Synergy,Wrap
0,California,3.5,4.2,6.49,3.0,3.0,5.0,3.0,3.5,4.0,4.0,4.0,4.0,4.0
1,California,3.5,3.3,5.45,3.5,2.0,3.5,2.5,2.5,2.0,4.0,3.5,2.5,5.0
2,Carnitas,,,4.85,1.5,3.0,2.0,2.5,3.0,4.5,4.0,3.0,3.0,5.0
3,Asada,,,5.25,2.0,3.0,2.0,3.5,3.0,4.0,5.0,4.0,4.0,5.0
4,California,4.0,3.8,6.59,4.0,4.0,5.0,4.0,3.5,4.5,5.0,2.5,4.5,4.0


In [278]:
# Training target
y_train

0      False
1      False
2      False
3      False
4       True
       ...  
296    False
297    False
298    False
299    False
300    False
Name: Great, Length: 298, dtype: bool

In [279]:
# Majority-class baseline: the share of each class in the training target.
# Always predicting the most frequent class scores exactly that class's share.
y_train.value_counts(normalize=True)

False    0.590604
True     0.409396
Name: Great, dtype: float64

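The majority class is "not Great" (`False`), at about 59% of the training reviews. A model that always predicts "not Great" would therefore be about 59% accurate, so 0.59 is the baseline accuracy our models need to beat.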

In [280]:
# Now Can we throw this badboy into a pipeline... 

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.linear_model import LogisticRegressionCV


For reference, these are the default `LogisticRegression` parameters, copied from the estimator's repr:

    C=1.0, class_weight=None, dual=False,
    fit_intercept=True, intercept_scaling=1,
    l1_ratio=None, max_iter=100,
    multi_class='auto', n_jobs=None,
    penalty='l2', random_state=None,
    solver='lbfgs', tol=0.0001, verbose=0,
    warm_start=False

In [281]:
# Instantiate the baseline pipeline. Steps run in order:
#   1. one-hot encode the 'Burrito' category,
#   2. fill missing values with SimpleImputer's default strategy,
#   3. fit a logistic regression with default settings.
baseline_steps = [
    ('ohe', OneHotEncoder(cols=['Burrito'])),
    ('imputer', SimpleImputer()),
    ('classifier', LogisticRegression()),
]
log_model = Pipeline(steps=baseline_steps)

log_model

Pipeline(memory=None,
         steps=[('ohe',
                 OneHotEncoder(cols=['Burrito'], drop_invariant=False,
                               handle_missing='value', handle_unknown='value',
                               return_df=True, use_cat_names=False,
                               verbose=0)),
                ('imputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
            

In [282]:
# Fit the baseline pipeline to the training data
# (the trailing ';' keeps the notebook from echoing the fitted pipeline)
log_model.fit(X_train, y_train);

In [283]:
# Training accuracy of the baseline pipeline
log_model.score(X_train, y_train)

0.8926174496644296

In [284]:
# Validation accuracy of the baseline pipeline
log_model.score(X_val, y_val)

0.8470588235294118

In [285]:
# Second iteration: same preprocessing, but with the 'liblinear' solver,
# which was recommended for smaller datasets.
# Only the settings that differ from LogisticRegression's defaults are
# passed. Re-typing every default from the estimator repr adds nothing and
# ties the cell to one scikit-learn version (for example, `multi_class` is
# deprecated in newer releases).
log_model1 = Pipeline([
    ('ohe', OneHotEncoder(cols=['Burrito'])),
    ('imputer', SimpleImputer()),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42)),
])

In [286]:
# Fit the liblinear pipeline on the same training data as the baseline
log_model1.fit(X_train, y_train);

In [287]:
# Training accuracy with the liblinear solver
log_model1.score(X_train, y_train)

0.8657718120805369

In [288]:
# Validation accuracy with the liblinear solver
log_model1.score(X_val, y_val)

0.8352941176470589

In [289]:
# Third iteration: back to the default solver, with n_jobs=-1 and a fixed
# random_state.
# n_jobs is a parallelism setting, not a modelling choice; the scores
# recorded for this pipeline are identical to the baseline log_model's.
# Only the non-default settings are passed. Re-typing every default from the
# estimator repr adds nothing and ties the cell to one scikit-learn version
# (for example, `multi_class` is deprecated in newer releases).
log_model2 = Pipeline([
    ('ohe', OneHotEncoder(cols=['Burrito'])),
    ('imputer', SimpleImputer()),
    ('classifier', LogisticRegression(n_jobs=-1, random_state=42)),
])

In [290]:
# Fit the n_jobs=-1 pipeline on the training data
log_model2.fit(X_train, y_train);

In [291]:
# Training accuracy of the n_jobs=-1 pipeline
log_model2.score(X_train, y_train)

0.8926174496644296

In [292]:
# Validation accuracy of the n_jobs=-1 pipeline
log_model2.score(X_val, y_val)

0.8470588235294118

In [293]:
# Predicted class (True = 'Great') for each validation row
log_model2.predict(X_val)

array([False, False,  True,  True, False, False,  True,  True, False,
        True,  True, False, False, False,  True, False, False,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True, False, False, False, False, False,
       False,  True, False,  True, False, False,  True,  True,  True,
        True, False, False,  True, False, False, False,  True,  True,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True,  True,  True, False,  True,  True,
        True,  True, False, False, False,  True,  True,  True,  True,
        True, False,  True,  True])

In [294]:
# Trying LogisticRegressionCV to see if there is any difference.
# Author's note: n_jobs had to be set to -1 to get a result; the run
# crashed with the default setting.
cv_steps = [
    ('ohe', OneHotEncoder(cols=['Burrito'])),
    ('imputer', SimpleImputer()),
    ('classifier', LogisticRegressionCV(n_jobs=-1)),
]
log_model3 = Pipeline(steps=cv_steps)

In [295]:
# Fit the cross-validated pipeline to the training data
log_model3.fit(X_train, y_train);

In [296]:
# Training accuracy: the CV model scores slightly higher than the plain
# LogisticRegression pipelines above
log_model3.score(X_train, y_train)

0.8993288590604027

In [297]:
# Validation accuracy: also the highest so far, and the gap between training
# and validation accuracy is smaller than for the earlier pipelines
log_model3.score(X_val, y_val)

0.8823529411764706

In [298]:
# log_model3 had the best validation accuracy, so it is the one run on the
# held-out test data.
y_test_pred = log_model3.predict(X_test)

print(y_test_pred)

[ True  True False  True False False  True  True False  True  True False
  True False  True  True  True False False False False  True  True  True
  True False False  True  True  True False  True  True False  True  True
  True  True]


In [299]:
# Final test accuracy (run once, at the end). The test set has only 38 rows
# versus 85 for validation, so this estimate is the noisier of the two.

log_model3.score(X_test, y_test)

0.7894736842105263