# Project 5: Optimizing Evacuation Routes using Real-Time Traffic Information

Song May, Michael Daugherty, Kelly Slatery | US-DSI-10 | 02.21.2020

## Model Data

In [1]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, \
                             GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix

# from sklearn.naive_bayes import MultinomialNB
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC

# Revise this imports list
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

## Prepare Data

In [2]:
# Import train data
df = pd.read_csv('./data/clean_train_data.csv')
df.shape

(168441, 5)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,dates&time,user,tweet,category
0,4959,2020-02-06 22:03:12+00:00,DallasPD,DallasPD and dfrincidents are currently on loc...,1
1,5983,2019-08-30 21:16:20+00:00,DallasPD,Monday Sept on LaborDay Jack Evans Police Hd...,1
2,6062,2019-08-14 22:31:39+00:00,DallasPD,PIODPD is at the scene of a possible barricade...,1
3,6175,2019-07-13 22:04:27+00:00,DallasPD,PIODPD is on scene of an Officer Involved Shoo...,1
4,6177,2019-07-13 01:38:19+00:00,DallasPD,Major police incident in downtown Dallas Griff...,1


In [4]:
# Drop unnecessary column
df.drop(columns='Unnamed: 0', inplace=True)
df.head()

Unnamed: 0,dates&time,user,tweet,category
0,2020-02-06 22:03:12+00:00,DallasPD,DallasPD and dfrincidents are currently on loc...,1
1,2019-08-30 21:16:20+00:00,DallasPD,Monday Sept on LaborDay Jack Evans Police Hd...,1
2,2019-08-14 22:31:39+00:00,DallasPD,PIODPD is at the scene of a possible barricade...,1
3,2019-07-13 22:04:27+00:00,DallasPD,PIODPD is on scene of an Officer Involved Shoo...,1
4,2019-07-13 01:38:19+00:00,DallasPD,Major police incident in downtown Dallas Griff...,1


In [5]:
# Rename the category column
df.rename(columns={'category ': 'class'}, inplace=True)
df.head()

Unnamed: 0,dates&time,user,tweet,class
0,2020-02-06 22:03:12+00:00,DallasPD,DallasPD and dfrincidents are currently on loc...,1
1,2019-08-30 21:16:20+00:00,DallasPD,Monday Sept on LaborDay Jack Evans Police Hd...,1
2,2019-08-14 22:31:39+00:00,DallasPD,PIODPD is at the scene of a possible barricade...,1
3,2019-07-13 22:04:27+00:00,DallasPD,PIODPD is on scene of an Officer Involved Shoo...,1
4,2019-07-13 01:38:19+00:00,DallasPD,Major police incident in downtown Dallas Griff...,1


In [6]:
# Check for nulls
df.isnull().sum()

dates&time       0
user             0
tweet         1730
class            0
dtype: int64

In [7]:
# Remove rows with nulls
df = df[df['tweet'].notnull()]

In [8]:
# Check out the y value distribution
df['class'].value_counts(normalize=True)

0    0.948786
1    0.051214
Name: class, dtype: float64

Baseline Accuracy: 95%

In [9]:
# WHAT CAN WE DO WITH UNBALANCED CLASSES????
# bootstrap & the opposite (multiply rows of minority class, random sample rows of majority class)

In [10]:
# Define X and y
X = df['tweet']
y = df['class']

# Train tes split data
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.25, 
                                                    stratify=y,
                                                    random_state=42)

## Logistic Regression: CountVecotorizer

In [11]:
# Set up pipeline
lr_cv_pipe = Pipeline([
    ('cvec', CountVectorizer()), 
    ('lr', LogisticRegression(solver='liblinear', verbose=1)) 
])

# Set up pipeline parameters
lr_cv_params = {
    'cvec__stop_words': [None, 'english'],
    'cvec__ngram_range': [(1,1), (1,2), (1,3)], 
    'cvec__max_features': [100, 500], 
    'cvec__max_df': [1.0, .95], 
    'cvec__min_df': [1, .05], 
    'lr__C': [1, 1e9], 
    'lr__penalty': ['l1', 'l2'],
    'lr__max_iter': [100, 500, 1000]
}

In [12]:
# Set up gridsearch
lr_cv_gs = GridSearchCV(lr_cv_pipe, lr_cv_params, cv=5, verbose=1)

# Fit the gridsearch to the training data
lr_cv_gs.fit(X_train, y_train);

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear

[Parallel(n_jobs=1)]: Done 2880 out of 2880 | elapsed: 239.7min finished


[LibLinear]

In [13]:
# What was our highest cross-val accuracy score across model hyperparameter combinations?
lr_cv_gs.best_score_

0.9972567242248046

In [14]:
# What model hyperparameters yielded the highest accuracy score?
lr_cv_gs.best_estimator_

Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=500, min_df=1, ngram_range=(1, 2),
                                 preprocessor=None, stop_words=None,
                                 strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('lr',
                 LogisticRegression(C=1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                              

In [15]:
# What were the parameters?
lr_cv_gs.best_params_

{'cvec__max_df': 1.0,
 'cvec__max_features': 500,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None,
 'lr__C': 1,
 'lr__max_iter': 100,
 'lr__penalty': 'l1'}

In [17]:
# How does this model perform on the test set?
lr_cv_gs.score(X_test, y_test)

0.9971927635683094

This model performs with high accuracy on both the train and test datasets. It is neither overfit nor underfit.

## Logistic Regression: TfidfVecotorizer

In [18]:
# Set up pipeline
lr_tv_pipe = Pipeline([
    ('tvec', TfidfVectorizer()), 
    ('lr', LogisticRegression(solver='liblinear', verbose=1)) 
])

# Set up pipeline parameters
lr_tv_params = {
    'tvec__stop_words': [None, 'english'],
    'tvec__ngram_range': [(1,1), (1,2), (1,3)], 
    'tvec__max_features': [100, 500], 
    'tvec__max_df': [1.0, .95], 
    'tvec__min_df': [1, .05], 
    'lr__C': [1, 1e9], 
    'lr__penalty': ['l1', 'l2'],
    'lr__max_iter': [100, 500, 1000]
}

In [20]:
# Set up gridsearch
lr_tv_gs = GridSearchCV(lr_tv_pipe, lr_tv_params, cv=5)

# Fit the gridsearch to the training data
lr_tv_gs.fit(X_train, y_train);

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear

In [21]:
# What was our highest cross-val accuracy score across model hyperparameter combinations?
lr_tv_gs.best_score_

0.9946734062207577

In [22]:
# What model hyperparameters yielded the highest accuracy score?
lr_tv_gs.best_estimator_

Pipeline(memory=None,
         steps=[('tvec',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=500,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('lr',
                 LogisticRegression(C=1000000000.0, class_weight=None,
                                    dual=False, fit_intercept=True,
           

In [23]:
# What were the parameters?
lr_tv_gs.best_params_

{'lr__C': 1000000000.0,
 'lr__max_iter': 100,
 'lr__penalty': 'l2',
 'tvec__max_df': 1.0,
 'tvec__max_features': 500,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 2),
 'tvec__stop_words': None}

In [24]:
# How does this model perform on the test set?
lr_tv_gs.score(X_test, y_test)

0.9949853639809971

## Support Vector Machine: CountVectorizer 1

In [11]:
# Set up pipeline
svc_cv_pipe = Pipeline([
    ('cvec', CountVectorizer()), 
    ('svc', SVC(gamma='scale')) 
])

# Set up pipeline parameters -- not searching over any parameters
svc_cv_params = {
    'cvec__ngram_range': [(1,2)], 
    'svc__C': [100], 
}

In [12]:
# Set up gridsearch
svc_cv_gs = GridSearchCV(svc_cv_pipe, svc_cv_params, cv=5)

# Fit the gridsearch to the training data
svc_cv_gs.fit(X_train, y_train);

In [13]:
# What was our highest cross-val accuracy score across model hyperparameter combinations?
svc_cv_gs.best_score_

0.9953372309710237

In [14]:
# What model hyperparameters yielded the highest accuracy score?
svc_cv_gs.best_estimator_

Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('svc',
                 SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma='scale',
                     kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbo

In [15]:
# What were the parameters?
svc_cv_gs.best_params_

{'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None,
 'svc__C': 100,
 'svc__kernel': 'rbf'}

In [16]:
# How does this model perform on the test set?
svc_cv_gs.score(X_test, y_test)

0.9959451029320024

## Support Vector Machine: CountVectorizer 2

In [14]:
# Set up pipeline
svc_cv_pipe2 = Pipeline([
    ('cvec', CountVectorizer(max_features=500, ngram_range=(1,2))), 
    ('svc', SVC(gamma='scale')) 
])

# Set up pipeline parameters -- searching over SVC C parameter
svc_cv_params2 = {
    'svc__C': [1, 2, 3, 7, 10, 20, 50] 
}

In [15]:
# Set up gridsearch
svc_cv_gs2 = GridSearchCV(svc_cv_pipe2, svc_cv_params2, n_jobs=4, cv=5)

# Fit the gridsearch to the training data
svc_cv_gs2.fit(X_train, y_train);

In [16]:
# What was our highest cross-val accuracy score across model hyperparameter combinations?
svc_cv_gs2.best_score_

0.9964569353690625

In [17]:
# What model hyperparameters yielded the highest accuracy score?
svc_cv_gs2.best_estimator_

Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=500, min_df=1, ngram_range=(1, 2),
                                 preprocessor=None, stop_words=None,
                                 strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('svc',
                 SVC(C=7, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma='scale',
                     kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=

In [18]:
# What were the parameters?
svc_cv_gs2.best_params_

{'svc__C': 7}

In [19]:
# How does this model perform on the test set?
svc_cv_gs2.score(X_test, y_test)

0.9965689332501559

## Support Vector Machine: TfidfVectorizer

## SVC: do not use

In [12]:
# Set up pipeline
svc_cv_pipe = Pipeline([
    ('cvec', CountVectorizer()), 
    ('svc', SVC(gamma='scale')) 
])

# Set up pipeline parameters
svc_cv_params = {
    'cvec__stop_words': [None, 'english'],
    'cvec__ngram_range': [(1,1), (1,2), (1,3)], 
    'svc__C': np.logspace(-2, 3, 10), 
    'svc__kernel': ['rbf', 'linear'],
}

In [13]:
# Set up gridsearch
svc_cv_gs = GridSearchCV(svc_cv_pipe, svc_cv_params, cv=5)

# Fit the gridsearch to the training data
svc_cv_gs.fit(X_train, y_train);

KeyboardInterrupt: 

In [None]:
# What was our highest cross-val accuracy score across model hyperparameter combinations?
svc_cv_gs.best_score_

In [None]:
# What model hyperparameters yielded the highest accuracy score?
svc_cv_gs.best_estimator_

In [None]:
# What were the parameters?
svc_cv_gs.best_params_

In [None]:
# How does this model perform on the test set?
svc_cv_gs.score(X_test, y_test)

## Support Vector Machine: TfidfVectorizer

In [None]:
# Set up pipeline
svc_tv_pipe = Pipeline([
    ('tvec', TfidfVectorizer()), 
    ('svc', SVC(gamma='scale')) 
])

# Set up pipeline parameters
svc_tv_params = {
    'tvec__stop_words': [None, 'english'],
    'tvec__ngram_range': [(1,1), (1,2), (1,3)], 
    'svc__C': np.logspace(-2, 3, 10), 
    'svc__kernel': ['rbf', 'linear'],
}

In [None]:
# Set up gridsearch
svc_tv_gs = GridSearchCV(svc_tv_pipe, svc_tv_params, cv=5)

# Fit the gridsearch to the training data
svc_tv_gs.fit(X_train, y_train);

In [None]:
# What was our highest cross-val accuracy score across model hyperparameter combinations?
svc_tv_gs.best_score_

In [None]:
# What model hyperparameters yielded the highest accuracy score?
svc_tv_gs.best_estimator_

In [None]:
# What were the parameters?
svc_tv_gs.best_params_

In [None]:
# How does this model perform on the test set?
svc_tv_gs.score(X_test, y_test)