# New whiskey competition notebook for dspt1 

In [16]:
# Import all the things!!
import pandas as pd 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [7]:
# Read the training CSV and the test CSV into DataFrames.
train_csv = 'C:\\Users\\dakot\\Documents\\GitHub\\DS-Unit-4-Sprint-1-NLP\\train.csv'
test_csv = 'C:\\Users\\dakot\\Documents\\GitHub\\DS-Unit-4-Sprint-1-NLP\\test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)
# Show (rows, columns) for both frames as the cell output.
(train.shape, test.shape)

((2586, 3), (288, 2))

In [4]:
# Preview the first five training rows to get a feel for the columns
train.head(n=5)

Unnamed: 0,id,description,category
0,1,A marriage of 13 and 18 year old bourbons. A m...,2
1,2,There have been some legendary Bowmores from t...,1
2,3,This bottling celebrates master distiller Park...,2
3,4,What impresses me most is how this whisky evol...,1
4,9,"A caramel-laden fruit bouquet, followed by une...",2


In [6]:
# The preview truncates the text -- how much information is in a full description?
train.loc[0, 'description']

'A marriage of 13 and 18 year old bourbons. A mature yet very elegant whiskey, with a silky texture and so easy to embrace with a splash of water. Balanced notes of honeyed vanilla, soft caramel, a basket of complex orchard fruit, blackberry, papaya, and a dusting of cocoa and nutmeg; smooth finish. Sophisticated, stylish, with well-defined flavors. A classic!'

In [8]:
# Weird, there are no authors -- I think they were present the last time I did this competition.
# What is in the test set?
test.head(n=5)

Unnamed: 0,id,description
0,955,"Think carnival aromas—the good ones, anyway—me..."
1,3532,"A blend of three bourbons, between 6 and 12 ye..."
2,1390,"The nose is focused on cereal, hints of fresh ..."
3,1024,Swiss-based Chapter 7 released this 19 year ol...
4,1902,Valkyrie replaces the current Dark Origins exp...


In [13]:
# As expected, the test set has descriptions but no category labels.
# Check both frames for NaNs. A bare expression in the middle of a cell is
# evaluated and then thrown away (only the cell's last expression is displayed),
# so enforce the check with assertions instead of relying on output nobody sees.
assert train.isna().sum().sum() == 0, 'train contains missing values'
assert test.isna().sum().sum() == 0, 'test contains missing values'
# No nulls, so go straight to baselines: start with the class balance of the target.
train.category.value_counts(normalize=True)

1    0.633024
2    0.173627
3    0.116009
4    0.077340
Name: category, dtype: float64

In [14]:
# Category 1 makes up ~63% of the training labels, so always predicting it
# would score ~63% accuracy -- that is the number to beat!
# Baseline model: TF-IDF features (English stop words removed) feeding an SGD-trained classifier.
vect = TfidfVectorizer(stop_words='english')
# SGD training is stochastic; pin the seed so that re-running the notebook
# reproduces the same fitted model and the same predictions.
sgdc = SGDClassifier(random_state=42)
pipe = Pipeline([('vect', vect), ('classifier', sgdc)])

target = 'category'
X_train = train['description']
y_train = train[target]
X_test = test['description']

# Fit the pipeline on the labelled data, then predict the unlabelled test set.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [15]:
# Display the raw predictions returned by pipe.predict to see what form they take
y_pred

array([2, 2, 4, 1, 1, 1, 1, 1, 2, 1, 4, 4, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 4, 1, 1, 1, 3, 1, 4, 2, 1, 1, 1, 1, 1, 3, 4, 3, 2, 1, 1, 3,
       1, 1, 1, 2, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 4,
       2, 3, 1, 1, 1, 3, 1, 1, 4, 1, 3, 2, 1, 1, 4, 2, 2, 1, 1, 3, 2, 4,
       1, 3, 1, 1, 1, 1, 1, 4, 1, 1, 4, 3, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2,
       3, 1, 1, 1, 1, 3, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1, 1, 2, 2, 4, 1, 1,
       1, 1, 3, 2, 1, 1, 1, 1, 1, 3, 2, 1, 1, 3, 4, 1, 1, 1, 3, 1, 1, 1,
       1, 1, 2, 1, 1, 1, 1, 1, 4, 1, 1, 1, 3, 1, 2, 2, 1, 3, 3, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 4, 1, 3, 1, 4, 1, 1, 2, 2, 1, 1,
       2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 4, 1, 1, 3, 1, 2, 1, 1, 1, 1, 1,
       1, 4, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 3,
       2, 2, 1, 3, 1, 3, 3, 3, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 3, 4, 1, 1, 1, 3,
       2, 1], dtype=int64)

In [17]:
# The predictions come back as an array of integer category labels.
# Next, set up a small grid (3 x 3 = 9 candidates) over the TF-IDF max_df
# cutoff and the classifier's max_iter, and search it with 5-fold CV.
parameters = dict(
    vect__max_df=(0.5, 0.75, 1.0),
    classifier__max_iter=(20, 10, 100),
)

grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done  27 out of  45 | elapsed:    7.7s remaining:    5.1s
[Parallel(n_jobs=-1)]: Done  32 out of  45 | elapsed:    8.0s remaining:    3.2s
[Parallel(n_jobs=-1)]: Done  37 out of  45 | elapsed:    8.2s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done  42 out of  45 | elapsed:    8.4s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    8.6s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [19]:
# Take the best pipeline found by the grid search and package a quick
# submission before tuning further and trying better models.
best = grid_search.best_estimator_
grid_y_pred = best.predict(X_test)

# Put the test ids next to the predicted categories (the form the competition
# expects) and write them out without the DataFrame index.
best_df = test[['id']].assign(category=grid_y_pred.astype('int'))
best_df.to_csv(
    'C:\\Users\\dakot\\Documents\\GitHub\\DS-Unit-4-Sprint-1-NLP\\base_grid.csv',
    index=False,
)

## The basic stochastic gradient descent model plus grid search CV netted an accuracy of 95.348% -- let's keep tuning

In [37]:
# Second tuning pass: rebuild the pipeline and search a wider space (n-gram
# range, more max_df cutoffs, higher iteration caps) with a randomized search.
vect = TfidfVectorizer(stop_words='english')
# Seed the classifier so every fit is repeatable across notebook runs.
sgdc = SGDClassifier(random_state=42)
pipe = Pipeline([('vect', vect), ('classifier', sgdc)])

parameters = {
    'vect__ngram_range': ((1,1),(1,2),(1,3)),
    'vect__max_df': (0.25, 0.33, 0.5, 0.75, 1.0),
    'classifier__max_iter': (20, 100, 1000) # raise the iteration cap to give SGD a better chance to converge
}

# The search only samples a subset of the 45 possible combinations, so seed it
# too: otherwise a re-run may evaluate different candidates and pick a
# different "best" model.
rand_search = RandomizedSearchCV(pipe, parameters, cv=7, n_jobs=-1, verbose=10, random_state=42)
rand_search.fit(X_train, y_train)

# Predict the test set with the best pipeline found by the search.
best = rand_search.best_estimator_
rand_y_pred = best.predict(X_test)

# Package ids and predicted categories for submission.
best_df = pd.DataFrame(test.id)
best_df['category'] = rand_y_pred.astype('int')
# Every backslash in a normal string literal must be doubled: a lone '\G' is an
# invalid escape sequence that newer Python versions warn about (and will
# eventually reject), even though it currently yields the same path.
best_df.to_csv('C:\\Users\\dakot\\Documents\\GitHub\\DS-Unit-4-Sprint-1-NLP\\rand_grid2.csv', index=False)

Fitting 7 folds for each of 10 candidates, totalling 70 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done  55 out of  70 | elapsed:   15.6s remaining:    4.2s
[Parallel(n_jobs=-1)]: Done  63 out of  70 | elapsed:   16.1s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:   18.9s finished


In [38]:
# Display the best score recorded by the randomized search
# (per the scikit-learn docs: the mean cross-validated score of the best candidate).
rand_search.best_score_

0.9423820572312451