## Kaggle Competition

In [43]:
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
# ---------------- Pandas settings --------------- #
# Removes rows and columns truncation of '...'
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)


from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD

from joblib import dump, load

import xgboost as xgb
import spacy


nlp = spacy.load("en_core_web_lg")

## Load data

In [2]:
test = pd.read_csv('./data/test.csv')
train = pd.read_csv('./data/train.csv')

## Quick EDA

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2586 entries, 0 to 2585
Data columns (total 3 columns):
id             2586 non-null int64
description    2586 non-null object
category       2586 non-null int64
dtypes: int64(2), object(1)
memory usage: 60.7+ KB


In [4]:
train[train.duplicated()]

Unnamed: 0,id,description,category


In [5]:
train.isnull().sum()

id             0
description    0
category       0
dtype: int64

In [6]:
train.head()

Unnamed: 0,id,description,category
0,1,A marriage of 13 and 18 year old bourbons. A m...,2
1,2,There have been some legendary Bowmores from t...,1
2,3,This bottling celebrates master distiller Park...,2
3,4,What impresses me most is how this whisky evol...,1
4,9,"A caramel-laden fruit bouquet, followed by une...",2


In [7]:
def wrangle(df):
    df = df.copy()
    df['description'] = df['description'].str.lower().str.strip().str.replace(r"’", "'")
    return df

In [8]:
train = wrangle(train)
test = wrangle(test)

In [9]:
train.loc[0, 'description']

'a marriage of 13 and 18 year old bourbons. a mature yet very elegant whiskey, with a silky texture and so easy to embrace with a splash of water. balanced notes of honeyed vanilla, soft caramel, a basket of complex orchard fruit, blackberry, papaya, and a dusting of cocoa and nutmeg; smooth finish. sophisticated, stylish, with well-defined flavors. a classic!'

In [10]:
train.loc[10, 'description']

"another excellent stagg, and considering its alcohol level, it's also a good value if you can get it at this price. notes of toffee, pot still rum, nougat, dates, tobacco, roasted nuts, polished oak, and leather. great depth and nicely balanced. a masculine bourbon of character and structure."

In [11]:
import string
from spacy.lang.en.stop_words import STOP_WORDS

def tokenize(text):
    tokens = []
    doc = nlp(text)
    for token in doc:
        if (token.is_stop == False) and (token.is_punct == False or token.text not in string.punctuation):
            tokens.append(token.text)
    return tokens

In [12]:
text = train.loc[10, 'description']
tokenize(text)

['another',
 'excellent',
 'stagg',
 'and',
 'considering',
 'its',
 'alcohol',
 'level',
 'it',
 "'s",
 'also',
 'a',
 'good',
 'value',
 'if',
 'you',
 'can',
 'get',
 'it',
 'at',
 'this',
 'price',
 'notes',
 'of',
 'toffee',
 'pot',
 'still',
 'rum',
 'nougat',
 'dates',
 'tobacco',
 'roasted',
 'nuts',
 'polished',
 'oak',
 'and',
 'leather',
 'great',
 'depth',
 'and',
 'nicely',
 'balanced',
 'a',
 'masculine',
 'bourbon',
 'of',
 'character',
 'and',
 'structure']

#### Submission

In [13]:
def submission(model, file_name_suffix):
    # Predictions on test sample
    preds = model.predict(test['description'])
    
    # Convert predictions to dataframe
    submission = pd.DataFrame({'id': test['id'], 'category':preds})
    submission['category'] = submission['category'].astype(int)
    
    # Save your Submission File
    file_path = f'./data/submission_{file_name_suffix}.csv'
    submission.to_csv(file_path, index=False)
    print(f'File saved at: {file_path}')
    print(submission.head())

#### Pickle 🥒 Functions

In [48]:
def pickle_model(model, file_name_suffix):
    model_path = f'./model/{file_name_suffix}.joblib'
    dump(model, model_path)
    print(f'Dumped model at: {model_path}')

In [45]:
def load_model(file_name_suffix):
    model_path = f'./model/{file_name_suffix}.joblib'
    model = load(model_path)
    return model

#### Timer

In [14]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

### TFID + RF

#### Load Model

In [None]:
grid_search1 = load_model('TFIDF_RF')

In [15]:
# rfc = RandomForestClassifier()
# vect = TfidfVectorizer(stop_words='english')

# pipe1 = Pipeline([('vect', vect), ('rfc', rfc)])

# parameters1 = {
#     'vect__max_df': (0.5, 0.75, 0.9, 0.95, 0.99),
#     'vect__min_df': (0.02, 0.05, 0.1, 0.15),
#     'vect__max_features': (100, 500, 1000),
#     'rfc__n_estimators': (100, 200, 300, 400, 500),
# }

# start_time = timer(None) # timing starts from this point for "start_time" variable
# grid_search1 = GridSearchCV(pipe1, parameters1, cv=5, n_jobs=-1, verbose=10)
# grid_search1.fit(train['description'], train['category'])
# timer(start_time) # timing ends here for "start_time" variable

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   33.2s
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:   36.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   


 Time taken: 0 hours 12 minutes and 36.15 seconds.


In [16]:
grid_search1.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=500, min_df=0.02,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [17]:
grid_search1.best_params_

{'rfc__n_estimators': 500,
 'vect__max_df': 0.5,
 'vect__max_features': 500,
 'vect__min_df': 0.02}

In [18]:
grid_search1.best_score_

0.8982985305491106

#### Save `submission.csv`

In [19]:
submission(grid_search1, 'TFIDF_RF')

File saved at: ./data/submission_TFIDF_RF.csv
     id  category
0   955         2
1  3532         3
2  1390         1
3  1024         1
4  1902         1


#### Pickle Model 🥒

In [49]:
pickle_model(grid_search1, 'TFIDF_RF')

Dumped model at: ./model/TFIDF_RF.joblib


### TFIDF + SGDC

#### Load Model

In [None]:
grid_search2 = load_model('TFIDF_SGDC')

In [22]:
# %%time

# sgdc = SGDClassifier()
# vect = TfidfVectorizer(stop_words='english')

# pipe2 = Pipeline([('vect', vect), 
#                   ('sgdc', sgdc)])

# parameters2 = {
#     'vect__max_df': (0.5, 0.75, 0.9, 0.95, 0.99),
#     'vect__min_df': (0.02, 0.05, 0.1, 0.15),
#     'vect__max_features': (100, 500, 1000),
#     'sgdc__max_iter': (300, 1000, 3000),
# }

# grid_search2 = GridSearchCV(pipe2, parameters2, cv=5, n_jobs=-1, verbose=10)
# grid_search2.fit(train['description'], train['category'])

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   

CPU times: user 6.48 s, sys: 597 ms, total: 7.08 s
Wall time: 3min 33s


In [23]:
grid_search2.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=1000, min_df=0.02,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [24]:
grid_search2.best_params_

{'sgdc__max_iter': 1000,
 'vect__max_df': 0.5,
 'vect__max_features': 1000,
 'vect__min_df': 0.02}

In [25]:
grid_search2.best_score_

0.9021655065738592

In [26]:
submission(grid_search2, 'TFIDF_SGDC')

File saved at: ./data/submission_TFIDF_SGDC.csv
     id  category
0   955         2
1  3532         2
2  1390         1
3  1024         1
4  1902         1


#### Pickle Model 🥒

In [51]:
pickle_model(grid_search2, 'TFIDF_SGDC')

Dumped model at: ./model/TFIDF_SGDC.joblib


### TFID + XGBC

#### Load Model

In [None]:
grid_search3 = load_model('TFIDF_XGBC')

In [27]:
# %%time 

# xgbc = xgb.XGBClassifier()
# vect = TfidfVectorizer(stop_words='english')
# pipe3 = Pipeline([('vect', vect), 
#                   ('xgbc', xgbc)])

# # parameters3 = {
# #         'vect__max_df': (0.5, 0.75, 0.95, 0.99, 1),
# #         'vect__min_df': (.02,),
# #         'xgbc__learning_rate': (0.01, 0.05, 0.1),
# #         'xgbc__n_estimators': (100, 500, 800, 1000),
# #         'xgbc__min_child_weight': [1, 5, 10],
# #         'xgbc__gamma': [0.5, 1, 1.5, 2, 5],
# #         'xgbc__subsample': [0.6, 0.8, 1.0],
# #         'xgbc__colsample_bytree': [0.6, 0.8, 1.0],
# #         'xgbc__max_depth': [3, 5, 10, 15, 20],
# #         'xgbc__booster':['booster', 'gblinear', 'gbtree']
# # }

# # parameters3 = {
# #         'vect__max_df': (0.5, 0.75, 0.95, 0.99, 1), 
# #         'vect__min_df': (.02,), 
# #         'xgbc__learning_rate': (0.01, 0.05, 0.1),
# #         'xgbc__n_estimators': (100, 500, 800, 1000),
# #         'xgbc__max_depth': [3, 5, 10, 15, 20]
# # }

# parameters3 = {
#         'vect__max_df': (0.7,), 
#         'vect__min_df': (.02,), 
#         'xgbc__n_estimators': (100, 400, 800),
# }

# grid_search3 = GridSearchCV(pipe3, parameters3, cv=5, n_jobs=-1, verbose=10)
# grid_search3.fit(train['description'], train['category'])

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:    4.6s remaining:   29.6s
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:    4.6s remaining:   12.8s
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:   14.7s remaining:   22.1s
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:   15.0s remaining:   13.1s
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:   19.7s remaining:    9.9s
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed:   28.3s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   32.3s finished


CPU times: user 16.9 s, sys: 15.9 ms, total: 17 s
Wall time: 49.1 s


In [28]:
grid_search3.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=0.02,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1))])

In [29]:
grid_search3.best_params_

{'vect__max_df': 0.7, 'vect__min_df': 0.02, 'xgbc__n_estimators': 800}

In [30]:
grid_search3.best_score_

0.9021655065738592

In [31]:
submission(grid_search3, 'TFIDF_XGBC')

File saved at: ./data/submission_TFIDF_XGBC.csv
     id  category
0   955         2
1  3532         3
2  1390         1
3  1024         1
4  1902         1


#### Pickle Model 🥒

In [52]:
pickle_model(grid_search3, 'TFIDF_XGBC')

Dumped model at: ./model/TFIDF_XGBC.joblib


### TFID + SVD + SGDC

In [32]:
vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), min_df=0.08)

sparse = vect.fit_transform(train['description'])

dtm = pd.DataFrame(sparse.todense(), columns=vect.get_feature_names())
print(dtm.shape)
dtm.head()

(2586, 885)


Unnamed: 0,000,000 bottles,10,10 year,10 year old,100,12,12 year,12 year old,15,15 year,15 year old,16,17,18,18 year,18 year old,20,2015,2016,21,25,30,30 year,30 year old,375,375 ml,40,45,46,50,500,60,70,80,add,added,adding,addition,additional,adds,age,age statement,aged,aged bourbon,aged years,aggressive,aging,ago,alcohol,allowing,allspice,almond,almonds,alongside,amber,amber color,american,american oak,anise,aniseed,antique,appears,appetizing,apple,apples,apricot,apricots,aroma,aromas,aromatic,arran,available,background,baked,baking,baking spices,balance,balanced,banana,barley,barrel,barrels,base,batch,beautiful,beautifully,bed,begins,berries,berry,best,better,big,bit,bitter,bitterness,black,black pepper,blackberry,...,sweeter,sweetness,syrup,taffy,takes,tangerine,tannic,tannins,tar,tarry,tart,taste,tasted,tastes,tea,teasing,texture,textured,thing,things,think,time,tinged,tinned,toasted,tobacco,toffee,tongue,touch,trace,traditional,travel,travel retail,travel retail exclusive,treacle,tropical,tropical fruit,tropical fruits,true,turn,turns,typical,ultimately,underlying,used,using,value,vanilla,vanilla caramel,vanilla cream,vanilla fudge,vanilla honey,variant,ve,version,vibrant,vintage,viscous,walnuts,want,warehouse,warm,warming,water,water brings,waxy,way,weight,wet,wheat,whiff,whiskey,whiskeys,whiskies,whisky,white,white chocolate,white pepper,wine,wine casks,wisp,wood,wood smoke,woody,work,world,worth,year,year old,year old expression,years,years old,yes,yields,young,younger,youth,youthful,zest,zesty
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.212281,0.233001,0.234217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.243818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.173912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.136108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119665,0.128763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170648,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.199707,0.0,0.0,0.0,0.0,0.0,0.0,0.20473,0.226971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.145004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.106727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158906,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.164057,0.088265,0.0,0.09185,0.0,0.0,0.0,0.0,0.0,0.157345,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137397,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.154506,0.10927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.201269,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.29023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.159277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Load Model

In [None]:
grid_search4 = load_model('TFIDF_SVD_SGDC')

In [35]:
# %%time

# vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))

# svd = TruncatedSVD(algorithm='randomized')

# sgdc = SGDClassifier(early_stopping=True)

# pipe4 = Pipeline([('vect', vect), ('svd', svd), ('sgdc', sgdc)])

# parameters4 = { 
#     'svd__n_iter': (10, 15, 20),
#     'svd__n_components': (100, 300, 500, 1000),
#     'sgdc__max_iter': (300, 1000, 3000)
# }


# grid_search4 = GridSearchCV(pipe4, parameters4, cv=5, n_jobs=-1, verbose=10)
# grid_search4.fit(train['description'], train['category'])

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   29.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 34.6min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 40.5min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 47.3min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 56.6min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 103.8min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 132.7min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 152.6min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 187.9min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed

CPU times: user 11min 17s, sys: 37.9 s, total: 11min 55s
Wall time: 4h 29min 35s


In [36]:
grid_search4.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [37]:
grid_search4.best_params_

{'sgdc__max_iter': 1000, 'svd__n_components': 1000, 'svd__n_iter': 20}

In [38]:
grid_search4.best_score_

0.9373549883990719

#### Save submission

In [39]:
submission(grid_search4, 'TFIDF_SVD_SGDC')

File saved at: ./data/submission_TFIDF_SVD_SGDC.csv
     id  category
0   955         2
1  3532         3
2  1390         4
3  1024         1
4  1902         1


#### Pickle Model 🥒

In [53]:
pickle_model(grid_search4, 'TFIDF_SVD_SGDC')

Dumped model at: ./model/TFIDF_SVD_SGDC.joblib


### TFID + ASVD + SGDC

#### Load Model

In [76]:
grid_search5 = load_model('TFIDF_ASVD_SGDC')

In [75]:
# %%time

# vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))

# svd = TruncatedSVD(algorithm='randomized')

# sgdc = SGDClassifier(average=True)

# pipe5 = Pipeline([('vect', vect), ('svd', svd), ('sgdc', sgdc)])

# parameters5 = { 
#     'svd__n_iter': (20,),
#     'svd__n_components': (1000,)
# }

# grid_search5 = GridSearchCV(pipe5, parameters5, cv=5, n_jobs=-1, verbose=10)
# grid_search5.fit(train['description'], train['category'])

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 11.6min remaining: 17.4min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 11.9min remaining:  7.9min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 15.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 15.5min finished


CPU times: user 10min 48s, sys: 37.7 s, total: 11min 26s
Wall time: 26min 53s




In [77]:
grid_search5.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [78]:
grid_search5.best_params_

{'svd__n_components': 1000, 'svd__n_iter': 20}

In [79]:
grid_search5.best_score_

0.9160866202629544

#### Save submission

In [80]:
submission(grid_search5, 'TFIDF_ASVD_SGDC')

File saved at: ./data/submission_TFIDF_ASVD_SGDC.csv
     id  category
0   955         2
1  3532         3
2  1390         1
3  1024         1
4  1902         1


#### Pickle Model 🥒

In [81]:
pickle_model(grid_search5, 'TFIDF_ASVD_SGDC')

Dumped model at: ./model/TFIDF_ASVD_SGDC.joblib


### TFIDF W/ STOPWORDS + SVD + SGDC

In [98]:
vect = TfidfVectorizer()

sparse = vect.fit_transform(train['description'].sample(50))

dtm = pd.DataFrame(sparse.todense(), columns=vect.get_feature_names())
print(dtm.shape)
dtm.head()

(50, 1110)


Unnamed: 0,10,12,13,15,16,18,19,1990,1996,21,25,30,337,34,375,45,450,46,53,65,72,75,87,95,99,aberlour,about,accompanied,according,accordingly,activity,add,added,adding,addition,adds,advertised,age,aged,aging,ago,alcohol,alike,all,allowing,allspice,almond,almonds,almost,along,alongside,also,amber,american,an,and,angel,angus,another,any,anytime,aperitif,appearance,appears,appetizing,apple,apples,approachable,appropriate,apricot,are,area,aroma,aromas,arran,array,arrival,arrives,as,assemble,assertive,assured,astringent,at,attempts,attic,attractive,back,background,backing,baked,baker,balance,balanced,balances,balmenach,balsamic,balvenie,barley,barnyard,...,trade,traditional,transitioning,treacle,trees,trick,trilogy,trio,tropical,true,two,types,ultimately,unbaked,unconventional,unctuous,under,underlying,undisclosed,unmistakable,up,us,used,utah,valley,value,vanilla,variety,ve,venison,version,very,vibrant,village,vinegar,vintage,vintages,viscous,wafer,warm,was,wash,water,waxiness,waxy,we,weight,welcome,well,were,west,what,wheel,when,where,which,whiff,while,whiskey,whiskeys,whiskies,whisky,white,who,why,wide,wild,will,wine,wines,winter,with,without,wonderful,wood,woodiness,woody,working,worth,would,woven,wrapped,wrote,ximenez,year,years,yeasty,yellow,yet,yield,yielding,yields,york,you,young,younger,youthful,yuzu,zest,zestiness
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.157564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.157564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.113727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.157564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.157564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116726,0.0,0.0,0.0,0.0,0.0,0.0,0.110995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.127606,0.0,0.0,0.0,0.141103,0.0,0.0,0.0,0.0,0.0,0.0,0.141103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.141103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.127606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141103,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109902,0.0,0.0,0.0,0.0,0.0,0.0,0.124093,0.0,0.0,0.0,0.0,0.142771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.124093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.148353,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.092023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.092023,0.0,0.0,0.0,0.0,0.0,0.148353
3,0.0,0.0,0.0,0.170848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170848,0.0,0.0,0.0,0.0,0.0,0.127132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170848,0.0,0.0,0.093536,0.0,0.0,0.0,0.0,0.121882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170848,0.088579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050149,0.170848,0.0,0.0,0.0,0.0,0.0,0.0,0.139954,0.0,0.0,0.0,0.0,0.0,0.117186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117186,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.077422,0.134114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121285,0.0,0.129069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.336548,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134114,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.106802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134114,0.0,0.0,0.0,0.0,0.0,0.0,0.083191,0.0,0.0,0.0,0.0,0.0,0.0


#### Load Model

In [118]:
grid_search6 = load_model('TFIDF_W_STOPWORDS_SVD_SGDC')

In [99]:
# vect = TfidfVectorizer()

# svd = TruncatedSVD(algorithm='randomized')

# sgdc = SGDClassifier(early_stopping=True)

# pipe6 = Pipeline([('vect', vect), ('svd', svd), ('sgdc', sgdc)])

# parameters6 = { 
#     'svd__n_iter': (5, 10, 15, 20),
#     'svd__n_components': (100, 300, 500, 1000),
#     'sgdc__max_iter': (1000,),
# }

# grid_search6 = GridSearchCV(pipe6, parameters6, cv=5, n_jobs=-1, verbose=10)
# start_time = timer(None)
# grid_search6.fit(train['description'], train['category'])
# timer(start_time)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   22.6s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   58.6s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done  74 out of  80 | elapsed:  7.2min remaining:   35.2s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  8.3min finished



 Time taken: 0 hours 9 minutes and 6.94 seconds.


In [100]:
grid_search6.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [101]:
grid_search6.best_params_

{'sgdc__max_iter': 1000, 'svd__n_components': 1000, 'svd__n_iter': 10}

In [102]:
grid_search6.best_score_

0.9319412219644239

#### Save submission

In [106]:
submission(grid_search6, 'TFIDF_W_STOPWORDS_SVD_SGDC')

File saved at: ./data/submission_TFIDF_W_STOPWORDS_SVD_SGDC.csv
     id  category
0   955         2
1  3532         2
2  1390         4
3  1024         1
4  1902         1


#### Pickle Model 🥒

In [105]:
pickle_model(grid_search6, 'TFIDF_W_STOPWORDS_SVD_SGDC')

Dumped model at: ./model/TFIDF_W_STOPWORDS_SVD_SGDC.joblib


### Custom Tokenizer (nouns) + TFIDF + SVD + SGDC

In [107]:
def tokenize(text):
    tokens = []
    doc = nlp(text)
    for chunk in doc.noun_chunks:
        tokens.append(chunk.lemma_)
    return tokens

In [108]:
text = train.loc[10, 'description']
tokenize(text)

['another excellent stagg',
 '-PRON- alcohol level',
 '-PRON-',
 'a good value',
 '-PRON-',
 '-PRON-',
 'this price',
 'note',
 'toffee',
 'pot',
 'nougat',
 'date',
 'tobacco',
 'roasted nut',
 'polished oak',
 'leather',
 'great depth',
 'a masculine bourbon',
 'character',
 'structure']

In [117]:
vect = TfidfVectorizer(tokenizer=tokenize, min_df=.02, max_df=.9)

sparse = vect.fit_transform(train['description'].sample(100))

dtm = pd.DataFrame(sparse.todense(), columns=vect.get_feature_names())
print(dtm.shape)
dtm.head()

(100, 171)


Unnamed: 0,( u.s,-PRON-,-PRON- aroma,-PRON- way,375 ml,50 %,a bit,a dry finish,a hint,a lovely example,a mix,a nice whisky,a nose,a secondary period,a suggestion,a touch,a whisky,almond,anise,aniseed,banana,black cherry,black currant,black pepper,black raspberry,bourbon,bourbon cask,bramble,brazil nut,bright fruit,brine,brown sugar,buffalo trace,butterscotch,caramel,cask,cask strength,char,chewy toffee,chili,chocolate fudge,cinnamon,citrus,citrus fruit,clove,cocktail cherry,cocoa,cocoa powder,coconut,coconut cream,corn,crème brûlée,currant,dark chocolate,date,dry spice,earth,editor 's choice,espresso,everything,fig,flavor,fresh fruit,fruit,ginger,golden raisin,gooseberry,grain,grass,hazelnut,heather honey,hint,honey,honeyed vanilla,i,just the hint,leather,length,licorice,licorice root,light,linseed,lot,malt,maple syrup,marshmallow,marzipan,mass,maturation,milk chocolate,milky coffee,nectarine,none,note,nougat,nutmeg,nutty toffee,oak,orange,orchard fruit,other,palate,part,peach,peat,pepper,pineapple,plain chocolate,plenty,plum,polished leather,polished oak,price,raisin,raspberry,ripe berry,rum,salt,seaweed,sherry,sherry cask,smoke,solvent,something,spice,strawberry,sultana,sweet fruit,sweet sherry,sweetness,syrup,tangerine,texture,the barrel,the bunch,the cask,the finish,the flavor,the heat,the impression,the long finish,the medium - length finish,the middle,the mouth,the nose,the palate,the same time,the second release,the series,the smoke,the tongue,the u.s,the whisky,the wood,this bourbon,this expression,this one,this whiskey,this whisky,time,tinned peach,tobacco,toffee,vanilla,vanilla fudge,virginia,walnut,water,what,whisky,white pepper
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.289048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42047,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.364828,0.0,0.0,0.0,0.0,0.364828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.449074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.202001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.202001,0.177629,0.0,0.0,0.0,0.0,0.0,0.0,0.398283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.680973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.680973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.269355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.137628,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.386616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.386616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.386616,0.0,0.0,0.0,0.0,0.243322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.386616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.386616,0.0,0.0,0.0,0.0,0.314087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.275396,0.0,0.0,0.0
3,0.0,0.16116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423885,0.0,0.0,0.0,0.0,0.322485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.383242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.203642,0.0,0.0,0.0,0.0,0.0,0.452721,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.342599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.353915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.465437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.465437,0.0,0.0,0.0,0.440877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
vect = TfidfVectorizer(tokenizer=tokenize, min_df=.02, max_df=.9)

svd = TruncatedSVD(algorithm='randomized')

sgdc = SGDClassifier(early_stopping=True)

pipe7 = Pipeline([('vect', vect), ('svd', svd), ('sgdc', sgdc)])

parameters7 = { 
    'svd__n_iter': (5, 10, 15),
    'svd__n_components': (30, 50, 70, 80, 90, 100, 150)
}

grid_search7 = GridSearchCV(pipe7, parameters7, cv=5, n_jobs=-1, verbose=10)
grid_search7.fit(train['description'], train['category'])

#### Load Model

In [None]:
# model = load_model('TFIDF_RF')

In [None]:
grid_search7.best_estimator_

In [None]:
grid_search7.best_params_

In [None]:
grid_search7.best_score_

#### Save submission

In [None]:
submission(grid_search7, 'CUSTOM_TOKENIZER_NOUNS_TFIDF_SVD_SGDC')

#### Pickle Model 🥒

In [None]:
pickle(grid_search7, 'CUSTOM_TOKENIZER_NOUNS_TFIDF_SVD_SGDC')

### Spacy Embeddings + SGDC

In [None]:
def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

train_desc_embeddings = get_word_vectors(train['description']) 
test_desc_embeddings = get_word_vectors(test['description']) 

In [None]:
sgdc = SGDClassifier(early_stopping=True)

In [None]:
submission(, 'CUSTOM_TOKENIZER_TFIDF_SVD_SGDC')

#### Save submission

In [None]:
submission(, 'CUSTOM_TOKENIZER_LEMMAS_TFIDF_SVD_SGDC')

#### Pickle Model 🥒

In [None]:
pickle(, 'CUSTOM_TOKENIZER_LEMMAS_TFIDF_SVD_SGDC')

### Majority Vote

In [None]:
# import pandas as pd

# # Filenames of your submissions you want to ensemble
# files = ['submission-01.csv', 'submission-02.csv', 'submission-03.csv']

# target = 'status_group'
# submissions = (pd.read_csv(file)[[target]] for file in files)
# ensemble = pd.concat(submissions, axis='columns')
# majority_vote = ensemble.mode(axis='columns')[0]

# sample_submission = pd.read_csv('sample_submission.csv')
# submission = sample_submission.copy()
# submission[target] = majority_vote
# submission.to_csv('my-ultimate-ensemble-submission.csv', index=False)

## Bayesian Optimization

In [82]:
# from bayes_opt import BayesianOptimization


# def sgdc_pipe(n_iter, n_components, max_iter):
    
#     vect = TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 3))
#     svd = TruncatedSVD(algorithm='randomized')
#     sgdc = SGDClassifier(early_stopping=True)

#     pipeline = Pipeline([('vect', vect), ('svd', svd), ('sgdc', sgdc)])
    
#     return pipeline


# def sgdc_cv():
#     scores = cross_val_score(pipeline, data, targets, scoring='auc')
# pbounds = { 
#     'svd__n_iter': (10, 20),
#     'svd__n_components': (100, 1000),
#     'sgdc__max_iter': (1000, 1500)
# }

