## Kaggle Competition

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
# ---------------- Pandas settings --------------- #
# Removes rows and columns truncation of '...'
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)


from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD

from joblib import dump, load

import xgboost as xgb
import spacy


nlp = spacy.load("en_core_web_lg")

## Load data

In [2]:
test = pd.read_csv('./data/test.csv')
train = pd.read_csv('./data/train.csv')

## Quick EDA

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2586 entries, 0 to 2585
Data columns (total 3 columns):
id             2586 non-null int64
description    2586 non-null object
category       2586 non-null int64
dtypes: int64(2), object(1)
memory usage: 60.7+ KB


In [4]:
train[train.duplicated()]

Unnamed: 0,id,description,category


In [5]:
train.isnull().sum()

id             0
description    0
category       0
dtype: int64

In [6]:
train.head()

Unnamed: 0,id,description,category
0,1,A marriage of 13 and 18 year old bourbons. A m...,2
1,2,There have been some legendary Bowmores from t...,1
2,3,This bottling celebrates master distiller Park...,2
3,4,What impresses me most is how this whisky evol...,1
4,9,"A caramel-laden fruit bouquet, followed by une...",2


In [7]:
def wrangle(df):
    df = df.copy()
    df['description'] = df['description'].str.lower().str.strip().str.replace(r"’", "'")
    return df

In [8]:
train = wrangle(train)
test = wrangle(test)

In [9]:
train.loc[0, 'description']

'a marriage of 13 and 18 year old bourbons. a mature yet very elegant whiskey, with a silky texture and so easy to embrace with a splash of water. balanced notes of honeyed vanilla, soft caramel, a basket of complex orchard fruit, blackberry, papaya, and a dusting of cocoa and nutmeg; smooth finish. sophisticated, stylish, with well-defined flavors. a classic!'

In [10]:
train.loc[10, 'description']

"another excellent stagg, and considering its alcohol level, it's also a good value if you can get it at this price. notes of toffee, pot still rum, nougat, dates, tobacco, roasted nuts, polished oak, and leather. great depth and nicely balanced. a masculine bourbon of character and structure."

In [11]:
import string
from spacy.lang.en.stop_words import STOP_WORDS

def tokenize(text):
    tokens = []
    doc = nlp(text)
    for token in doc:
        if (token.is_stop == False) and (token.is_punct == False or token.text not in string.punctuation):
            tokens.append(token.text)
    return tokens

In [12]:
text = train.loc[10, 'description']
tokenize(text)

['another',
 'excellent',
 'stagg',
 'and',
 'considering',
 'its',
 'alcohol',
 'level',
 'it',
 "'s",
 'also',
 'a',
 'good',
 'value',
 'if',
 'you',
 'can',
 'get',
 'it',
 'at',
 'this',
 'price',
 'notes',
 'of',
 'toffee',
 'pot',
 'still',
 'rum',
 'nougat',
 'dates',
 'tobacco',
 'roasted',
 'nuts',
 'polished',
 'oak',
 'and',
 'leather',
 'great',
 'depth',
 'and',
 'nicely',
 'balanced',
 'a',
 'masculine',
 'bourbon',
 'of',
 'character',
 'and',
 'structure']

#### Submission

In [13]:
def submission(model, file_name_suffix):
    # Predictions on test sample
    preds = model.predict(test['description'])
    
    # Convert predictions to dataframe
    submission = pd.DataFrame({'id': test['id'], 'category':preds})
    submission['category'] = submission['category'].astype(int)
    
    # Save your Submission File
    file_path = f'./data/submission_{file_name_suffix}.csv'
    submission.to_csv(file_path, index=False)
    print(f'File saved at: {file_path}')
    print(submission.head())

#### Pickle 🥒 Functions

In [14]:
def pickle_model(model, file_name_suffix):
    model_path = f'./model/{file_name_suffix}.joblib'
    dump(model, model_path)
    print(f'Dumped model at: {model_path}')

In [15]:
def load_model(file_name_suffix):
    model_path = f'./model/{file_name_suffix}.joblib'
    model = load(model_path)
    return model

#### Timer

In [16]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

### TFID + RF

#### Load Model

In [None]:
# grid_search1 = load_model('TFIDF_RF')

In [None]:
rfc = RandomForestClassifier()
vect = TfidfVectorizer(stop_words='english')

pipe1 = Pipeline([('vect', vect), ('rfc', rfc)])

parameters1 = {
    'vect__max_df': (0.5, 0.75, 0.9, 0.95, 0.99),
    'vect__min_df': (0.02, 0.05, 0.1, 0.15),
    'vect__max_features': (100, 500, 1000),
    'rfc__n_estimators': (100, 200, 300, 400, 500),
}

start_time = timer(None) # timing starts from this point for "start_time" variable
grid_search1 = GridSearchCV(pipe1, parameters1, cv=5, n_jobs=-1, verbose=10)
grid_search1.fit(train['description'], train['category'])
timer(start_time) # timing ends here for "start_time" variable

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   24.6s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   34.7s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   40.0s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   47.0s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   52.1s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  1

In [None]:
grid_search1.best_estimator_

In [None]:
grid_search1.best_params_

In [None]:
grid_search1.best_score_

#### Save `submission.csv`

In [None]:
submission(grid_search1, 'TFIDF_RF')

#### Pickle Model 🥒

In [None]:
pickle_model(grid_search1, 'TFIDF_RF')

### TFIDF + SGDC

#### Load Model

In [None]:
# grid_search2 = load_model('TFIDF_SGDC')

In [None]:
%%time

sgdc = SGDClassifier()
vect = TfidfVectorizer(stop_words='english')

pipe2 = Pipeline([('vect', vect), 
                  ('sgdc', sgdc)])

parameters2 = {
    'vect__max_df': (0.5, 0.75, 0.9, 0.95, 0.99),
    'vect__min_df': (0.02, 0.05, 0.1, 0.15),
    'vect__max_features': (100, 500, 1000),
    'sgdc__max_iter': (300, 1000),
}

grid_search2 = GridSearchCV(pipe2, parameters2, cv=5, n_jobs=-1, verbose=10)
grid_search2.fit(train['description'], train['category'])

In [None]:
grid_search2.best_estimator_

In [None]:
grid_search2.best_params_

In [None]:
grid_search2.best_score_

In [None]:
submission(grid_search2, 'TFIDF_SGDC')

#### Pickle Model 🥒

In [None]:
pickle_model(grid_search2, 'TFIDF_SGDC')

### TFID + XGBC

#### Load Model

In [None]:
# grid_search3 = load_model('TFIDF_XGBC')

In [None]:
%%time 

xgbc = xgb.XGBClassifier()
vect = TfidfVectorizer(stop_words='english')
pipe3 = Pipeline([('vect', vect), 
                  ('xgbc', xgbc)])

# parameters3 = {
#         'vect__max_df': (0.5, 0.75, 0.95, 0.99, 1),
#         'vect__min_df': (.02,),
#         'xgbc__learning_rate': (0.01, 0.05, 0.1),
#         'xgbc__n_estimators': (100, 500, 800, 1000),
#         'xgbc__min_child_weight': [1, 5, 10],
#         'xgbc__gamma': [0.5, 1, 1.5, 2, 5],
#         'xgbc__subsample': [0.6, 0.8, 1.0],
#         'xgbc__colsample_bytree': [0.6, 0.8, 1.0],
#         'xgbc__max_depth': [3, 5, 10, 15, 20],
#         'xgbc__booster':['booster', 'gblinear', 'gbtree']
# }

# parameters3 = {
#         'vect__max_df': (0.5, 0.75, 0.95, 0.99, 1), 
#         'vect__min_df': (.02,), 
#         'xgbc__learning_rate': (0.01, 0.05, 0.1),
#         'xgbc__n_estimators': (100, 500, 800, 1000),
#         'xgbc__max_depth': [3, 5, 10, 15, 20]
# }

parameters3 = {
        'vect__max_df': (0.7,), 
        'vect__min_df': (.02,), 
        'xgbc__n_estimators': (100, 400, 800),
}

grid_search3 = GridSearchCV(pipe3, parameters3, cv=5, n_jobs=-1, verbose=10)
grid_search3.fit(train['description'], train['category'])

In [None]:
grid_search3.best_estimator_

In [None]:
grid_search3.best_params_

In [None]:
grid_search3.best_score_

In [None]:
submission(grid_search3, 'TFIDF_XGBC')

#### Pickle Model 🥒

In [None]:
pickle_model(grid_search3, 'TFIDF_XGBC')

### TFID + SVD + SGDC

In [None]:
vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), min_df=0.08)

sparse = vect.fit_transform(train['description'])

dtm = pd.DataFrame(sparse.todense(), columns=vect.get_feature_names())
print(dtm.shape)
dtm.head()

#### Load Model

In [None]:
# grid_search4 = load_model('TFIDF_SVD_SGDC')

In [None]:
%%time

vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))

svd = TruncatedSVD(algorithm='randomized')

sgdc = SGDClassifier(early_stopping=True)

pipe4 = Pipeline([('vect', vect), ('svd', svd), ('sgdc', sgdc)])

parameters4 = { 
    'svd__n_iter': (10, 15, 20),
    'svd__n_components': (100, 300, 500, 1000),
    'sgdc__max_iter': (1000,)
}


grid_search4 = GridSearchCV(pipe4, parameters4, cv=5, n_jobs=-1, verbose=10)
grid_search4.fit(train['description'], train['category'])

In [None]:
grid_search4.best_estimator_

In [None]:
grid_search4.best_params_

In [None]:
grid_search4.best_score_

#### Save submission

In [None]:
submission(grid_search4, 'TFIDF_SVD_SGDC')

#### Pickle Model 🥒

In [None]:
pickle_model(grid_search4, 'TFIDF_SVD_SGDC')

### TFID + ASVD + SGDC

#### Load Model

In [None]:
grid_search5 = load_model('TFIDF_ASVD_SGDC')

In [None]:
%%time

vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))

svd = TruncatedSVD(algorithm='randomized')

sgdc = SGDClassifier(average=True)

pipe5 = Pipeline([('vect', vect), ('svd', svd), ('sgdc', sgdc)])

parameters5 = { 
    'svd__n_iter': (20,),
    'svd__n_components': (1000,)
}

grid_search5 = GridSearchCV(pipe5, parameters5, cv=5, n_jobs=-1, verbose=10)
grid_search5.fit(train['description'], train['category'])

In [None]:
grid_search5.best_estimator_

In [None]:
grid_search5.best_params_

In [None]:
grid_search5.best_score_

#### Save submission

In [None]:
submission(grid_search5, 'TFIDF_ASVD_SGDC')

#### Pickle Model 🥒

In [None]:
pickle_model(grid_search5, 'TFIDF_ASVD_SGDC')

### TFIDF W/ STOPWORDS + SVD + SGDC

In [None]:
vect = TfidfVectorizer()

sparse = vect.fit_transform(train['description'].sample(50))

dtm = pd.DataFrame(sparse.todense(), columns=vect.get_feature_names())
print(dtm.shape)
dtm.head()

#### Load Model

In [None]:
grid_search6 = load_model('TFIDF_W_STOPWORDS_SVD_SGDC')

In [99]:
# vect = TfidfVectorizer()

# svd = TruncatedSVD(algorithm='randomized')

# sgdc = SGDClassifier(early_stopping=True)

# pipe6 = Pipeline([('vect', vect), ('svd', svd), ('sgdc', sgdc)])

# parameters6 = { 
#     'svd__n_iter': (5, 10, 15, 20),
#     'svd__n_components': (100, 300, 500, 1000),
#     'sgdc__max_iter': (1000,),
# }

# grid_search6 = GridSearchCV(pipe6, parameters6, cv=5, n_jobs=-1, verbose=10)
# start_time = timer(None)
# grid_search6.fit(train['description'], train['category'])
# timer(start_time)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   22.6s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   58.6s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done  74 out of  80 | elapsed:  7.2min remaining:   35.2s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  8.3min finished



 Time taken: 0 hours 9 minutes and 6.94 seconds.


In [100]:
grid_search6.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [101]:
grid_search6.best_params_

{'sgdc__max_iter': 1000, 'svd__n_components': 1000, 'svd__n_iter': 10}

In [102]:
grid_search6.best_score_

0.9319412219644239

#### Save submission

In [106]:
submission(grid_search6, 'TFIDF_W_STOPWORDS_SVD_SGDC')

File saved at: ./data/submission_TFIDF_W_STOPWORDS_SVD_SGDC.csv
     id  category
0   955         2
1  3532         2
2  1390         4
3  1024         1
4  1902         1


#### Pickle Model 🥒

In [105]:
pickle_model(grid_search6, 'TFIDF_W_STOPWORDS_SVD_SGDC')

Dumped model at: ./model/TFIDF_W_STOPWORDS_SVD_SGDC.joblib


### Custom Tokenizer (nouns) + TFIDF + SVD + SGDC

In [107]:
def tokenize(text):
    tokens = []
    doc = nlp(text)
    for chunk in doc.noun_chunks:
        tokens.append(chunk.lemma_)
    return tokens

In [108]:
text = train.loc[10, 'description']
tokenize(text)

['another excellent stagg',
 '-PRON- alcohol level',
 '-PRON-',
 'a good value',
 '-PRON-',
 '-PRON-',
 'this price',
 'note',
 'toffee',
 'pot',
 'nougat',
 'date',
 'tobacco',
 'roasted nut',
 'polished oak',
 'leather',
 'great depth',
 'a masculine bourbon',
 'character',
 'structure']

In [117]:
vect = TfidfVectorizer(tokenizer=tokenize, min_df=.02, max_df=.9)

sparse = vect.fit_transform(train['description'].sample(100))

dtm = pd.DataFrame(sparse.todense(), columns=vect.get_feature_names())
print(dtm.shape)
dtm.head()

(100, 171)


Unnamed: 0,( u.s,-PRON-,-PRON- aroma,-PRON- way,375 ml,50 %,a bit,a dry finish,a hint,a lovely example,a mix,a nice whisky,a nose,a secondary period,a suggestion,a touch,a whisky,almond,anise,aniseed,banana,black cherry,black currant,black pepper,black raspberry,bourbon,bourbon cask,bramble,brazil nut,bright fruit,brine,brown sugar,buffalo trace,butterscotch,caramel,cask,cask strength,char,chewy toffee,chili,chocolate fudge,cinnamon,citrus,citrus fruit,clove,cocktail cherry,cocoa,cocoa powder,coconut,coconut cream,corn,crème brûlée,currant,dark chocolate,date,dry spice,earth,editor 's choice,espresso,everything,fig,flavor,fresh fruit,fruit,ginger,golden raisin,gooseberry,grain,grass,hazelnut,heather honey,hint,honey,honeyed vanilla,i,just the hint,leather,length,licorice,licorice root,light,linseed,lot,malt,maple syrup,marshmallow,marzipan,mass,maturation,milk chocolate,milky coffee,nectarine,none,note,nougat,nutmeg,nutty toffee,oak,orange,orchard fruit,other,palate,part,peach,peat,pepper,pineapple,plain chocolate,plenty,plum,polished leather,polished oak,price,raisin,raspberry,ripe berry,rum,salt,seaweed,sherry,sherry cask,smoke,solvent,something,spice,strawberry,sultana,sweet fruit,sweet sherry,sweetness,syrup,tangerine,texture,the barrel,the bunch,the cask,the finish,the flavor,the heat,the impression,the long finish,the medium - length finish,the middle,the mouth,the nose,the palate,the same time,the second release,the series,the smoke,the tongue,the u.s,the whisky,the wood,this bourbon,this expression,this one,this whiskey,this whisky,time,tinned peach,tobacco,toffee,vanilla,vanilla fudge,virginia,walnut,water,what,whisky,white pepper
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.289048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42047,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.364828,0.0,0.0,0.0,0.0,0.364828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.449074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.202001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.202001,0.177629,0.0,0.0,0.0,0.0,0.0,0.0,0.398283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.680973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.680973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.269355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.137628,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.386616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.386616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.386616,0.0,0.0,0.0,0.0,0.243322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.386616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.386616,0.0,0.0,0.0,0.0,0.314087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.275396,0.0,0.0,0.0
3,0.0,0.16116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423885,0.0,0.0,0.0,0.0,0.322485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.383242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.203642,0.0,0.0,0.0,0.0,0.0,0.452721,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.342599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.353915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.465437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.465437,0.0,0.0,0.0,0.440877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
vect = TfidfVectorizer(tokenizer=tokenize, min_df=.02, max_df=.9)

svd = TruncatedSVD(algorithm='randomized')

sgdc = SGDClassifier(early_stopping=True)

pipe7 = Pipeline([('vect', vect), ('svd', svd), ('sgdc', sgdc)])

parameters7 = { 
    'svd__n_iter': (5, 10, 15),
    'svd__n_components': (30, 50, 70, 80, 90, 100, 150)
}

grid_search7 = GridSearchCV(pipe7, parameters7, cv=5, n_jobs=-1, verbose=10)
grid_search7.fit(train['description'], train['category'])

#### Load Model

In [None]:
# model = load_model('TFIDF_RF')

In [None]:
grid_search7.best_estimator_

In [None]:
grid_search7.best_params_

In [None]:
grid_search7.best_score_

#### Save submission

In [None]:
submission(grid_search7, 'CUSTOM_TOKENIZER_NOUNS_TFIDF_SVD_SGDC')

#### Pickle Model 🥒

In [None]:
pickle(grid_search7, 'CUSTOM_TOKENIZER_NOUNS_TFIDF_SVD_SGDC')

### Spacy Embeddings + SGDC

In [None]:
def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

train_desc_embeddings = get_word_vectors(train['description']) 
test_desc_embeddings = get_word_vectors(test['description']) 

In [None]:
sgdc = SGDClassifier(early_stopping=True)

In [None]:
submission(, 'CUSTOM_TOKENIZER_TFIDF_SVD_SGDC')

#### Save submission

In [None]:
submission(, 'CUSTOM_TOKENIZER_LEMMAS_TFIDF_SVD_SGDC')

#### Pickle Model 🥒

In [None]:
pickle(, 'CUSTOM_TOKENIZER_LEMMAS_TFIDF_SVD_SGDC')

### Majority Vote

In [None]:
# import pandas as pd

# # Filenames of your submissions you want to ensemble
# files = ['submission-01.csv', 'submission-02.csv', 'submission-03.csv']

# target = 'status_group'
# submissions = (pd.read_csv(file)[[target]] for file in files)
# ensemble = pd.concat(submissions, axis='columns')
# majority_vote = ensemble.mode(axis='columns')[0]

# sample_submission = pd.read_csv('sample_submission.csv')
# submission = sample_submission.copy()
# submission[target] = majority_vote
# submission.to_csv('my-ultimate-ensemble-submission.csv', index=False)

## Bayesian Optimization

In [82]:
# from bayes_opt import BayesianOptimization


# def sgdc_pipe(n_iter, n_components, max_iter):
    
#     vect = TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 3))
#     svd = TruncatedSVD(algorithm='randomized')
#     sgdc = SGDClassifier(early_stopping=True)

#     pipeline = Pipeline([('vect', vect), ('svd', svd), ('sgdc', sgdc)])
    
#     return pipeline


# def sgdc_cv():
#     scores = cross_val_score(pipeline, data, targets, scoring='auc')
# pbounds = { 
#     'svd__n_iter': (10, 20),
#     'svd__n_components': (100, 1000),
#     'sgdc__max_iter': (1000, 1500)
# }

