## Kaggle Competition

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
# ---------------- Pandas settings --------------- #
# Removes rows and columns truncation of '...'
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD

from joblib import dump, load

import xgboost as xgb
import spacy


nlp = spacy.load("en_core_web_lg")

## Load data

In [3]:
test = pd.read_csv('./data/test.csv')
train = pd.read_csv('./data/train.csv')

## Quick EDA

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2586 entries, 0 to 2585
Data columns (total 3 columns):
id             2586 non-null int64
description    2586 non-null object
category       2586 non-null int64
dtypes: int64(2), object(1)
memory usage: 60.7+ KB


In [5]:
train[train.duplicated()]

Unnamed: 0,id,description,category


In [6]:
train.isnull().sum()

id             0
description    0
category       0
dtype: int64

In [7]:
train.head()

Unnamed: 0,id,description,category
0,1,A marriage of 13 and 18 year old bourbons. A m...,2
1,2,There have been some legendary Bowmores from t...,1
2,3,This bottling celebrates master distiller Park...,2
3,4,What impresses me most is how this whisky evol...,1
4,9,"A caramel-laden fruit bouquet, followed by une...",2


In [8]:
def wrangle(df):
    """Return a cleaned copy of ``df`` with a normalised ``description`` column.

    The input frame is not modified. Each description is lower-cased,
    stripped of leading/trailing whitespace, and has the typographic
    apostrophe (’) replaced by the ASCII apostrophe (').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string ``description`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the cleaned ``description`` column.
    """
    df = df.copy()
    df['description'] = (
        df['description']
        .str.lower()
        .str.strip()
        # regex=False: this is a literal character swap. Being explicit avoids
        # relying on the default of `regex`, which differs between pandas versions.
        .str.replace("’", "'", regex=False)
    )
    return df

In [9]:
train = wrangle(train)
test = wrangle(test)

In [10]:
train.loc[0, 'description']

'a marriage of 13 and 18 year old bourbons. a mature yet very elegant whiskey, with a silky texture and so easy to embrace with a splash of water. balanced notes of honeyed vanilla, soft caramel, a basket of complex orchard fruit, blackberry, papaya, and a dusting of cocoa and nutmeg; smooth finish. sophisticated, stylish, with well-defined flavors. a classic!'

In [11]:
train.loc[10, 'description']

"another excellent stagg, and considering its alcohol level, it's also a good value if you can get it at this price. notes of toffee, pot still rum, nougat, dates, tobacco, roasted nuts, polished oak, and leather. great depth and nicely balanced. a masculine bourbon of character and structure."

In [12]:
import string
from spacy.lang.en.stop_words import STOP_WORDS

def tokenize(text):
    """Split ``text`` into spaCy tokens, dropping stop words and punctuation.

    A token is kept only when all of the following hold:

    * spaCy does not flag it as a stop word (``token.is_stop``),
    * spaCy does not flag it as punctuation (``token.is_punct``),
    * its text does not occur within ``string.punctuation``.

    The previous condition joined the two punctuation checks with ``or``,
    so a punctuation token was only dropped when *both* checks agreed;
    tokens such as "..." or non-ASCII dashes/quotes slipped through.

    Parameters
    ----------
    text : str
        Raw document text, parsed with the notebook-level ``nlp`` pipeline.

    Returns
    -------
    list of str
        The surface text of every retained token, in document order.
    """
    return [
        token.text
        for token in nlp(text)
        if not token.is_stop
        and not token.is_punct
        and token.text not in string.punctuation
    ]

In [13]:
text = train.loc[10, 'description']
tokenize(text)

['excellent',
 'stagg',
 'considering',
 'alcohol',
 'level',
 'good',
 'value',
 'price',
 'notes',
 'toffee',
 'pot',
 'rum',
 'nougat',
 'dates',
 'tobacco',
 'roasted',
 'nuts',
 'polished',
 'oak',
 'leather',
 'great',
 'depth',
 'nicely',
 'balanced',
 'masculine',
 'bourbon',
 'character',
 'structure']

#### Submission

In [18]:
def submission(model, file_name_suffix, data=None):
    """Predict categories with ``model`` and write a Kaggle submission CSV.

    Parameters
    ----------
    model : object
        Anything exposing ``predict`` that accepts the ``description`` column.
    file_name_suffix : str
        Suffix used in the output name ``./data/submission_<suffix>.csv``.
    data : pandas.DataFrame, optional
        Frame with ``id`` and ``description`` columns to predict on.
        Defaults to the notebook-level ``test`` frame, which preserves the
        original two-argument behaviour.

    Returns
    -------
    None
        The CSV is written to disk; its path and first rows are printed.
    """
    # Fall back to the notebook-level `test` frame only when no frame is given,
    # so the function can also be used on hold-out splits.
    frame = test if data is None else data

    # Predictions on the chosen sample
    preds = model.predict(frame['description'])

    # Build the submission frame (named so it does not shadow this function)
    submission_df = pd.DataFrame({'id': frame['id'], 'category': preds})
    submission_df['category'] = submission_df['category'].astype(int)

    # Save the submission file
    file_path = f'./data/submission_{file_name_suffix}.csv'
    submission_df.to_csv(file_path, index=False)
    print(f'File saved at: {file_path}')
    print(submission_df.head())

#### Pickle 🥒 Functions

In [19]:
def pickle_model(model, file_name_suffix):
    """Persist ``model`` with joblib under ``./model/<file_name_suffix>.joblib``.

    Prints the destination path once the dump has completed.
    """
    destination = './model/{}.joblib'.format(file_name_suffix)
    dump(model, destination)
    print('Dumped model at: {}'.format(destination))

In [20]:
def load_model(file_name_suffix):
    """Load and return the joblib artefact ``./model/<file_name_suffix>.joblib``."""
    # NOTE: joblib's load unpickles arbitrary objects; only use it on files you trust.
    return load('./model/{}.joblib'.format(file_name_suffix))

#### Timer

In [21]:
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since ``start_time``.

    Called without a (truthy) ``start_time`` it returns ``datetime.now()``.
    Called with a start time it prints the elapsed hours, minutes and
    seconds and returns ``None``.
    """
    # No start time supplied: begin timing and hand the timestamp back.
    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))

### TFIDF + RF

#### Load Model

In [22]:
grid_search1 = load_model('TFIDF_RF')

In [23]:
# rfc = RandomForestClassifier()
# vect = TfidfVectorizer(stop_words='english')

# pipe1 = Pipeline([('vect', vect), ('rfc', rfc)])

# parameters1 = {
#     'vect__max_df': (0.5, 0.75, 0.9, 0.95, 0.99),
#     'vect__min_df': (0.02, 0.05, 0.1, 0.15),
#     'vect__max_features': (100, 500, 1000),
#     'rfc__n_estimators': (100, 200, 300, 400, 500),
# }

# start_time = timer(None) # timing starts from this point for "start_time" variable
# grid_search1 = GridSearchCV(pipe1, parameters1, cv=5, n_jobs=-1, verbose=10)
# grid_search1.fit(train['description'], train['category'])
# timer(start_time) # timing ends here for "start_time" variable

In [24]:
grid_search1.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=500, min_df=0.02,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [25]:
grid_search1.best_params_

{'rfc__n_estimators': 500,
 'vect__max_df': 0.5,
 'vect__max_features': 500,
 'vect__min_df': 0.02}

In [26]:
grid_search1.best_score_

0.8979118329466357

#### Save `submission.csv`

In [27]:
# submission(grid_search1, 'TFIDF_RF')

#### Pickle Model 🥒

In [28]:
# pickle_model(grid_search1, 'TFIDF_RF')

### TFIDF + SGDC

#### Load Model

In [29]:
grid_search2 = load_model('TFIDF_SGDC')

In [30]:
# sgdc = SGDClassifier()
# vect = TfidfVectorizer(stop_words='english')

# pipe2 = Pipeline([('vect', vect), 
#                   ('sgdc', sgdc)])

# parameters2 = {
#     'vect__max_df': (0.5, 0.75, 0.9, 0.95, 0.99),
#     'vect__min_df': (0.02, 0.05, 0.1, 0.15),
#     'vect__max_features': (100, 500, 1000),
#     'sgdc__max_iter': (300, 1000),
# }
# start_time = timer(None)
# grid_search2 = GridSearchCV(pipe2, parameters2, cv=5, n_jobs=-1, verbose=10)
# grid_search2.fit(train['description'], train['category'])
# timer(start_time)

In [31]:
grid_search2.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=500, min_df=0.02,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [32]:
grid_search2.best_params_

{'sgdc__max_iter': 1000,
 'vect__max_df': 0.5,
 'vect__max_features': 500,
 'vect__min_df': 0.02}

In [33]:
grid_search2.best_score_

0.9025522041763341

In [34]:
# submission(grid_search2, 'TFIDF_SGDC')

#### Pickle Model 🥒

In [35]:
# pickle_model(grid_search2, 'TFIDF_SGDC')

### TFIDF + XGBC

#### Load Model

In [36]:
grid_search3 = load_model('TFIDF_XGBC')

In [37]:
# %%time 

# xgbc = xgb.XGBClassifier()
# vect = TfidfVectorizer(stop_words='english')
# pipe3 = Pipeline([('vect', vect), 
#                   ('xgbc', xgbc)])

# # parameters3 = {
# #         'vect__max_df': (0.5, 0.75, 0.95, 0.99, 1),
# #         'vect__min_df': (.02,),
# #         'xgbc__learning_rate': (0.01, 0.05, 0.1),
# #         'xgbc__n_estimators': (100, 500, 800, 1000),
# #         'xgbc__min_child_weight': [1, 5, 10],
# #         'xgbc__gamma': [0.5, 1, 1.5, 2, 5],
# #         'xgbc__subsample': [0.6, 0.8, 1.0],
# #         'xgbc__colsample_bytree': [0.6, 0.8, 1.0],
# #         'xgbc__max_depth': [3, 5, 10, 15, 20],
# #         'xgbc__booster':['dart', 'gblinear', 'gbtree']
# # }

# # parameters3 = {
# #         'vect__max_df': (0.5, 0.75, 0.95, 0.99, 1), 
# #         'vect__min_df': (.02,), 
# #         'xgbc__learning_rate': (0.01, 0.05, 0.1),
# #         'xgbc__n_estimators': (100, 500, 800, 1000),
# #         'xgbc__max_depth': [3, 5, 10, 15, 20]
# # }

# parameters3 = {
#         'vect__max_df': (0.7,), 
#         'vect__min_df': (.02,), 
#         'xgbc__n_estimators': (100, 400, 800),
# }

# grid_search3 = GridSearchCV(pipe3, parameters3, cv=5, n_jobs=-1, verbose=10)
# grid_search3.fit(train['description'], train['category'])

In [38]:
grid_search3.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=0.02,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1))])

In [39]:
grid_search3.best_params_

{'vect__max_df': 0.7, 'vect__min_df': 0.02, 'xgbc__n_estimators': 800}

In [40]:
grid_search3.best_score_

0.9021655065738592

In [41]:
# submission(grid_search3, 'TFIDF_XGBC')

#### Pickle Model 🥒

In [42]:
# pickle_model(grid_search3, 'TFIDF_XGBC')

### TFIDF + SVD + SGDC

In [43]:
# Preview the TF-IDF document-term matrix for 1- to 3-grams that appear in at
# least 8% of the training descriptions (English stop words removed).
vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), min_df=0.08)

sparse = vect.fit_transform(train['description'])

# get_feature_names() no longer exists in recent scikit-learn releases;
# prefer get_feature_names_out() when available and fall back otherwise.
feature_names = (
    vect.get_feature_names_out()
    if hasattr(vect, 'get_feature_names_out')
    else vect.get_feature_names()
)

# toarray() gives a plain ndarray instead of the discouraged np.matrix from todense().
dtm = pd.DataFrame(sparse.toarray(), columns=feature_names)
print(dtm.shape)
dtm.head()

(2586, 80)


Unnamed: 0,age,aged,balanced,barrel,black,bottles,bottling,bourbon,caramel,cask,casks,chocolate,cinnamon,citrus,clean,color,complex,corn,creamy,dark,distilled,distillery,dried,dry,expression,finish,flavors,fresh,fruit,fruits,fruity,ginger,good,hint,honey,hot,just,leather,light,like,long,malt,matured,medium,mouth,new,nicely,nose,notes,oak,old,orange,palate,peat,pepper,quite,release,rich,ripe,rye,sherry,single,slightly,smoke,soft,spice,spices,spicy,subtle,sweet,sweetness,toffee,vanilla,water,whiskey,whisky,wood,year,year old,years
0,0.0,0.0,0.327069,0.0,0.0,0.0,0.0,0.0,0.252358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.145307,0.313372,0.0,0.205946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.187799,0.0,0.241391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.284168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.190568,0.312701,0.302246,0.0,0.0,0.265732,0.285935,0.0
1,0.0,0.0,0.0,0.0,0.0,0.43511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.402315,0.0,0.0,0.0,0.0,0.287176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.40981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.231059,0.446193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.237351,0.0,0.0,0.230929,0.177876,0.0,0.0,0.0,0.0,0.180318,0.208739,0.0,0.0,0.233304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.099752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.215127,0.0,0.189318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.131626,0.331425,0.218441,0.113753,0.0,0.0,0.213529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.130823,0.0,0.414978,0.0,0.0,0.364846,0.196292,0.204265
3,0.0,0.0,0.0,0.0,0.2508,0.0,0.0,0.0,0.0,0.0,0.0,0.19648,0.0,0.0,0.0,0.0,0.259054,0.0,0.0,0.240951,0.0,0.0,0.0,0.0,0.0,0.0,0.238871,0.0,0.156984,0.0,0.0,0.0,0.0,0.20546,0.0,0.0,0.0,0.238871,0.0,0.0,0.210038,0.0,0.0,0.0,0.0,0.0,0.0,0.138895,0.143151,0.146154,0.0,0.0,0.252615,0.0,0.0,0.0,0.0,0.0,0.243363,0.0,0.0,0.0,0.0,0.208824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.199457,0.0,0.0,0.0,0.529775,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.308937,0.0,0.0,0.31555,0.321557,0.0,0.0,0.0,0.416045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177885,0.0,0.0,0.252119,0.0,0.0,0.0,0.0,0.0,0.343368,0.0,0.0,0.38363,0.0,0.0,0.337324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233294,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Load Model

In [44]:
grid_search4 = load_model('TFIDF_SVD_SGDC')

In [45]:
# %%time

# vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))

# svd = TruncatedSVD(algorithm='randomized')

# sgdc = SGDClassifier(early_stopping=True)

# pipe4 = Pipeline([('vect', vect), ('svd', svd), ('sgdc', sgdc)])

# parameters4 = { 
#     'svd__n_iter': (10, 15, 20),
#     'svd__n_components': (100, 300, 500, 1000),
#     'sgdc__max_iter': (1000,)
# }


# grid_search4 = GridSearchCV(pipe4, parameters4, cv=5, n_jobs=-1, verbose=10)
# grid_search4.fit(train['description'], train['category'])

In [46]:
grid_search4.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [47]:
grid_search4.best_params_

{'sgdc__max_iter': 1000, 'svd__n_components': 500, 'svd__n_iter': 20}

In [48]:
grid_search4.best_score_

0.9381283836040216

#### Save submission

In [49]:
# submission(grid_search4, 'TFIDF_SVD_SGDC')

#### Pickle Model 🥒

In [50]:
# pickle_model(grid_search4, 'TFIDF_SVD_SGDC')

### TFIDF + ASVD + SGDC

#### Load Model

In [51]:
grid_search5 = load_model('TFIDF_ASVD_SGDC')

In [52]:
# %%time

# vect = TfidfVectorizer(stop_words='english')

# svd = TruncatedSVD(algorithm='randomized')

# sgdc = SGDClassifier(average=True)

# pipe5 = Pipeline([('vect', vect), ('svd', svd), ('sgdc', sgdc)])

# parameters5 = { 
#     'svd__n_iter': (20,),
#     'svd__n_components': (1000,)
# }

# grid_search5 = GridSearchCV(pipe5, parameters5, cv=5, n_jobs=-1, verbose=10)
# grid_search5.fit(train['description'], train['category'])

In [53]:
grid_search5.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [54]:
grid_search5.best_params_

{'svd__n_components': 1000, 'svd__n_iter': 20}

In [55]:
grid_search5.best_score_

0.9191802010827533

#### Save submission

In [56]:
# submission(grid_search5, 'TFIDF_ASVD_SGDC')

#### Pickle Model 🥒

In [58]:
# pickle_model(grid_search5, 'TFIDF_ASVD_SGDC')

### TFIDF W/ STOPWORDS + SVD + SGDC

In [59]:
# Preview the TF-IDF vocabulary (stop words kept) on a small sample of the
# training descriptions.
vect = TfidfVectorizer()

# Seed the sample so the displayed vocabulary is reproducible across runs.
sparse = vect.fit_transform(train['description'].sample(50, random_state=42))

# get_feature_names() no longer exists in recent scikit-learn releases;
# prefer get_feature_names_out() when available and fall back otherwise.
feature_names = (
    vect.get_feature_names_out()
    if hasattr(vect, 'get_feature_names_out')
    else vect.get_feature_names()
)

# toarray() gives a plain ndarray instead of the discouraged np.matrix from todense().
dtm = pd.DataFrame(sparse.toarray(), columns=feature_names)
print(dtm.shape)
dtm.head()

(50, 1252)


Unnamed: 0,10,100,12,120,131,16,176,18,19,1972,1974,1996,1997,20,2003,2007,21,25,26,263,288,30,33,3524,375,375ml,39,46,55,60,69,70,72,80,80s,85,89,90,93,about,above,abv,accented,acceptable,according,add,added,addition,additional,after,again,age,aged,aging,airiness,akin,alberta,alcohol,ale,all,allspice,almonds,almost,along,alongside,also,amazing,amber,american,amounts,an,and,anise,aniseed,annual,another,anti,antique,any,anyone,apart,apparent,appearing,apple,apples,applewood,approach,approachable,apricot,archetypal,arcs,ardbeg,are,armchair,aroma,aromas,aromatic,around,arran,art,...,uk,ultimately,ultra,underfoot,underlying,understated,understatement,unlike,unpeated,unsalted,unseeded,until,unusual,unusually,unwanted,up,us,use,used,vague,valhalla,validates,valley,value,vanilla,variant,variants,varnish,veritable,version,very,vetiver,vibrancy,vin,vineyard,vintage,viscous,vol,wafer,walls,want,warm,warming,was,washed,watching,water,watering,waxed,way,we,weight,weighty,welcome,well,were,werther,west,what,wheat,when,where,whereas,which,whiff,while,whiskey,whiskies,whisky,white,whittled,whole,wholly,whose,will,willie,wine,wisp,wisps,with,wonderful,wood,woodpile,work,worthwhile,would,year,years,yes,yet,yielded,you,young,younger,your,youth,youthful,zest,zesty,zing
0,0.144304,0.0,0.0,0.0,0.0,0.0,0.0,0.125074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.159568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.159568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.092116,0.112937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095392,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.159568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144304,0.066022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.113775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.145153,0.0,0.0,0.0,0.145153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13698,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037778,0.0,0.0,0.0,0.0,0.0,0.0,0.293039,0.0,0.0,0.131268,0.0,0.0,0.0,0.0,0.0,0.0,0.107531,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.152194,0.0,0.0,0.152194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.152194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.107212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.287248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137635,0.0,...,0.0,0.152194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118833,0.0,0.0,0.0,0.0,0.0,0.0,0.102418,0.107212,0.0,0.0,0.152194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.126328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.153868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151025,0.0,0.0,0.111881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.153043,0.12974,0.0,0.0,0.108524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.108524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.080477,0.0,0.0,0.0,0.108524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091395,0.0,0.0,0.0,0.0,0.325571,0.0,0.108524,0.0,0.096113,0.0,0.0,0.0


#### Load Model

In [60]:
grid_search6 = load_model('TFIDF_W_STOPWORDS_SVD_SGDC')

In [61]:
# vect = TfidfVectorizer()

# svd = TruncatedSVD(algorithm='randomized')

# sgdc = SGDClassifier(early_stopping=True)

# pipe6 = Pipeline([('vect', vect), ('svd', svd), ('sgdc', sgdc)])

# parameters6 = { 
#     'svd__n_iter': (5, 10, 15, 20),
#     'svd__n_components': (100, 300, 500, 1000),
#     'sgdc__max_iter': (1000,),
# }

# grid_search6 = GridSearchCV(pipe6, parameters6, cv=5, n_jobs=-1, verbose=10)
# start_time = timer(None)
# grid_search6.fit(train['description'], train['category'])
# timer(start_time)

In [62]:
grid_search6.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [63]:
grid_search6.best_params_

{'sgdc__max_iter': 1000, 'svd__n_components': 500, 'svd__n_iter': 10}

In [64]:
grid_search6.best_score_

0.934261407579273

#### Save submission

In [65]:
# submission(grid_search6, 'TFIDF_W_STOPWORDS_SVD_SGDC')

#### Pickle Model 🥒

In [66]:
# pickle_model(grid_search6, 'TFIDF_W_STOPWORDS_SVD_SGDC')

### TFIDF W/ STOPWORDS + SVD + SGDC OPTIMIZED

In [67]:
grid_search7 = load_model('TFIDF_W_STOPWORDS_SVD_SGDC_OPTIMIZED')

In [68]:
# vect = TfidfVectorizer()

# svd = TruncatedSVD(algorithm='randomized', n_iter=15, n_components=1000)

# sgdc = SGDClassifier(early_stopping=True, learning_rate='optimal')

# pipe7 = Pipeline([('vect', vect), ('svd', svd), ('sgdc', sgdc)])

# Search grid for pipe7. Named `parameters7` to match the GridSearchCV call
# below (and the parametersN convention of the earlier sections); the previous
# name `parameters` would raise NameError once that call is re-enabled.
# Keys follow the Pipeline step names: `svd__*` -> TruncatedSVD, `sgdc__*` -> SGDClassifier.
parameters7 = {
    'svd__n_iter': (15,),
    'svd__n_components': (500,),
    'sgdc__class_weight': ('balanced',),
    'sgdc__loss': ('hinge', 'log'),
    'sgdc__alpha': (0.0007,),
    'sgdc__tol': (.0015,),
    'sgdc__average': (True, False),
    'sgdc__penalty': ('l2', 'l1', 'elasticnet')
}

# grid_search7 = GridSearchCV(pipe7, parameters7, cv=5, n_jobs=-1, verbose=10)
# start_time = timer(None)
# grid_search7.fit(train['description'], train['category'])
# timer(start_time)

In [69]:
pd.DataFrame(grid_search7.cv_results_)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sgdc__alpha,param_sgdc__average,param_sgdc__class_weight,param_sgdc__loss,param_sgdc__penalty,param_sgdc__tol,param_svd__n_components,param_svd__n_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,17.312605,0.187871,0.134886,0.029705,0.0007,True,balanced,hinge,l2,0.0015,500,15,"{'sgdc__alpha': 0.0007, 'sgdc__average': True,...",0.942085,0.936293,0.947776,0.934236,0.906977,0.933488,0.014055,2,0.969052,0.970019,0.967134,0.968101,0.971498,0.969161,0.001513
1,16.689517,1.601079,0.115269,0.035624,0.0007,True,balanced,hinge,l1,0.0015,500,15,"{'sgdc__alpha': 0.0007, 'sgdc__average': True,...",0.909266,0.903475,0.918762,0.89942,0.868217,0.899845,0.017076,11,0.87766,0.881044,0.875302,0.893185,0.899517,0.885342,0.009389
2,16.329554,1.040065,0.112262,0.042725,0.0007,True,balanced,hinge,elasticnet,0.0015,500,15,"{'sgdc__alpha': 0.0007, 'sgdc__average': True,...",0.92471,0.916988,0.924565,0.924565,0.889535,0.916087,0.013582,7,0.921663,0.925532,0.928951,0.929435,0.94058,0.929232,0.006322
3,18.612582,1.726033,0.117998,0.051418,0.0007,True,balanced,log,l2,0.0015,500,15,"{'sgdc__alpha': 0.0007, 'sgdc__average': True,...",0.938224,0.920849,0.934236,0.920696,0.883721,0.919567,0.019228,5,0.954062,0.958414,0.956984,0.958917,0.95942,0.95756,0.001929
4,15.814015,1.442457,0.062558,0.007501,0.0007,True,balanced,log,l1,0.0015,500,15,"{'sgdc__alpha': 0.0007, 'sgdc__average': True,...",0.907336,0.888031,0.911025,0.912959,0.854651,0.894818,0.021928,12,0.878143,0.885397,0.884002,0.894635,0.899034,0.888242,0.007555
5,14.858299,0.908665,0.082908,0.032778,0.0007,True,balanced,log,elasticnet,0.0015,500,15,"{'sgdc__alpha': 0.0007, 'sgdc__average': True,...",0.903475,0.899614,0.924565,0.907157,0.872093,0.901392,0.016935,10,0.904739,0.89942,0.911068,0.908168,0.917874,0.908254,0.006179
6,14.518984,0.629171,0.061718,0.011268,0.0007,False,balanced,hinge,l2,0.0015,500,15,"{'sgdc__alpha': 0.0007, 'sgdc__average': False...",0.942085,0.920849,0.945841,0.932302,0.908915,0.930008,0.013637,3,0.968085,0.966151,0.969067,0.974384,0.974396,0.970417,0.003377
7,15.17896,0.929743,0.074099,0.020485,0.0007,False,balanced,hinge,l1,0.0015,500,15,"{'sgdc__alpha': 0.0007, 'sgdc__average': False...",0.940154,0.920849,0.943907,0.916828,0.887597,0.921887,0.020093,4,0.941973,0.947292,0.945384,0.942484,0.951208,0.945668,0.003382
8,15.270733,0.87099,0.060734,0.020067,0.0007,False,balanced,hinge,elasticnet,0.0015,500,15,"{'sgdc__alpha': 0.0007, 'sgdc__average': False...",0.944015,0.949807,0.94971,0.926499,0.903101,0.934648,0.017911,1,0.966151,0.964217,0.962301,0.969067,0.970048,0.966357,0.0029
9,14.11611,0.654448,0.053033,0.005621,0.0007,False,balanced,log,l2,0.0015,500,15,"{'sgdc__alpha': 0.0007, 'sgdc__average': False...",0.940154,0.903475,0.930368,0.928433,0.879845,0.916473,0.021933,6,0.953095,0.960832,0.953601,0.962784,0.961836,0.958429,0.004198


In [70]:
grid_search7.best_params_

{'sgdc__alpha': 0.0007,
 'sgdc__average': False,
 'sgdc__class_weight': 'balanced',
 'sgdc__loss': 'hinge',
 'sgdc__penalty': 'elasticnet',
 'sgdc__tol': 0.0015,
 'svd__n_components': 500,
 'svd__n_iter': 15}

In [71]:
grid_search7.best_score_

0.9346481051817479

#### Save submission

In [72]:
# submission(grid_search7, 'TFIDF_W_STOPWORDS_SVD_SGDC_OPTIMIZED')

#### Pickle Model 🥒

In [73]:
# pickle_model(grid_search7, 'TFIDF_W_STOPWORDS_SVD_SGDC_OPTIMIZED')

### Spacy Embeddings + SGDC

In [None]:
# def get_word_vectors(docs):
#     return [nlp(doc).vector for doc in docs]

# train_desc_embeddings = get_word_vectors(train['description']) 
# test_desc_embeddings = get_word_vectors(test['description']) 

In [None]:
# sgdc = SGDClassifier(early_stopping=True)

#### Save submission

In [None]:
# submission(, 'CUSTOM_TOKENIZER_LEMMAS_TFIDF_SVD_SGDC')

#### Pickle Model 🥒

In [None]:
# pickle_model(, 'CUSTOM_TOKENIZER_LEMMAS_TFIDF_SVD_SGDC')

## Bayesian Optimization

In [None]:
# from bayes_opt import BayesianOptimization


# def sgdc_pipe(n_iter, n_components, max_iter):
    
#     vect = TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 3))
#     svd = TruncatedSVD(algorithm='randomized')
#     sgdc = SGDClassifier(early_stopping=True)

#     pipeline = Pipeline([('vect', vect), ('svd', svd), ('sgdc', sgdc)])
    
#     return pipeline


# def sgdc_cv():
#     scores = cross_val_score(pipeline, data, targets, scoring='auc')
# pbounds = { 
#     'svd__n_iter': (10, 20),
#     'svd__n_components': (100, 1000),
#     'sgdc__max_iter': (1000, 1500)
# }

