In [1]:
import pandas as pd
import numpy as np
import json
import os
import re
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
import string

a)remove all duplicates from sheet in excel using DATA>remove duplicates
b)using excel concatenate function joined "AgentCorrected_CatName_Primary" and "AgentCorrected_Integer_Primary" column with comma and new target column name is "primary".


In [2]:

df2_cicd=pd.read_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/TaxML-CICD - Prod_Data.csv')
df2_cicd.drop_duplicates(inplace=True)
df2_cicd.dropna(how='all',inplace=True)

In [3]:
import numpy as np
import multiprocessing as mp

import string
import spacy 
import en_core_web_sm
from nltk.tokenize import word_tokenize
from sklearn.base import TransformerMixin, BaseEstimator
from normalise import normalise
import nltk
nltk.download('brown')
nlp = en_core_web_sm.load()


class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self,
                 variety="BrE",
                 user_abbrevs={},
                 n_jobs=1):
        """
        Text preprocessing transformer includes steps:
            1. Text normalization
            2. Punctuation removal
            3. Stop words removal
            4. Lemmatization

        variety - format of date (AmE - american type, BrE - british format) 
        user_abbrevs - dict of user abbreviations mappings (from normalise package)
        n_jobs - parallel jobs to run
        """
        self.variety = variety
        self.user_abbrevs = user_abbrevs
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        return self

    def transform(self, X, *_):
        X_copy = X.copy()

        partitions = 1
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            return X_copy.apply(self._preprocess_text)
        else:
            partitions = min(self.n_jobs, cores)

        data_split = np.array_split(X_copy, partitions)
        pool = mp.Pool(cores)
        data = pd.concat(pool.map(self._preprocess_part, data_split))
        pool.close()
        pool.join()

        return data
    def _normalize(self, text):
    # some issues in normalise package
        try:
            return ' '.join(normalise(text, variety=self.variety, user_abbrevs=self.user_abbrevs, verbose=False))
        except:
            return text

    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        normalized_text = self._normalize(text)
        doc = nlp(normalized_text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)


    def _remove_punct(self, doc):
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
        return [t for t in doc if not t.is_stop]

    def _lemmatize(self, doc):
        return ' '.join([t.lemma_ for t in doc])

[nltk_data] Downloading package brown to /Users/jghosh2/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [50]:
def preprocess_text(message):

    #new_stopwords=['grocery']
    stpwrd = nltk.corpus.stopwords.words('english')
    #stpwrd.extend(new_stopwords)
    # 1. Init Lemmatizer
    lemmatizer = WordNetLemmatizer()
    #lowering and removing punctuation
    message = re.sub(r'[^\w\s]','', message.lower())
    #removing the numerical values and working only with text values
    message = re.sub('[^a-zA-Z]', " ", message )
    #removing the stopwords
    message = ' '.join([word for word in message.split() if word not in stpwrd])
    #lemmatizing the text
    message =  " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(message) if w not in string.punctuation])
    #print("message is : ",message)
    
    return message

In [3]:
import spacy #load spacy
#nlp = en_core_web_sm.load()
nlp = spacy.load("en_core_web_sm", disable=['parser', 'tagger', 'ner'])
stops = stopwords.words("english")

def normalize(comment, lowercase, remove_stopwords):
    if lowercase:
        comment = comment.lower()
    comment = nlp(comment)
    lemmatized = list()
    for word in comment:
        lemma = word.lemma_.strip()
        if lemma:
            if not remove_stopwords or (remove_stopwords and lemma not in stops):
                lemmatized.append(lemma)
    return " ".join(lemmatized)




In [4]:
df = pd.read_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/historical_data.csv', encoding='utf8',engine='python',usecols=['Item','Description','establishment_type','primary'])
df1=df.sample(frac=1, random_state=42)
df1 = df1.fillna('')
df1['input_str'] = df1[['Item','Description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
#df1['cleanText'] = df1['input_str'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())
#df1['cleanText']=df1['input_str'].map(lambda s:preprocess_text(s)) 
df1['cleanText'] = df1['input_str'].apply(normalize, lowercase=True, remove_stopwords=True)

df1 = df1.reset_index(drop=True)
X=df1[['Item','Description','establishment_type','cleanText']]
Y_primary=df1['primary']
#Train test split with stratified sampling for evaluation
X_train, X_test, y_train_primary, y_test_primary = train_test_split(X,
                                                    Y_primary,
                                                    test_size = .20, 
                                                    random_state = 42
                                                    )

df2 = df2_cicd
df2['primary'] = df2['AgentCorrected_CatName_Primary'] + ',' + df2['AgentCorrected_Integer_Primary'].astype('int').astype('str')
df2['input_str'] = df2[['Item','Description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
#df2['cleanText']=df2['input_str'].map(lambda s:preprocess_text(s))
#df2['cleanText'] = df2['input_str'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())
df2['cleanText'] = df2['input_str'].apply(normalize, lowercase=True, remove_stopwords=True)
X_cicd=df2[['Item','Description','establishment_type','cleanText']]

Y_primary_cicd=df2['primary']
X_train_cicd, X_test_cicd, y_train_primary_cicd, y_test_primary_cicd = train_test_split(X_cicd,
                                                    Y_primary_cicd,
                                                    test_size = .20, 
                                                    random_state = 42
                                                    )
X_train_final_1 = X_train.append(X_train_cicd)
X_test_final_1 = X_test.append(X_test_cicd)
X_train_final=X_train['cleanText'].append(X_train_cicd['cleanText'])
y_train_final_primary=y_train_primary.append(y_train_primary_cicd)
X_test_final=X_test['cleanText'].append(X_test_cicd['cleanText'])
y_test_final_primary=y_test_primary.append(y_test_primary_cicd)




split and save  data for train and test

In [36]:
train_size = 0.8
train_end = int(len(df1)*train_size)
df_train = df1[:train_end]
df_test = df1[train_end:]
train_end_cicd = int(len(df2)*train_size)
df2_train = df2[:train_end_cicd]
df2_test = df2[train_end_cicd:]
df2_train = df2_train[['Item','Description','establishment_type','primary','input_str','cleanText']]
df2_test = df2_test[['Item','Description','establishment_type','primary','input_str','cleanText']]
X_train_save = df_train.append(df2_train)
X_test_save = df_test.append(df2_test)
X_train_save['label'] = 'train'
X_test_save['label'] = 'test'
X_data = X_train_save.append(X_test_save)
X_data.to_csv('df_traintestdata_03-01-22.csv')

In [14]:
df1.shape

(170829, 6)

In [8]:
result=X_test_final_1
rf = Pipeline([('vect', CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', lowercase=True, stop_words='english',max_df=0.9)),
       ('tfidf', TfidfTransformer(sublinear_tf=True)),
       ('clf', RandomForestClassifier(oob_score=True,n_jobs=-1))])
rf.fit(X_train_final, y_train_final_primary)
y_pred= rf.predict(X_test_final)
result['original_cat_primary']=y_test_final_primary
result['prediction_cat_primary']=y_pred
result['prediction_cat_primary_confscore']=rf.predict_proba(X_test_final).max()
output={'accuracy':accuracy_score(y_pred,y_test_final_primary),'precision_score':precision_score(y_pred,y_test_final_primary,average='macro'),'recall_score':recall_score(y_pred,y_test_final_primary,average='macro')
,'f1_score':f1_score(y_pred,y_test_final_primary,average='macro')}
result['confusion_matrix_primary']=str(output)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [10]:
print(result['confusion_matrix_primary'][0])

{'accuracy': 0.817339755464987, 'precision_score': 0.5905648165179906, 'recall_score': 0.7215184393843495, 'f1_score': 0.6307314672452732}


In [26]:
result

Unnamed: 0,Item,Description,establishment_type,cleanText,original_cat_primary,prediction_cat_primary,prediction_cat_primary_confscore,confusion_matrix_primary
87548,Grey Goose 375ml (40% ABV),\N,GROCERY,grey goose,"CAT_LIQUOR,535","CAT_LIQUOR,535",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."
65201,Deli Exp Artisan Sub Ital,,GROCERY,deli exp artisan sub ital,"CAT_PREPACKAGED_FOOD,106","CAT_PREPARED_FOOD,101",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."
109008,Ecos-All-Purpose Cleaner-Degreaser22 oz,,GROCERY,ecosallpurpose cleanerdegreaser,"CAT_TPP_CLEANING_EQUIPMENT,782","CAT_SOFT_DRINK,112",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."
89,"New Amsterdam Vodka, 750ml (ABV 35%)",,GROCERY,new amsterdam vodka,"CAT_LIQUOR,535","CAT_LIQUOR,535",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."
88983,Bud Light 24pk 12oz Btl 4.2% ABV,\N,GROCERY,bud light btl,"CAT_BEER,533","CAT_BEER,533",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."
...,...,...,...,...,...,...,...,...
13573,"Espolòn Tequila Blanco, 750mL (0.0% ABV)",,GROCERY,espol tequila blanco,"CAT_LIQUOR,535","CAT_LIQUOR,535",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."
12882,Sprite (12 fl oz),"Crisp, refreshing and clean-tasting, Sprite is...",GROCERY,sprite fl crisp refreshing cleantasting sprite...,"CAT_SOFT_DRINK,112","CAT_SOFT_DRINK,112",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."
16959,Stay Woke,Coffee Por ter (6.8%)\n\nA beautiful coffee P...,LIQUOR,stay woke coffee por ter nna beautiful coffee ...,"CAT_ALCOHOL,109","CAT_TPP,531",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."
15362,"Svedka, 1.75L bottle vodka (40% ABV)",,LIQUOR,svedka bottle vodka,"CAT_LIQUOR,535","CAT_LIQUOR,535",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."


In [27]:

misclassifications_primary= result.loc[result['original_cat_primary']!=result['prediction_cat_primary']]

In [39]:
misclassifications_primary.sort_values(by=['Item','Description','establishment_type'], ascending=True).head(170)

Unnamed: 0,Item,Description,establishment_type,cleanText,original_cat_primary,prediction_cat_primary,prediction_cat_primary_confscore,confusion_matrix_primary
76354,Uptown Wine Cocktails Lime Margarita. 1.5...,,GROCERY,uptown wine cocktail lime margarita bottle,"CAT_ALCOHOL,109","CAT_WINE,534",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."
48048,Menage A Trois Sparkling Rose 750ml,\N,GROCERY,menage trois sparkling rose,"CAT_WINE,534","CAT_SPARKLING_WINE,716",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."
124692,21st Amendment Introduces Blood Orange 6Pk Cans,\N,GROCERY,st amendment introduces blood orange can,"CAT_BEER,533","CAT_WINE,534",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."
156165,AVEDA Shampure Hand & Body Wash 1000ml,\n This gently cleanses the body and leaves...,COSMETICS,aveda shampure hand body wash gently clean bod...,"CAT_TPP_SHAMPOOS,816","CAT_TPP_SKIN_CARE_PRODUCTS,818",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."
62019,Aratama Abekawa Mochi,"Five count, 200 grams.",CONVENIENCE,aratama abekawa mochi five count gram convenience,"CAT_PREPACKAGED_FOOD_DESSERTS,725","CAT_PREPACKAGED_FOOD,106",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."
...,...,...,...,...,...,...,...,...
12995,7th Generation Dish Washer Detergent Pack,\N,GROCERY,th generation dish washer detergent pack,"CAT_TPP_DISH_WASHING_PRODUCTS,786","CAT_PREPARED_FOOD,101",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."
40016,9 - Roughtail Hoptometrist,\N,GROCERY,roughtail hoptometrist,"CAT_BEER,533","CAT_SOFT_DRINK,112",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."
113977,9. Refresh Juice,"Watermelon, lime, green apple and pineapple.",GROCERY,refresh juice watermelon lime green apple pine...,"CAT_PREPARED_FOOD,101","CAT_JUICE,110",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."
77555,"99 Banana 375 , 375Ml Liqueur (49.5% Abv)",,GROCERY,banana liqueur,"CAT_ALCOHOL,109","CAT_LIQUOR,535",1.0,"{'accuracy': 0.8185367800039901, 'precision_sc..."


"{'accuracy': 0.8146149522203485, 'precision_score': 0.6027551606729535, 'recall_score': 0.7097466761674291, 'f1_score': 0.6327463859326979}"

saving model to local

In [11]:
import pickle
# save the model to disk
filename_primary= 'finalized_model_rf_primary-03-01-22_allprod.sav'
pickle.dump(rf, open(filename_primary, 'wb'))


In [11]:
import datetime
print("Time :{} result :{}".format(datetime.datetime.now(),result['confusion_matrix_primary'][0]))

Time :2022-01-10 15:47:29.569629 result :{'accuracy': 0.817339755464987, 'precision_score': 0.5905648165179906, 'recall_score': 0.7215184393843495, 'f1_score': 0.6307314672452732}


Time :2022-01-10 13:30:26.356610 result :{'accuracy': 0.8185367800039901, 'precision_score': 0.5848761715254991, 'recall_score': 0.672646577840006, 'f1_score': 0.6100045958278678}

Time :2022-01-10 13:39:45.378508 result :{'accuracy': 0.8167982443640095, 'precision_score': 0.5828179565369508, 'recall_score': 0.694053097419111, 'f1_score': 0.6153519093480956}

Time :2022-01-10 14:04:13.438304 result :{'accuracy': 0.8207028244079003, 'precision_score': 0.598134322659592, 'recall_score': 0.7163954355735384, 'f1_score': 0.6336420913159138}

Time :2022-01-10 15:44:34.142947 result :{'accuracy': 0.8164847379371277, 'precision_score': 0.570865693013315, 'recall_score': 0.6990846543147078, 'f1_score': 0.6078947687949255}

Time :2022-01-10 15:47:29.569629 result :{'accuracy': 0.817339755464987, 'precision_score': 0.5905648165179906, 'recall_score': 0.7215184393843495, 'f1_score': 0.6307314672452732}

{'accuracy': 0.8198348463305261, 'precision_score': 0.5943177708372124, 'recall_score': 0.6808857187011089, 'f1_score': 0.6211879572499776}

In [None]:
result

In [54]:
def split_cat(category):
    category_list=category.split(',')
    if len(category_list)==1:      
        primary_category=category_list[0]
        secondary_category=category_list[0]
    if len(category_list)==2:      
        primary_category=category_list[0]
        if category_list[1] in ['TEMP_HEATED','TEMP_COLD','TEMP_UNHEATED']:
              secondary_category=category_list[0]
        else:
              secondary_category=category_list[1]
    if len(category_list)>2:      
        primary_category=category_list[0]
        secondary_category=category_list[1]
    else:
        pass
        
    return primary_category,secondary_category
def split_int(integer):
    str_int=str(integer)
    integer_list=str_int.split(',')
    if len(integer_list)==1:      
        primary_integer=integer_list[0]
        secondary_integer=integer_list[0]
    if len(integer_list)==2:      
        primary_integer=integer_list[0]
        if integer_list[1]=='1':
            secondary_integer=integer_list[0]
        else:
            secondary_integer=integer_list[1]
    if len(integer_list)>2:      
        primary_integer=integer_list[0]
        secondary_integer=integer_list[1]
    else:
         pass
        
    return primary_integer,secondary_integer
def combine(category,integer):
    return category+","+integer

In [55]:
df['primary_cat']=df['Agent Corrected CAT Name'].map(lambda x:split_cat(x)[0])
df['secondary_cat']=df['Agent Corrected CAT Name'].map(lambda x:split_cat(x)[1])
df['primary_int']=df['Agent Corrected Integer'].map(lambda x:split_int(x)[0])
df['secondary_int']=df['Agent Corrected Integer'].map(lambda x:split_int(x)[1])
df['primary_int_prediction']=df['Integer'].map(lambda x:split_int(x)[0])
df['primary']=df[['primary_cat','primary_int']].apply(lambda x:combine(*x),axis=1)
df['secondary']=df[['secondary_cat','secondary_int']].apply(lambda x:combine(*x),axis=1)

In [63]:
df.to_csv('TaxML-CICD - Prod_Data_after_preprocess.csv')

In [64]:
df

Unnamed: 0.1,Unnamed: 0,UniqueUUID,store_uuid,item_uuid,Item,Description,establishment_type,CAT Name,Integer,Confidence Score,...,Agent Corrected Integer,CAT NAME_ ValidationScore [0-100],Inetger_ValidationScore[0-100],primary_cat,secondary_cat,primary_int,secondary_int,primary_int_prediction,primary,secondary
0,,2198c3e7-9b2f-441e-9e24-813d70f3bf26:9abcba67-...,2198c3e7-9b2f-441e-9e24-813d70f3bf26,9abcba67-3a79-41a5-987a-034cae6a0ff9,Pure Happiness,A sunny sunflower bouquet gets an autumnal spi...,FLOWERS,CAT_TPP,531,0.75,...,531,100,100,CAT_TPP,CAT_TPP,531,531,531,"CAT_TPP,531","CAT_TPP,531"
1,,2198c3e7-9b2f-441e-9e24-813d70f3bf26:03a01438-...,2198c3e7-9b2f-441e-9e24-813d70f3bf26,03a01438-c236-40bd-a6d1-9dff278d67e3,Silver Snow Bouquet,"Like a quiet walk through a snowy forest, this...",FLOWERS,CAT_TPP,531,0.85,...,531,100,100,CAT_TPP,CAT_TPP,531,531,531,"CAT_TPP,531","CAT_TPP,531"
2,,2198c3e7-9b2f-441e-9e24-813d70f3bf26:14312ee6-...,2198c3e7-9b2f-441e-9e24-813d70f3bf26,14312ee6-2324-4f0b-81b7-f1e4cb7cf454,Beautiful in Blue,Brighten the home with the beauty of bright bl...,FLOWERS,CAT_TPP,531,0.90,...,531,100,100,CAT_TPP,CAT_TPP,531,531,531,"CAT_TPP,531","CAT_TPP,531"
3,,2198c3e7-9b2f-441e-9e24-813d70f3bf26:6cb0e71e-...,2198c3e7-9b2f-441e-9e24-813d70f3bf26,6cb0e71e-586d-4fd9-a71a-5dcd86e6f920,Blush Life Bouquet,Put a spring in their step with this beautiful...,FLOWERS,CAT_TPP,531,0.85,...,531,100,100,CAT_TPP,CAT_TPP,531,531,531,"CAT_TPP,531","CAT_TPP,531"
4,,2198c3e7-9b2f-441e-9e24-813d70f3bf26:1dbd5b6a-...,2198c3e7-9b2f-441e-9e24-813d70f3bf26,1dbd5b6a-0e09-46e8-ad34-54b996c53d57,Red Tulips,Call ahead for this arrangement before orderin...,FLOWERS,CAT_TPP,531,0.90,...,531,100,100,CAT_TPP,CAT_TPP,531,531,531,"CAT_TPP,531","CAT_TPP,531"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
722,16-12-2021 1:51 AM,61ff9153-b582-5003-8680-466f512bb8e1:58739636-...,61ff9153-b582-5003-8680-466f512bb8e1,58739636-cef8-4d82-aa5b-2689dac0b482,Misko (1 lb),,GROCERY,"CAT_SOFT_DRINK,CAT_PREPACKAGED_FOOD_PASTA",112737,0.41,...,737,0,0,CAT_PREPACKAGED_FOOD_PASTA,CAT_PREPACKAGED_FOOD_PASTA,737,737,112,"CAT_PREPACKAGED_FOOD_PASTA,737","CAT_PREPACKAGED_FOOD_PASTA,737"
723,16-12-2021 1:51 AM,df29da39-f17b-5114-b9d7-1f67953a5e53:5781fe4d-...,df29da39-f17b-5114-b9d7-1f67953a5e53,5781fe4d-20f2-4c6e-985e-2bc359d57646,Chunky Chocolates (2.4 oz),Chunks of chocolate with premium ingredients m...,GROCERY,"CAT_CONFECTIONARY,CAT_CHOCOLATE",707706,0.30,...,707,0,0,CAT_CONFECTIONARY,CAT_CONFECTIONARY,707,707,707,"CAT_CONFECTIONARY,707","CAT_CONFECTIONARY,707"
724,16-12-2021 1:51 AM,8fed2af5-d154-444a-aa96-0992a845c008:17e6f169-...,8fed2af5-d154-444a-aa96-0992a845c008,17e6f169-5a76-4b69-9519-cfac5dd4f420,"Essential Everyday Salsa, Restaurant Style, an...",,GROCERY,CAT_PREPACKAGED_FOOD_CONDIMENTS,740,0.65,...,740,100,100,CAT_PREPACKAGED_FOOD_CONDIMENTS,CAT_PREPACKAGED_FOOD_CONDIMENTS,740,740,740,"CAT_PREPACKAGED_FOOD_CONDIMENTS,740","CAT_PREPACKAGED_FOOD_CONDIMENTS,740"
725,16-12-2021 1:51 AM,f0fba24f-ba3a-53eb-b35e-97efaaf4a468:63cdb229-...,f0fba24f-ba3a-53eb-b35e-97efaaf4a468,63cdb229-8333-46c1-8cfc-bcc94764d91d,"Woodbridge Chardonnay, 1.5L white wine (13.5% ...",,GROCERY,CAT_WINE,534,0.68,...,534,100,100,CAT_WINE,CAT_WINE,534,534,534,"CAT_WINE,534","CAT_WINE,534"


In [57]:
df_test = df.loc[df['Inetger_ValidationScore[0-100]'] == 0]

In [58]:
len(df_test)

403

In [60]:
df2=df_test[df_test['primary_int_prediction']==df_test['primary_int']]
len(df2)#no of correct prediction

73

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
from time import time
import logging
rf = Pipeline([('vect', CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', lowercase=True, stop_words='english')),
       ('tfidf', TfidfTransformer(use_idf=True)),
       ('clf', RandomForestClassifier()),
      ])
parameters = {
    "vect__max_df": (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    "vect__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigramslf__
     "clf__bootstrap":[True, False],
     "clf__max_depth":[10, 50, 100,500, None],
     "clf__max_features":['auto', 'sqrt'],
     "clf__min_samples_leaf":[1,2,4],
     "clf__min_samples_split":[2,5,10],
     "clf__n_estimators":[400,600,800],
     "clf__random_state":[3]

}

RandomizedSearch = RandomizedSearchCV(rf,
                          parameters, 
                          cv=5,
                          verbose=1, 
                          n_jobs=-1)

t0 = time()
rf_best_model = RandomizedSearch.fit(X_train_final, y_train_final_primary)
print("done in %0.3fs" % (time() - t0))
#print()
print("Best score: %0.3f" % rf_best_model.best_score_)
print("Best parameters set:")
best_parameters = rf_best_model.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
best_parameters

169000 total data b4 traing + 403 rows
exp01:169000+403 rows
exp02:169000+727 rows
