# Loading Necessary Libraries

In [5]:
from numpy import random
import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import warnings 
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt  
from wordcloud import WordCloud # data visualization library
#text features can be constructed using assorted techniques – Bag of Words, TF-IDF, and Word Embeddings.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer #tfidf and Bag-of-Words Features.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from nltk.corpus import stopwords
import gensim # library for word2vec
%matplotlib inline
# Raise the per-cell display width to 200 characters for displayed frames.
pd.set_option("display.max_colwidth", 200) 
# Suppress DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Loading Dataset

In [2]:
# Read the labelled training invoices and the unlabelled test invoices
# from the current working directory (Train.csv first, then Test.csv).
train, test = (pd.read_csv(csv_name) for csv_name in ('Train.csv', 'Test.csv'))

In [4]:
# Summarise the training frame: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5566 entries, 0 to 5565
Data columns (total 6 columns):
Inv_Id              5566 non-null int64
Vendor_Code         5566 non-null object
GL_Code             5566 non-null object
Inv_Amt             5566 non-null float64
Item_Description    5566 non-null object
Product_Category    5566 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 261.0+ KB


In [24]:
# train.Vendor_Code.value_counts()
# train.GL_Code.value_counts()
# train.Inv_Amt.value_counts()
# train.Product_Category.value_counts()

In [25]:
# (rows, columns) of the training frame.
train.shape

(5566, 6)

In [26]:
# (rows, columns) of the test frame.
test.shape

(2446, 5)

In [27]:
# Preview the first five training rows.
train.head()

Unnamed: 0,Inv_Id,Vendor_Code,GL_Code,Inv_Amt,Item_Description,Product_Category
0,15001,VENDOR-1676,GL-6100410,83.24,Artworking/Typesetting Production Jun 2009 Champion Parts Inc SMAP Prototype and Comp Production/Packaging Design,CLASS-1963
1,15002,VENDOR-1883,GL-2182000,51.18,Auto Leasing Corporate Services Corning Inc /Ny 2013-Mar Auto Leasing and Maintenance Other Corporate Services,CLASS-1250
2,15004,VENDOR-1999,GL-6050100,79.02,Store Management Lease/Rent Deltona Corp Real Estate Base Rent Jul2018,CLASS-1274
3,15005,VENDOR-1771,GL-6101400,48.5,Store Construction General Requirements Colonial Trust Iii General Contractor Final Site Clean Up 2005-Dec,CLASS-1522
4,15006,VENDOR-1331,GL-2182000,63.35,Jul 2015 Aydin Corp Contingent Labor/Temp Labor Contingent Labor/Temp Labor Corporate Services Human Resources,CLASS-1376


In [28]:
# Preview the first five test rows.
test.head()

Unnamed: 0,Inv_Id,Vendor_Code,GL_Code,Inv_Amt,Item_Description
0,15003,VENDOR-2513,GL-6050310,56.13,Travel and Entertainment Miscellaneous Company Car (Field Only) Ground Transportation Miscellaneous Company Car (Field Only) Oct2011 Fortune National Corp
1,15008,VENDOR-1044,GL-6101400,96.56,Final Site Clean Up Store Construction Advanced Micro Devices Inc Oct2011 General Requirements General Contractor
2,15013,VENDOR-1254,GL-6101400,55.93,Arabian American Development Co Final Site Clean Up 2008-Oct General Requirements General Contractor Store Construction
3,15019,VENDOR-1331,GL-2182000,32.62,Corporate Services Contingent Labor/Temp Labor Human Resources Contingent Labor/Temp Labor Jun 2014 Aydin Corp
4,15020,VENDOR-2513,GL-6050310,25.81,Fortune National Corp Miscellaneous Company Car (Field Only) Jun-2015 Miscellaneous Company Car (Field Only) Ground Transportation Travel and Entertainment


# Data Pre-processing

In [29]:
# Stack train on top of test so both receive identical preprocessing.
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0).
# sort=True keeps the alphabetical column order that append produced here and
# silences the FutureWarning about the changing sort default.
combi = pd.concat([train, test], ignore_index=True, sort=True)
combi.shape

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


(8012, 6)

In [30]:
# Preview the first five rows of the combined frame.
combi.head()

Unnamed: 0,GL_Code,Inv_Amt,Inv_Id,Item_Description,Product_Category,Vendor_Code
0,GL-6100410,83.24,15001,Artworking/Typesetting Production Jun 2009 Champion Parts Inc SMAP Prototype and Comp Production/Packaging Design,CLASS-1963,VENDOR-1676
1,GL-2182000,51.18,15002,Auto Leasing Corporate Services Corning Inc /Ny 2013-Mar Auto Leasing and Maintenance Other Corporate Services,CLASS-1250,VENDOR-1883
2,GL-6050100,79.02,15004,Store Management Lease/Rent Deltona Corp Real Estate Base Rent Jul2018,CLASS-1274,VENDOR-1999
3,GL-6101400,48.5,15005,Store Construction General Requirements Colonial Trust Iii General Contractor Final Site Clean Up 2005-Dec,CLASS-1522,VENDOR-1771
4,GL-2182000,63.35,15006,Jul 2015 Aydin Corp Contingent Labor/Temp Labor Contingent Labor/Temp Labor Corporate Services Human Resources,CLASS-1376,VENDOR-1331


In [31]:
# Summarise the combined frame: column names, non-null counts and dtypes.
combi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8012 entries, 0 to 8011
Data columns (total 6 columns):
GL_Code             8012 non-null object
Inv_Amt             8012 non-null float64
Inv_Id              8012 non-null int64
Item_Description    8012 non-null object
Product_Category    5566 non-null object
Vendor_Code         8012 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 375.6+ KB


In [39]:
# Replace every non-letter character with a space.
# regex=True is explicit: pandas >= 2.0 treats the pattern as a literal string
# by default, which would silently leave the descriptions unchanged.
combi['tidy_Item_Description'] = combi['Item_Description'].str.replace("[^a-zA-Z]", " ", regex=True)

In [40]:
# Preview only the first rows instead of rendering the whole combined frame.
combi.head(10)

Unnamed: 0,GL_Code,Inv_Amt,Inv_Id,Item_Description,Product_Category,Vendor_Code,tidy_Item_Description
0,GL-6100410,83.24,15001,Artworking/Typesetting Production Jun 2009 Champion Parts Inc SMAP Prototype and Comp Production/Packaging Design,CLASS-1963,VENDOR-1676,Artworking Typesetting Production Jun Champion Parts Inc SMAP Prototype and Comp Production Packaging Design
1,GL-2182000,51.18,15002,Auto Leasing Corporate Services Corning Inc /Ny 2013-Mar Auto Leasing and Maintenance Other Corporate Services,CLASS-1250,VENDOR-1883,Auto Leasing Corporate Services Corning Inc Ny Mar Auto Leasing and Maintenance Other Corporate Services
2,GL-6050100,79.02,15004,Store Management Lease/Rent Deltona Corp Real Estate Base Rent Jul2018,CLASS-1274,VENDOR-1999,Store Management Lease Rent Deltona Corp Real Estate Base Rent Jul
3,GL-6101400,48.50,15005,Store Construction General Requirements Colonial Trust Iii General Contractor Final Site Clean Up 2005-Dec,CLASS-1522,VENDOR-1771,Store Construction General Requirements Colonial Trust Iii General Contractor Final Site Clean Up Dec
4,GL-2182000,63.35,15006,Jul 2015 Aydin Corp Contingent Labor/Temp Labor Contingent Labor/Temp Labor Corporate Services Human Resources,CLASS-1376,VENDOR-1331,Jul Aydin Corp Contingent Labor Temp Labor Contingent Labor Temp Labor Corporate Services Human Resources
5,GL-6101400,32.28,15007,Final Site Clean Up 2018Mar Store Construction Dravo Corp General Contractor General Requirements,CLASS-1522,VENDOR-2076,Final Site Clean Up Mar Store Construction Dravo Corp General Contractor General Requirements
6,GL-6050310,5.38,15009,Travel and Entertainment Miscellaneous Company Car (Field Only) Texas New Mexico Power Co Ground Transportation Miscellaneous Company Car (Field Only) 2011-Mar,CLASS-1758,VENDOR-1802,Travel and Entertainment Miscellaneous Company Car Field Only Texas New Mexico Power Co Ground Transportation Miscellaneous Company Car Field Only Mar
7,GL-6101400,31.21,15010,General Contractor General Requirements Final Site Clean Up American Pad & Paper Co Apr2014 Store Construction,CLASS-1522,VENDOR-1191,General Contractor General Requirements Final Site Clean Up American Pad Paper Co Apr Store Construction
8,GL-6100410,42.89,15011,Aquila Distributors Inc /Bd Prototype and Comp Production/Packaging Design Jul 2007 Artworking/Typesetting Production SMAP,CLASS-1963,VENDOR-2120,Aquila Distributors Inc Bd Prototype and Comp Production Packaging Design Jul Artworking Typesetting Production SMAP
9,GL-6050100,59.50,15012,Base Rent Store Management Chicago Rivet & Machine Co Dec-2017 Lease/Rent Real Estate,CLASS-1274,VENDOR-1704,Base Rent Store Management Chicago Rivet Machine Co Dec Lease Rent Real Estate


In [42]:
# combi[combi['Product_Category']== 'CLASS-1805']

In [43]:
STOPWORDS = set(stopwords.words('english'))


def clean_text(text):
    """Lowercase ``text`` and drop every whitespace-separated token found in STOPWORDS.

    The remaining tokens are re-joined with single spaces.
    """
    kept = []
    for token in text.lower().split():
        if token not in STOPWORDS:
            kept.append(token)
    return ' '.join(kept)

In [74]:
# Lowercase and remove stopwords, then keep only tokens longer than two characters.
combi['tidy_Item_Description'] = (
    combi['tidy_Item_Description']
    .apply(clean_text)
    .apply(lambda text: ' '.join(token for token in text.split() if len(token) > 2))
)

In [75]:
# Keep only the digits of the GL and vendor codes.
# regex=True is explicit: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the "GL-" / "VENDOR-" prefixes in place.
combi['GL_Code'] = combi['GL_Code'].str.replace("[^0-9]", "", regex=True)
combi['Vendor_Code'] = combi['Vendor_Code'].str.replace("[^0-9]", "", regex=True)
# Cast the amount to text so it can be concatenated into predictor_text.
combi['Inv_Amt'] = combi['Inv_Amt'].astype(str)

In [76]:
# Build one space-separated text field per row: GL code, vendor code, amount, cleaned description.
combi['predictor_text'] = combi['GL_Code'].str.cat(
    [combi['Vendor_Code'], combi['Inv_Amt'], combi['tidy_Item_Description']],
    sep=" ",
)

In [77]:
# Preview the combined frame with the new tidy_Item_Description and predictor_text columns.
combi.head()

Unnamed: 0,GL_Code,Inv_Amt,Inv_Id,Item_Description,Product_Category,Vendor_Code,tidy_Item_Description,predictor_text
0,6100410,83.24,15001,Artworking/Typesetting Production Jun 2009 Champion Parts Inc SMAP Prototype and Comp Production/Packaging Design,CLASS-1963,1676,artworking typesetting production jun champion parts inc smap prototype comp production packaging design,6100410 1676 83.24 artworking typesetting production jun champion parts inc smap prototype comp production packaging design
1,2182000,51.18,15002,Auto Leasing Corporate Services Corning Inc /Ny 2013-Mar Auto Leasing and Maintenance Other Corporate Services,CLASS-1250,1883,auto leasing corporate services corning inc mar auto leasing maintenance corporate services,2182000 1883 51.18 auto leasing corporate services corning inc mar auto leasing maintenance corporate services
2,6050100,79.02,15004,Store Management Lease/Rent Deltona Corp Real Estate Base Rent Jul2018,CLASS-1274,1999,store management lease rent deltona corp real estate base rent jul,6050100 1999 79.02 store management lease rent deltona corp real estate base rent jul
3,6101400,48.5,15005,Store Construction General Requirements Colonial Trust Iii General Contractor Final Site Clean Up 2005-Dec,CLASS-1522,1771,store construction general requirements colonial trust iii general contractor final site clean dec,6101400 1771 48.5 store construction general requirements colonial trust iii general contractor final site clean dec
4,2182000,63.35,15006,Jul 2015 Aydin Corp Contingent Labor/Temp Labor Contingent Labor/Temp Labor Corporate Services Human Resources,CLASS-1376,1331,jul aydin corp contingent labor temp labor contingent labor temp labor corporate services human resources,2182000 1331 63.35 jul aydin corp contingent labor temp labor contingent labor temp labor corporate services human resources


In [78]:
# Total number of single-space-separated tokens across all predictor_text values.
combi['predictor_text'].str.split(' ').str.len().sum()

133545

In [79]:
# Remove leading/trailing whitespace from every column name.
combi.columns = combi.columns.str.strip()

# Feature Engineering

## Bag of Words Feature:

In [80]:
# Token-count features over predictor_text for every row of combi.
# NOTE(review): the vocabulary is learned from train and test rows together — confirm this is intended.
bow_vectorizer = CountVectorizer()
bow=bow_vectorizer.fit_transform(combi['predictor_text'])
# Shape of the resulting document-term matrix.
bow.shape

(8012, 3437)

## TF-IDF Feature:

In [81]:
# TF-IDF weighted features over predictor_text for every row of combi.
# NOTE(review): fitted on train and test rows together — confirm this is intended.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(combi['predictor_text'])
# Shape of the resulting document-term matrix.
tfidf.shape

(8012, 3437)

## Word2Vec Feature:

In [272]:
%%time
# Split every predictor_text value on whitespace into a list of tokens.
tokenized_text = combi['predictor_text'].apply(lambda x: x.split()) # tokenising
# Build a skip-gram Word2Vec model over the tokenised rows.
# NOTE(review): `size` is the keyword used by older gensim releases; newer ones appear to call it `vector_size` — confirm against the installed version.
model_w2v = gensim.models.Word2Vec(
            tokenized_text,
            size=400, # desired no. of features/independent variables
            window=5, # context window size
            min_count=1,
            sg = 1, # 1 for skip-gram model
            hs = 0,
            negative = 10, # for negative sampling
            workers = 8, # no. of cores
            seed = 32)

# Run 100 training epochs over the same corpus.
# NOTE(review): the constructor above already receives the corpus, so this is presumably additional training — confirm.
model_w2v.train(tokenized_text, total_examples= len(combi['predictor_text']), epochs = 100)

Wall time: 59.4 s


In [284]:
# Words most similar to 'artworking' in the trained embedding, with their similarity scores.
model_w2v.wv.most_similar(positive='artworking')

[('typesetting', 0.7681471109390259),
 ('48.89', 0.6752206087112427),
 ('1971', 0.6751769781112671),
 ('51.21', 0.6701453924179077),
 ('70.2', 0.6682206392288208),
 ('26.67', 0.6644734144210815),
 ('90.24', 0.6633161306381226),
 ('13.39', 0.6594852209091187),
 ('cummins', 0.6581088304519653),
 ('25.1', 0.657589852809906)]

In [274]:
# Sanity-check the embedding width.
# Index through `.wv`, as the most_similar cell does: subscripting the model
# object itself is deprecated in gensim 3.x and unsupported in 4.x.
len(model_w2v.wv['artworking'])

400

In [275]:
def word_vector(tokens, size, model=None):
    """Return the mean word embedding of ``tokens`` as a ``(1, size)`` array.

    Parameters
    ----------
    tokens : iterable of str
        Words of one document.
    size : int
        Dimensionality of the word vectors.
    model : optional
        Object exposing word vectors through ``model.wv[word]``. Defaults to
        the notebook-level ``model_w2v``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)``; all zeros when no token is in the vocabulary.
    """
    # Look vectors up through `.wv`, as the most_similar cell does: subscripting
    # the model object itself is deprecated in gensim 3.x and unsupported in 4.x.
    keyed_vectors = (model_w2v if model is None else model).wv
    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            vec += keyed_vectors[word].reshape((1, size))
        except KeyError:  # token is not in the vocabulary: skip it
            continue
        count += 1
    if count != 0:
        vec /= count
    return vec

In [276]:
# Average the word vectors of every document into one 400-dimensional row.
wordvec_arrays = np.zeros((len(tokenized_text), 400))
for i, tokens in enumerate(tokenized_text):
    wordvec_arrays[i, :] = word_vector(tokens, 400)
# Build the DataFrame once, after the loop; it was previously rebuilt on every
# iteration and would have been undefined for an empty corpus.
wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape

(8012, 400)

## Doc2Vec Feature:

In [261]:
from tqdm import tqdm
tqdm.pandas(desc = 'progress-bar')
from gensim.models.doc2vec import LabeledSentence

In [262]:
def add_label(rvw):
    """Wrap every token list in ``rvw`` as a LabeledSentence tagged ``'reviews_<index label>'``.

    ``rvw`` must expose ``.index`` and iterate over token lists; the result is a
    plain list in the same order.
    """
    return [
        LabeledSentence(sentence, ['reviews_' + str(label)])
        for label, sentence in zip(rvw.index, rvw)
    ]


labeled_text = add_label(tokenized_text)

In [263]:
# Inspect the first five tagged documents.
labeled_text[:5]

[LabeledSentence(words=['6100410', '1676', '83.24', 'artworking', 'typesetting', 'production', 'jun', 'champion', 'parts', 'inc', 'smap', 'prototype', 'comp', 'production', 'packaging', 'design'], tags=['reviews_0']),
 LabeledSentence(words=['2182000', '1883', '51.18', 'auto', 'leasing', 'corporate', 'services', 'corning', 'inc', 'mar', 'auto', 'leasing', 'maintenance', 'corporate', 'services'], tags=['reviews_1']),
 LabeledSentence(words=['6050100', '1999', '79.02', 'store', 'management', 'lease', 'rent', 'deltona', 'corp', 'real', 'estate', 'base', 'rent', 'jul'], tags=['reviews_2']),
 LabeledSentence(words=['6101400', '1771', '48.5', 'store', 'construction', 'general', 'requirements', 'colonial', 'trust', 'iii', 'general', 'contractor', 'final', 'site', 'clean', 'dec'], tags=['reviews_3']),
 LabeledSentence(words=['2182000', '1331', '63.35', 'jul', 'aydin', 'corp', 'contingent', 'labor', 'temp', 'labor', 'contingent', 'labor', 'temp', 'labor', 'corporate', 'services', 'human', 'reso

In [105]:
%%time
# Configure the Doc2Vec model; vocabulary building and training happen in the two calls below.
# NOTE(review): with workers=8 the seed alone may not make runs reproducible — confirm against the gensim docs.
model_d2v = gensim.models.Doc2Vec(dm = 1, # for 'distributed memory' model
                                 dm_mean=1, # for using mean of the context word vectors
                                 size=200, # no. of desired features
                                 window=5, # width of the context window
                                 negative=7, # if > 0 then negative sampling will be used
                                 min_count=5, # ignores all words with total frequency less than 5
                                 workers=8, # no. of cores
                                 alpha=0.03, #learning rate
                                 seed = 23)

# Build the vocabulary from the tagged documents; tqdm only adds a progress bar.
model_d2v.build_vocab([i for i in tqdm(labeled_text)])

# Train for 100 epochs over all tagged documents.
model_d2v.train(labeled_text, total_examples= len(combi['predictor_text']), epochs=100)

100%|█████████████████████████████████████████████████████████████████████████| 8012/8012 [00:00<00:00, 3959557.40it/s]


Wall time: 38.3 s


In [106]:
# Copy each document's 200-dimensional Doc2Vec vector into one row of a dense matrix.
n_documents = len(tokenized_text)
docvec_arrays = np.zeros((n_documents, 200))
for row in range(len(combi)):
    doc_vector = model_d2v.docvecs[row]
    docvec_arrays[row, :] = doc_vector.reshape((1, 200))

docvec_df = pd.DataFrame(docvec_arrays)
docvec_df.shape

(8012, 200)

In [123]:
# Number of training rows per product category, most frequent first.
train['Product_Category'].value_counts()

CLASS-1758    1498
CLASS-1274     986
CLASS-1522     851
CLASS-1250     451
CLASS-1376     365
CLASS-1963     230
CLASS-1249     176
CLASS-1828     113
CLASS-2141     108
CLASS-1721     107
CLASS-1567      84
CLASS-1919      70
CLASS-1850      60
CLASS-2112      53
CLASS-1477      50
CLASS-1870      36
CLASS-2241      36
CLASS-1309      32
CLASS-2003      32
CLASS-1429      32
CLASS-1964      28
CLASS-1322      28
CLASS-1294      25
CLASS-1770      19
CLASS-1983      17
CLASS-1867      15
CLASS-1652      15
CLASS-2038      13
CLASS-1805      12
CLASS-2152       9
CLASS-1248       5
CLASS-1688       4
CLASS-2146       2
CLASS-2015       2
CLASS-1838       1
CLASS-1957       1
Name: Product_Category, dtype: int64

# Naive Bayes Classifier for Multinomial Models

In [207]:
# b=set(train['Product_Category'])
# my_categories = list(b)
# # my_categories.remove('CLASS-2146') 
# my_categories.remove('CLASS-2015') 
# my_categories.remove('CLASS-1838') 
# my_categories.remove('CLASS-1957')
# target_names=my_categories

In [211]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Split the bag-of-words matrix back into train and test rows (train rows come first in combi).
# len(train) replaces the hard-coded 5566 so the split stays aligned if Train.csv changes size.
n_train = len(train)
train_bow = bow[:n_train, :]
test_bow = bow[n_train:, :]

# Hold out 25% of the labelled rows for validation.
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(train_bow, train['Product_Category'], random_state=42, test_size=0.25)

nb = MultinomialNB()
nb.fit(xtrain_bow, ytrain)
y_pred = nb.predict(xvalid_bow)
# Pass y_true first, matching the classification_report call.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))

accuracy 0.9806034482758621


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1248       1.00      1.00      1.00         1
  CLASS-1249       1.00      1.00      1.00        42
  CLASS-1250       0.99      1.00      1.00       113
  CLASS-1274       0.99      1.00      0.99       234
  CLASS-1294       1.00      1.00      1.00         5
  CLASS-1309       0.00      0.00      0.00        12
  CLASS-1322       1.00      1.00      1.00        10
  CLASS-1376       0.99      1.00      1.00       108
  CLASS-1429       1.00      0.90      0.95        10
  CLASS-1477       1.00      1.00      1.00        16
  CLASS-1522       1.00      1.00      1.00       201
  CLASS-1567       1.00      1.00      1.00        15
  CLASS-1652       1.00      1.00      1.00         3
  CLASS-1688       0.00      0.00      0.00         1
  CLASS-1721       0.65      1.00      0.79        32
  CLASS-1758       1.00      1.00      1.00       381
  CLASS-1770       1.00      1.00      1.00         5
  CLASS-1805       0.00    

In [212]:
# Split the TF-IDF matrix back into train and test rows (train rows come first in combi).
# len(train) replaces the hard-coded 5566 so the split stays aligned if Train.csv changes size.
n_train = len(train)
train_tfidf = tfidf[:n_train, :]
test_tfidf = tfidf[n_train:, :]

# Hold out 25% of the labelled rows for validation.
xtrain_tfidf, xvalid_tfidf, ytrain, yvalid = train_test_split(train_tfidf, train['Product_Category'], random_state=42, test_size=0.25)

nb = MultinomialNB()
nb.fit(xtrain_tfidf, ytrain)
y_pred = nb.predict(xvalid_tfidf)
# Pass y_true first, matching the classification_report call.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))

accuracy 0.9353448275862069


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1248       0.00      0.00      0.00         1
  CLASS-1249       1.00      0.76      0.86        42
  CLASS-1250       0.87      1.00      0.93       113
  CLASS-1274       0.96      1.00      0.98       234
  CLASS-1294       1.00      0.80      0.89         5
  CLASS-1309       0.00      0.00      0.00        12
  CLASS-1322       1.00      0.70      0.82        10
  CLASS-1376       0.99      1.00      1.00       108
  CLASS-1429       1.00      0.60      0.75        10
  CLASS-1477       0.84      1.00      0.91        16
  CLASS-1522       0.94      1.00      0.97       201
  CLASS-1567       0.88      1.00      0.94        15
  CLASS-1652       0.00      0.00      0.00         3
  CLASS-1688       0.00      0.00      0.00         1
  CLASS-1721       0.62      1.00      0.76        32
  CLASS-1758       0.99      1.00      0.99       381
  CLASS-1770       1.00      0.20      0.33         5
  CLASS-1805       0.00    

### NB + BoW performed best with accuracy 0.9806034482758621

# Logistic Regression Classifier for Multinomial Models

In [213]:
from sklearn.linear_model import LogisticRegression
# Split the bag-of-words matrix back into train and test rows (train rows come first in combi).
# len(train) replaces the hard-coded 5566 so the split stays aligned if Train.csv changes size.
n_train = len(train)
train_bow = bow[:n_train, :]
test_bow = bow[n_train:, :]

# Hold out 25% of the labelled rows for validation.
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(train_bow, train['Product_Category'], random_state=42, test_size=0.25)

# C=1e5 is a very large C value (weak regularisation).
lreg = LogisticRegression(n_jobs=-1,C=1e5)
lreg.fit(xtrain_bow, ytrain)
y_pred = lreg.predict(xvalid_bow)
# Pass y_true first, matching the classification_report call.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))

  " = {}.".format(effective_n_jobs(self.n_jobs)))


accuracy 0.9978448275862069


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1248       1.00      1.00      1.00         1
  CLASS-1249       1.00      1.00      1.00        42
  CLASS-1250       1.00      1.00      1.00       113
  CLASS-1274       1.00      1.00      1.00       234
  CLASS-1294       1.00      1.00      1.00         5
  CLASS-1309       1.00      1.00      1.00        12
  CLASS-1322       1.00      1.00      1.00        10
  CLASS-1376       1.00      1.00      1.00       108
  CLASS-1429       1.00      1.00      1.00        10
  CLASS-1477       1.00      1.00      1.00        16
  CLASS-1522       1.00      1.00      1.00       201
  CLASS-1567       0.94      1.00      0.97        15
  CLASS-1652       1.00      1.00      1.00         3
  CLASS-1688       1.00      1.00      1.00         1
  CLASS-1721       1.00      1.00      1.00        32
  CLASS-1758       1.00      1.00      1.00       381
  CLASS-1770       1.00      1.00      1.00         5
  CLASS-1805       1.00    

In [214]:
# Split the TF-IDF matrix back into train and test rows (train rows come first in combi).
# len(train) replaces the hard-coded 5566 so the split stays aligned if Train.csv changes size.
n_train = len(train)
train_tfidf = tfidf[:n_train, :]
test_tfidf = tfidf[n_train:, :]

# Hold out 25% of the labelled rows for validation.
xtrain_tfidf, xvalid_tfidf, ytrain, yvalid = train_test_split(train_tfidf, train['Product_Category'], random_state=42, test_size=0.25)

# C=1e5 is a very large C value (weak regularisation).
lreg = LogisticRegression(n_jobs=-1, C=1e5)
lreg.fit(xtrain_tfidf, ytrain)
y_pred = lreg.predict(xvalid_tfidf)
# Pass y_true first, matching the classification_report call.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))

  " = {}.".format(effective_n_jobs(self.n_jobs)))


accuracy 0.9971264367816092


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1248       1.00      1.00      1.00         1
  CLASS-1249       1.00      1.00      1.00        42
  CLASS-1250       0.99      1.00      1.00       113
  CLASS-1274       1.00      1.00      1.00       234
  CLASS-1294       1.00      1.00      1.00         5
  CLASS-1309       1.00      1.00      1.00        12
  CLASS-1322       1.00      1.00      1.00        10
  CLASS-1376       1.00      1.00      1.00       108
  CLASS-1429       1.00      1.00      1.00        10
  CLASS-1477       1.00      1.00      1.00        16
  CLASS-1522       1.00      1.00      1.00       201
  CLASS-1567       1.00      1.00      1.00        15
  CLASS-1652       1.00      1.00      1.00         3
  CLASS-1688       1.00      1.00      1.00         1
  CLASS-1721       1.00      1.00      1.00        32
  CLASS-1758       1.00      1.00      1.00       381
  CLASS-1770       1.00      1.00      1.00         5
  CLASS-1805       1.00    

In [277]:
# Split the averaged Word2Vec features back into train and test rows (train rows come first in combi).
# len(train) replaces the hard-coded 5566 so the split stays aligned if Train.csv changes size.
n_train = len(train)
train_w2v = wordvec_df.iloc[:n_train, :]
test_w2v = wordvec_df.iloc[n_train:, :]

# Hold out 25% of the labelled rows for validation.
xtrain_w2v, xvalid_w2v, ytrain, yvalid = train_test_split(train_w2v, train['Product_Category'], random_state=42, test_size=0.25)

# C=1e5 is a very large C value (weak regularisation).
lreg = LogisticRegression(n_jobs=-1,C=1e5)
lreg.fit(xtrain_w2v, ytrain)
y_pred = lreg.predict(xvalid_w2v)
# Pass y_true first, matching the classification_report call.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))

  " = {}.".format(effective_n_jobs(self.n_jobs)))


accuracy 0.9985632183908046


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1248       1.00      1.00      1.00         1
  CLASS-1249       1.00      1.00      1.00        42
  CLASS-1250       0.99      1.00      1.00       113
  CLASS-1274       1.00      1.00      1.00       234
  CLASS-1294       1.00      1.00      1.00         5
  CLASS-1309       1.00      1.00      1.00        12
  CLASS-1322       1.00      1.00      1.00        10
  CLASS-1376       1.00      1.00      1.00       108
  CLASS-1429       1.00      1.00      1.00        10
  CLASS-1477       1.00      1.00      1.00        16
  CLASS-1522       1.00      1.00      1.00       201
  CLASS-1567       1.00      1.00      1.00        15
  CLASS-1652       1.00      1.00      1.00         3
  CLASS-1688       1.00      1.00      1.00         1
  CLASS-1721       1.00      1.00      1.00        32
  CLASS-1758       1.00      1.00      1.00       381
  CLASS-1770       1.00      1.00      1.00         5
  CLASS-1805       1.00    

In [216]:
# Split the Doc2Vec features back into train and test rows (train rows come first in combi).
# len(train) replaces the hard-coded 5566 so the split stays aligned if Train.csv changes size.
n_train = len(train)
train_d2v = docvec_df.iloc[:n_train, :]
test_d2v = docvec_df.iloc[n_train:, :]

# Hold out 25% of the labelled rows for validation.
xtrain_d2v, xvalid_d2v, ytrain, yvalid = train_test_split(train_d2v, train['Product_Category'], random_state=42, test_size=0.25)

# C=1e5 is a very large C value (weak regularisation).
lreg = LogisticRegression(n_jobs=-1,C=1e5)
lreg.fit(xtrain_d2v, ytrain)
y_pred = lreg.predict(xvalid_d2v)
# Pass y_true first, matching the classification_report call.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))

  " = {}.".format(effective_n_jobs(self.n_jobs)))


accuracy 0.8692528735632183


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1248       0.00      0.00      0.00         1
  CLASS-1249       0.79      0.71      0.75        42
  CLASS-1250       0.88      0.96      0.92       113
  CLASS-1274       0.96      0.96      0.96       234
  CLASS-1294       0.60      0.60      0.60         5
  CLASS-1309       0.60      0.50      0.55        12
  CLASS-1322       0.83      0.50      0.62        10
  CLASS-1376       0.89      0.94      0.91       108
  CLASS-1429       0.67      0.60      0.63        10
  CLASS-1477       0.83      0.94      0.88        16
  CLASS-1522       0.92      0.92      0.92       201
  CLASS-1567       0.63      0.80      0.71        15
  CLASS-1652       1.00      0.67      0.80         3
  CLASS-1688       1.00      1.00      1.00         1
  CLASS-1721       0.96      0.75      0.84        32
  CLASS-1758       0.94      0.96      0.95       381
  CLASS-1770       1.00      0.60      0.75         5
  CLASS-1805       0.00    

#### LR + W2V performed best with accuracy 0.9985632183908046

# Support Vector Classifier

In [158]:
from sklearn.linear_model import SGDClassifier
from sklearn import svm

In [164]:
# Split the bag-of-words matrix back into train and test rows (train rows come first in combi).
# len(train) replaces the hard-coded 5566 so the split stays aligned if Train.csv changes size.
n_train = len(train)
train_bow = bow[:n_train, :]
test_bow = bow[n_train:, :]

# Hold out 25% of the labelled rows for validation.
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(train_bow, train['Product_Category'], random_state=42, test_size=0.25)

svc = svm.SVC(C=1e5)
svc.fit(xtrain_bow, ytrain)
y_pred = svc.predict(xvalid_bow)
# Pass y_true first, matching the classification_report call.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))

accuracy 0.9985632183908046


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1477       1.00      1.00      1.00         1
  CLASS-1652       1.00      1.00      1.00        42
  CLASS-2003       1.00      1.00      1.00       113
  CLASS-2038       1.00      1.00      1.00       234
  CLASS-1274       1.00      1.00      1.00         5
  CLASS-1919       1.00      1.00      1.00        12
  CLASS-1721       1.00      1.00      1.00        10
  CLASS-1850       1.00      1.00      1.00       108
  CLASS-2141       1.00      1.00      1.00        10
  CLASS-1322       1.00      1.00      1.00        16
  CLASS-1567       1.00      1.00      1.00       201
  CLASS-1688       0.94      1.00      0.97        15
  CLASS-1248       1.00      1.00      1.00         3
  CLASS-1983       1.00      1.00      1.00         1
  CLASS-1828       1.00      1.00      1.00        32
  CLASS-1250       1.00      1.00      1.00       381
  CLASS-1758       1.00      1.00      1.00         5
  CLASS-2112       1.00    

In [178]:
# train_bow = bow[:5566,:]
# test_bow = bow[5566:,:]

# xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(train_bow, train['Product_Category'], random_state=42, test_size=0.25)

# sgd = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=100, tol=None)
# sgd.fit(xtrain_bow, ytrain)
# y_pred = sgd.predict(xvalid_bow)
# print('accuracy %s' % accuracy_score(y_pred, yvalid))
# print(classification_report(yvalid, y_pred, target_names=my_categories))

In [217]:
# Split the TF-IDF matrix back into train and test rows (train rows come first in combi).
# len(train) replaces the hard-coded 5566 so the split stays aligned if Train.csv changes size.
n_train = len(train)
train_tfidf = tfidf[:n_train, :]
test_tfidf = tfidf[n_train:, :]

# Hold out 25% of the labelled rows for validation.
xtrain_tfidf, xvalid_tfidf, ytrain, yvalid = train_test_split(train_tfidf, train['Product_Category'], random_state=42, test_size=0.25)

svc = svm.SVC(C=1e5)
svc.fit(xtrain_tfidf, ytrain)
y_pred = svc.predict(xvalid_tfidf)
# Pass y_true first, matching the classification_report call.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))



accuracy 0.9992816091954023


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1248       1.00      1.00      1.00         1
  CLASS-1249       1.00      1.00      1.00        42
  CLASS-1250       1.00      1.00      1.00       113
  CLASS-1274       1.00      1.00      1.00       234
  CLASS-1294       1.00      1.00      1.00         5
  CLASS-1309       1.00      1.00      1.00        12
  CLASS-1322       1.00      1.00      1.00        10
  CLASS-1376       1.00      1.00      1.00       108
  CLASS-1429       1.00      1.00      1.00        10
  CLASS-1477       1.00      1.00      1.00        16
  CLASS-1522       1.00      1.00      1.00       201
  CLASS-1567       1.00      1.00      1.00        15
  CLASS-1652       1.00      1.00      1.00         3
  CLASS-1688       1.00      1.00      1.00         1
  CLASS-1721       1.00      1.00      1.00        32
  CLASS-1758       1.00      1.00      1.00       381
  CLASS-1770       1.00      1.00      1.00         5
  CLASS-1805       1.00    

In [177]:
# train_tfidf = tfidf[:5566,:]
# test_tfidf = tfidf[5566:,:] 

# xtrain_tfidf, xvalid_tfidf, ytrain, yvalid = train_test_split(train_tfidf, train['Product_Category'], random_state=42, test_size=0.25)

# sgd = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=100, tol=None)
# sgd.fit(xtrain_tfidf, ytrain)
# y_pred = sgd.predict(xvalid_tfidf)
# print('accuracy %s' % accuracy_score(y_pred, yvalid))
# print(classification_report(yvalid, y_pred, target_names=my_categories))

In [286]:
# Split the averaged Word2Vec features back into train and test rows (train rows come first in combi).
# len(train) replaces the hard-coded 5566 so the split stays aligned if Train.csv changes size.
n_train = len(train)
train_w2v = wordvec_df.iloc[:n_train, :]
test_w2v = wordvec_df.iloc[n_train:, :]

# Hold out 25% of the labelled rows for validation.
xtrain_w2v, xvalid_w2v, ytrain, yvalid = train_test_split(train_w2v, train['Product_Category'], random_state=42, test_size=0.25)

svc = svm.SVC(C=1e5)
svc.fit(xtrain_w2v, ytrain)
y_pred = svc.predict(xvalid_w2v)
# Pass y_true first, matching the classification_report call.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))



accuracy 0.9992816091954023


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1248       1.00      1.00      1.00         1
  CLASS-1249       1.00      1.00      1.00        42
  CLASS-1250       0.99      1.00      1.00       113
  CLASS-1274       1.00      1.00      1.00       234
  CLASS-1294       1.00      1.00      1.00         5
  CLASS-1309       1.00      1.00      1.00        12
  CLASS-1322       1.00      1.00      1.00        10
  CLASS-1376       1.00      1.00      1.00       108
  CLASS-1429       1.00      1.00      1.00        10
  CLASS-1477       1.00      1.00      1.00        16
  CLASS-1522       1.00      1.00      1.00       201
  CLASS-1567       1.00      1.00      1.00        15
  CLASS-1652       1.00      1.00      1.00         3
  CLASS-1688       1.00      1.00      1.00         1
  CLASS-1721       1.00      1.00      1.00        32
  CLASS-1758       1.00      1.00      1.00       381
  CLASS-1770       1.00      1.00      1.00         5
  CLASS-1805       1.00    

In [176]:
# train_w2v = wordvec_df.iloc[:5566,:]
# test_w2v = wordvec_df.iloc[5566:,:] 

# xtrain_w2v, xvalid_w2v, ytrain, yvalid = train_test_split(train_w2v, train['Product_Category'], random_state=42, test_size=0.25)

# sgd = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=100, tol=None)
# sgd.fit(xtrain_w2v, ytrain)
# y_pred = sgd.predict(xvalid_w2v)
# print('accuracy %s' % accuracy_score(y_pred, yvalid))
# print(classification_report(yvalid, y_pred, target_names=my_categories))

In [179]:
# Split the Doc2Vec features back into train and test rows (train rows come first in combi).
# len(train) replaces the hard-coded 5566 so the split stays aligned if Train.csv changes size.
n_train = len(train)
train_d2v = docvec_df.iloc[:n_train, :]
test_d2v = docvec_df.iloc[n_train:, :]

# Hold out 25% of the labelled rows for validation.
xtrain_d2v, xvalid_d2v, ytrain, yvalid = train_test_split(train_d2v, train['Product_Category'], random_state=42, test_size=0.25)

svc = svm.SVC(C=1e5)
svc.fit(xtrain_d2v, ytrain)
y_pred = svc.predict(xvalid_d2v)
# Pass y_true first, matching the classification_report call.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))



accuracy 0.9030172413793104


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1477       0.00      0.00      0.00         1
  CLASS-1652       0.89      0.79      0.84        42
  CLASS-2003       0.95      1.00      0.97       113
  CLASS-2038       0.95      0.99      0.97       234
  CLASS-1274       0.62      1.00      0.77         5
  CLASS-1919       0.89      0.67      0.76        12
  CLASS-1721       0.44      0.40      0.42        10
  CLASS-1850       0.88      0.98      0.93       108
  CLASS-2141       0.78      0.70      0.74        10
  CLASS-1322       1.00      0.81      0.90        16
  CLASS-1567       0.96      0.92      0.94       201
  CLASS-1688       0.72      0.87      0.79        15
  CLASS-1248       1.00      1.00      1.00         3
  CLASS-1983       1.00      1.00      1.00         1
  CLASS-1828       0.85      0.69      0.76        32
  CLASS-1250       0.97      0.97      0.97       381
  CLASS-1758       0.50      0.60      0.55         5
  CLASS-2112       0.00    

#### SVC + TFIDF and SVC + W2V performed best with accuracy 0.9992816091954023

# Random Forest

In [181]:
from sklearn.ensemble import RandomForestClassifier

In [187]:
# Random Forest on Bag-of-Words features: 75/25 train/validation split, then report metrics.
# The split point is derived from the training frame instead of the hard-coded 5566
# (train has 5566 rows, so the slices are identical to the original ones).
# Assumes bow holds the training rows first, followed by the test rows — TODO confirm.
n_train = len(train)
train_bow = bow[:n_train, :]
test_bow = bow[n_train:, :]

xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(
    train_bow, train['Product_Category'], random_state=42, test_size=0.25
)

rf = RandomForestClassifier(n_jobs=-1, n_estimators=1000, random_state=42)
rf.fit(xtrain_bow, ytrain)
y_pred = rf.predict(xvalid_bow)
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the printed value is unchanged.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))

accuracy 0.9964080459770115


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1477       1.00      1.00      1.00         1
  CLASS-1652       1.00      1.00      1.00        42
  CLASS-2003       1.00      1.00      1.00       113
  CLASS-2038       1.00      1.00      1.00       234
  CLASS-1274       1.00      1.00      1.00         5
  CLASS-1919       1.00      1.00      1.00        12
  CLASS-1721       1.00      1.00      1.00        10
  CLASS-1850       1.00      1.00      1.00       108
  CLASS-2141       1.00      1.00      1.00        10
  CLASS-1322       1.00      1.00      1.00        16
  CLASS-1567       1.00      1.00      1.00       201
  CLASS-1688       1.00      1.00      1.00        15
  CLASS-1248       1.00      1.00      1.00         3
  CLASS-1983       1.00      1.00      1.00         1
  CLASS-1828       1.00      1.00      1.00        32
  CLASS-1250       1.00      1.00      1.00       381
  CLASS-1758       1.00      1.00      1.00         5
  CLASS-2112       1.00    

In [188]:
# Random Forest on TF-IDF features: 75/25 train/validation split, then report metrics.
# The split point is derived from the training frame instead of the hard-coded 5566
# (train has 5566 rows, so the slices are identical to the original ones).
# Assumes tfidf holds the training rows first, followed by the test rows — TODO confirm.
n_train = len(train)
train_tfidf = tfidf[:n_train, :]
test_tfidf = tfidf[n_train:, :]

xtrain_tfidf, xvalid_tfidf, ytrain, yvalid = train_test_split(
    train_tfidf, train['Product_Category'], random_state=42, test_size=0.25
)

rf = RandomForestClassifier(n_jobs=-1, n_estimators=1000, random_state=42)
rf.fit(xtrain_tfidf, ytrain)
y_pred = rf.predict(xvalid_tfidf)
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the printed value is unchanged.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))

accuracy 0.9964080459770115


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1477       0.50      1.00      0.67         1
  CLASS-1652       1.00      1.00      1.00        42
  CLASS-2003       1.00      1.00      1.00       113
  CLASS-2038       1.00      1.00      1.00       234
  CLASS-1274       1.00      1.00      1.00         5
  CLASS-1919       1.00      1.00      1.00        12
  CLASS-1721       1.00      1.00      1.00        10
  CLASS-1850       1.00      1.00      1.00       108
  CLASS-2141       1.00      1.00      1.00        10
  CLASS-1322       1.00      1.00      1.00        16
  CLASS-1567       1.00      1.00      1.00       201
  CLASS-1688       1.00      1.00      1.00        15
  CLASS-1248       1.00      1.00      1.00         3
  CLASS-1983       1.00      1.00      1.00         1
  CLASS-1828       1.00      1.00      1.00        32
  CLASS-1250       1.00      1.00      1.00       381
  CLASS-1758       1.00      1.00      1.00         5
  CLASS-2112       1.00    

In [189]:
# Random Forest on Word2Vec features: 75/25 train/validation split, then report metrics.
# The split point is derived from the training frame instead of the hard-coded 5566
# (train has 5566 rows, so the slices are identical to the original ones).
# Assumes wordvec_df holds the training rows first, followed by the test rows — TODO confirm.
n_train = len(train)
train_w2v = wordvec_df.iloc[:n_train, :]
test_w2v = wordvec_df.iloc[n_train:, :]

xtrain_w2v, xvalid_w2v, ytrain, yvalid = train_test_split(
    train_w2v, train['Product_Category'], random_state=42, test_size=0.25
)

rf = RandomForestClassifier(n_jobs=-1, n_estimators=1000, random_state=42)
rf.fit(xtrain_w2v, ytrain)
y_pred = rf.predict(xvalid_w2v)
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the printed value is unchanged.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))

accuracy 0.9935344827586207


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1477       1.00      1.00      1.00         1
  CLASS-1652       1.00      1.00      1.00        42
  CLASS-2003       0.99      1.00      1.00       113
  CLASS-2038       1.00      1.00      1.00       234
  CLASS-1274       1.00      1.00      1.00         5
  CLASS-1919       1.00      0.83      0.91        12
  CLASS-1721       1.00      1.00      1.00        10
  CLASS-1850       1.00      1.00      1.00       108
  CLASS-2141       1.00      1.00      1.00        10
  CLASS-1322       1.00      1.00      1.00        16
  CLASS-1567       1.00      1.00      1.00       201
  CLASS-1688       1.00      1.00      1.00        15
  CLASS-1248       1.00      1.00      1.00         3
  CLASS-1983       1.00      1.00      1.00         1
  CLASS-1828       0.89      1.00      0.94        32
  CLASS-1250       1.00      1.00      1.00       381
  CLASS-1758       1.00      1.00      1.00         5
  CLASS-2112       1.00    

In [190]:
# Random Forest on Doc2Vec features: 75/25 train/validation split, then report metrics.
# The split point is derived from the training frame instead of the hard-coded 5566
# (train has 5566 rows, so the slices are identical to the original ones).
# Assumes docvec_df holds the training rows first, followed by the test rows — TODO confirm.
n_train = len(train)
train_d2v = docvec_df.iloc[:n_train, :]
test_d2v = docvec_df.iloc[n_train:, :]

xtrain_d2v, xvalid_d2v, ytrain, yvalid = train_test_split(
    train_d2v, train['Product_Category'], random_state=42, test_size=0.25
)

rf = RandomForestClassifier(n_jobs=-1, n_estimators=1000, random_state=42)
rf.fit(xtrain_d2v, ytrain)
y_pred = rf.predict(xvalid_d2v)
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the printed value is unchanged.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))

accuracy 0.7557471264367817


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1477       0.00      0.00      0.00         1
  CLASS-1652       0.94      0.40      0.57        42
  CLASS-2003       0.92      0.97      0.94       113
  CLASS-2038       0.74      0.94      0.83       234
  CLASS-1274       1.00      0.20      0.33         5
  CLASS-1919       0.00      0.00      0.00        12
  CLASS-1721       0.00      0.00      0.00        10
  CLASS-1850       0.76      0.82      0.79       108
  CLASS-2141       0.00      0.00      0.00        10
  CLASS-1322       1.00      0.06      0.12        16
  CLASS-1567       0.81      0.87      0.84       201
  CLASS-1688       0.80      0.27      0.40        15
  CLASS-1248       0.00      0.00      0.00         3
  CLASS-1983       0.00      0.00      0.00         1
  CLASS-1828       0.92      0.38      0.53        32
  CLASS-1250       0.70      0.99      0.82       381
  CLASS-1758       0.00      0.00      0.00         5
  CLASS-2112       0.00    

# XG Boost

In [191]:
from xgboost import XGBClassifier

In [279]:
# XGBoost on Bag-of-Words features: 75/25 train/validation split, then report metrics.
# The split point is derived from the training frame instead of the hard-coded 5566
# (train has 5566 rows, so the slices are identical to the original ones).
# Assumes bow holds the training rows first, followed by the test rows — TODO confirm.
n_train = len(train)
train_bow = bow[:n_train, :]
test_bow = bow[n_train:, :]

xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(
    train_bow, train['Product_Category'], random_state=42, test_size=0.25
)

xgb_model = XGBClassifier(n_jobs=-1, n_estimators=1000, max_depth=10, random_state=42)
xgb_model.fit(xtrain_bow, ytrain)
y_pred = xgb_model.predict(xvalid_bow)
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the printed value is unchanged.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))

accuracy 0.9985632183908046


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1248       0.50      1.00      0.67         1
  CLASS-1249       1.00      1.00      1.00        42
  CLASS-1250       1.00      1.00      1.00       113
  CLASS-1274       1.00      1.00      1.00       234
  CLASS-1294       1.00      1.00      1.00         5
  CLASS-1309       1.00      1.00      1.00        12
  CLASS-1322       1.00      1.00      1.00        10
  CLASS-1376       1.00      1.00      1.00       108
  CLASS-1429       1.00      1.00      1.00        10
  CLASS-1477       1.00      1.00      1.00        16
  CLASS-1522       1.00      1.00      1.00       201
  CLASS-1567       1.00      1.00      1.00        15
  CLASS-1652       1.00      1.00      1.00         3
  CLASS-1688       1.00      1.00      1.00         1
  CLASS-1721       1.00      1.00      1.00        32
  CLASS-1758       1.00      1.00      1.00       381
  CLASS-1770       1.00      1.00      1.00         5
  CLASS-1805       1.00    

In [200]:
# XGBoost on TF-IDF features: 75/25 train/validation split, then report metrics.
# The split point is derived from the training frame instead of the hard-coded 5566
# (train has 5566 rows, so the slices are identical to the original ones).
# Assumes tfidf holds the training rows first, followed by the test rows — TODO confirm.
n_train = len(train)
train_tfidf = tfidf[:n_train, :]
test_tfidf = tfidf[n_train:, :]

xtrain_tfidf, xvalid_tfidf, ytrain, yvalid = train_test_split(
    train_tfidf, train['Product_Category'], random_state=42, test_size=0.25
)

xgb_model = XGBClassifier(n_jobs=-1, n_estimators=1000, max_depth=10, random_state=42)
xgb_model.fit(xtrain_tfidf, ytrain)
y_pred = xgb_model.predict(xvalid_tfidf)
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the printed value is unchanged.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))

accuracy 0.9964080459770115


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1477       0.50      1.00      0.67         1
  CLASS-1652       1.00      1.00      1.00        42
  CLASS-2003       1.00      1.00      1.00       113
  CLASS-2038       1.00      1.00      1.00       234
  CLASS-1274       1.00      1.00      1.00         5
  CLASS-1919       0.86      1.00      0.92        12
  CLASS-1721       1.00      1.00      1.00        10
  CLASS-1850       1.00      1.00      1.00       108
  CLASS-2141       1.00      1.00      1.00        10
  CLASS-1322       1.00      1.00      1.00        16
  CLASS-1567       1.00      1.00      1.00       201
  CLASS-1688       1.00      1.00      1.00        15
  CLASS-1248       1.00      1.00      1.00         3
  CLASS-1983       1.00      1.00      1.00         1
  CLASS-1828       1.00      1.00      1.00        32
  CLASS-1250       1.00      1.00      1.00       381
  CLASS-1758       1.00      1.00      1.00         5
  CLASS-2112       1.00    

In [280]:
# XGBoost on Word2Vec features: 75/25 train/validation split, then report metrics.
# The split point is derived from the training frame instead of the hard-coded 5566
# (train has 5566 rows, so the slices are identical to the original ones).
# Assumes wordvec_df holds the training rows first, followed by the test rows — TODO confirm.
n_train = len(train)
train_w2v = wordvec_df.iloc[:n_train, :]
test_w2v = wordvec_df.iloc[n_train:, :]

xtrain_w2v, xvalid_w2v, ytrain, yvalid = train_test_split(
    train_w2v, train['Product_Category'], random_state=42, test_size=0.25
)

xgb_model = XGBClassifier(n_jobs=-1, n_estimators=1000, max_depth=10, random_state=42)
xgb_model.fit(xtrain_w2v, ytrain)
y_pred = xgb_model.predict(xvalid_w2v)
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the printed value is unchanged.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))

accuracy 0.992816091954023


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1248       1.00      1.00      1.00         1
  CLASS-1249       0.98      1.00      0.99        42
  CLASS-1250       1.00      1.00      1.00       113
  CLASS-1274       1.00      1.00      1.00       234
  CLASS-1294       1.00      1.00      1.00         5
  CLASS-1309       1.00      1.00      1.00        12
  CLASS-1322       1.00      0.70      0.82        10
  CLASS-1376       1.00      1.00      1.00       108
  CLASS-1429       1.00      0.90      0.95        10
  CLASS-1477       0.94      1.00      0.97        16
  CLASS-1522       1.00      1.00      1.00       201
  CLASS-1567       1.00      1.00      1.00        15
  CLASS-1652       1.00      1.00      1.00         3
  CLASS-1688       0.50      1.00      0.67         1
  CLASS-1721       0.97      1.00      0.98        32
  CLASS-1758       1.00      1.00      1.00       381
  CLASS-1770       1.00      1.00      1.00         5
  CLASS-1805       1.00    

In [202]:
# XGBoost on Doc2Vec features: 75/25 train/validation split, then report metrics.
# The split point is derived from the training frame instead of the hard-coded 5566
# (train has 5566 rows, so the slices are identical to the original ones).
# Assumes docvec_df holds the training rows first, followed by the test rows — TODO confirm.
n_train = len(train)
train_d2v = docvec_df.iloc[:n_train, :]
test_d2v = docvec_df.iloc[n_train:, :]

xtrain_d2v, xvalid_d2v, ytrain, yvalid = train_test_split(
    train_d2v, train['Product_Category'], random_state=42, test_size=0.25
)

xgb_model = XGBClassifier(n_jobs=-1, n_estimators=1000, max_depth=10, random_state=42)
xgb_model.fit(xtrain_d2v, ytrain)
y_pred = xgb_model.predict(xvalid_d2v)
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the printed value is unchanged.
print('accuracy %s' % accuracy_score(yvalid, y_pred))
print(classification_report(yvalid, y_pred))

accuracy 0.8520114942528736


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

  CLASS-1477       0.00      0.00      0.00         1
  CLASS-1652       0.74      0.60      0.66        42
  CLASS-2003       0.95      0.98      0.97       113
  CLASS-2038       0.87      0.98      0.92       234
  CLASS-1274       0.60      0.60      0.60         5
  CLASS-1919       1.00      0.17      0.29        12
  CLASS-1721       0.50      0.20      0.29        10
  CLASS-1850       0.83      0.91      0.87       108
  CLASS-2141       1.00      0.30      0.46        10
  CLASS-1322       1.00      0.62      0.77        16
  CLASS-1567       0.91      0.92      0.91       201
  CLASS-1688       0.58      0.47      0.52        15
  CLASS-1248       1.00      0.67      0.80         3
  CLASS-1983       0.00      0.00      0.00         1
  CLASS-1828       0.76      0.59      0.67        32
  CLASS-1250       0.90      0.97      0.94       381
  CLASS-1758       1.00      0.40      0.57         5
  CLASS-2112       0.00    

# Best Performing Models:
# LR + W2V, SVC + TFIDF and SVC + W2V performed best with accuracy 0.9992816091954023


### Now we retrain each of these 3 models on the full training set and use them to predict the test set.

In [270]:
# Logistic Regression + Word2Vec: refit on every labelled row, predict the test rows,
# and write the predictions to a submission CSV.
# The split point is derived from the training frame instead of the hard-coded 5566
# (train has 5566 rows, so the slices are identical to the original ones).
# Assumes wordvec_df holds the training rows first, followed by the test rows — TODO confirm.
n_train = len(train)
train_w2v = wordvec_df.iloc[:n_train, :]
test_w2v = wordvec_df.iloc[n_train:, :]

# NOTE(review): `lreg` is not created in this cell; it is whatever estimator an earlier
# executed cell left in the kernel. Confirm its hyper-parameters before re-running.
lreg.fit(train_w2v, train['Product_Category'])
y_pred_lr_w2v = lreg.predict(test_w2v)

# One row per test invoice: its id and the predicted category.
submission = pd.DataFrame()
submission['Inv_Id'] = test['Inv_Id']
submission['Product_Category'] = y_pred_lr_w2v
submission.to_csv('submission_Krishna_Priya_LR+W2V.csv', index=False)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


In [221]:
# SVC + TF-IDF: refit on every labelled row, predict the test rows,
# and write the predictions to a submission CSV.
# The split point is derived from the training frame instead of the hard-coded 5566
# (train has 5566 rows, so the slices are identical to the original ones).
# Assumes tfidf holds the training rows first, followed by the test rows — TODO confirm.
n_train = len(train)
train_tfidf = tfidf[:n_train, :]
test_tfidf = tfidf[n_train:, :]

# NOTE(review): `svc` is not created in this cell; it is whatever estimator an earlier
# executed cell left in the kernel. Confirm its hyper-parameters before re-running.
svc.fit(train_tfidf, train['Product_Category'])
y_pred_svc_tfidf = svc.predict(test_tfidf)

# One row per test invoice: its id and the predicted category.
submission = pd.DataFrame()
submission['Inv_Id'] = test['Inv_Id']
submission['Product_Category'] = y_pred_svc_tfidf
submission.to_csv('submission_Krishna_Priya_SVC+TFIDF.csv', index=False)



In [288]:
# SVC + Word2Vec: refit on every labelled row, predict the test rows,
# and write the predictions to a submission CSV.
# The split point is derived from the training frame instead of the hard-coded 5566
# (train has 5566 rows, so the slices are identical to the original ones).
# Assumes wordvec_df holds the training rows first, followed by the test rows — TODO confirm.
n_train = len(train)
train_w2v = wordvec_df.iloc[:n_train, :]
test_w2v = wordvec_df.iloc[n_train:, :]

# NOTE(review): `svc` is not created in this cell; it is whatever estimator an earlier
# executed cell left in the kernel. Confirm its hyper-parameters before re-running.
svc.fit(train_w2v, train['Product_Category'])
y_pred_svc_w2v = svc.predict(test_w2v)

# One row per test invoice: its id and the predicted category.
submission = pd.DataFrame()
submission['Inv_Id'] = test['Inv_Id']
submission['Product_Category'] = y_pred_svc_w2v
submission.to_csv('submission_Krishna_Priya_SVC+W2V.csv', index=False)



# The Word2Vec model performs best on both the validation data and the test data.
# Logistic Regression and SVC also give almost identical results on it, which shows that our Word2Vec model is robust.
# Getting 99.96% accuracy on the test data suggests that only 1 class is not being predicted correctly. As far as I can see, we have only 1 training example for class 1838, so that is most likely the class being mispredicted; this should improve as soon as we give the model more training data for that class.