####Amazon catalog consists of billions of products that belong to thousands of browse nodes (each browse node represents a collection of items for sale). Browse nodes are used to help customer navigate through amazon website and classify products to product type groups. Hence, it is important to predict the node assignment at the time of listing of the product or when the browse node information is absent.


**Data Description**

Full Train/Test dataset details:

*   Key column – PRODUCT_ID
*   Input features – TITLE, DESCRIPTION, BULLET_POINTS, BRAND
*   Target column – BROWSE_NODE_ID
*   Train dataset size – 2,903,024
*   Number of classes in Train – 9,919
*   Overall Test dataset size – 110,775


### loading dataset

In [1]:
!wget -q !wget https://s3-ap-southeast-1.amazonaws.com/he-public-data/dataset52a7b21.zip

In [2]:
! unzip /content/dataset52a7b21.zip

Archive:  /content/dataset52a7b21.zip
   creating: dataset/
  inflating: dataset/train.csv       
  inflating: dataset/sample_submission.csv  
  inflating: dataset/test.csv        
  inflating: dataset/.~lock.train.csv#  


### let's import necessary library



In [1]:
import pandas as pd
import numpy as np
import os
import re
import csv

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
from itertools import filterfalse
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [3]:
!pip install contractions



###read dataset

In [4]:
train_data =pd.read_csv("/content/dataset/train.csv",escapechar="\\",quoting=csv.QUOTE_NONE,nrows=150000)
train_data.shape

(150000, 5)

In [5]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### fill null values with 0

In [7]:
train_data=train_data.fillna(0)


### combining all text containing columns for better evaluation

In [8]:
cols = ['TITLE','DESCRIPTION','BULLET_POINTS']
train_data["text"] =train_data["TITLE"].astype(str) + train_data["DESCRIPTION"].astype(str) + train_data["BULLET_POINTS"].astype(str)
train_data.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID,text
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0,"Pete The Cat Bedtime Blues Doll, 14.5 InchPete..."
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ..."
2,The Ultimate Self-Sufficiency Handbook: A Comp...,0,Skyhorse Publishing,imusti,2,The Ultimate Self-Sufficiency Handbook: A Comp...
3,Amway Nutrilite Kids Chewable Iron Tablets (100),0,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,3,Amway Nutrilite Kids Chewable Iron Tablets (10...
4,Teacher Planner Company A4 6 Lesson Academic T...,0,0,0,4,Teacher Planner Company A4 6 Lesson Academic T...


### let's take the necessary column which is further used for classification

In [9]:
train_data=train_data.drop(cols, axis=1)
train_data.head()

Unnamed: 0,BRAND,BROWSE_NODE_ID,text
0,MerryMakers,0,"Pete The Cat Bedtime Blues Doll, 14.5 InchPete..."
1,The New Yorker,1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ..."
2,imusti,2,The Ultimate Self-Sufficiency Handbook: A Comp...
3,Amway,3,Amway Nutrilite Kids Chewable Iron Tablets (10...
4,0,4,Teacher Planner Company A4 6 Lesson Academic T...


### replacing punctuations with " "

In [10]:
import re
punc = r'[^\w\s]|^[a-zA-z]+|[a-zA-z]+"" "'
for ele in train_data:
	if ele in punc:
		train_data =train_data.replace(ele, "")


In [11]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
from string import punctuation
stop_words = stopwords.words("english")
punctuation = punctuation + '\n' + '—' + '“' + ',' + '”' + '‘' + '-' + '’'

In [13]:
import string

### make tokens from text using word_tokenize

In [14]:
train_data['text'] = train_data['text'].apply(lambda x: word_tokenize(x))

In [15]:
def normalize_tokens(list_of_tokens):
    return map(lambda x: x.lower(),list_of_tokens)

### let's convert all the letters in lower case


In [16]:
train_data['text'] = train_data['text'].apply(lambda x: normalize_tokens(x))
train_data['text'] =train_data['text'].apply(lambda x: list(x))



In [17]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### contractions expansion

In [18]:
def contracted_word_expansion(token):
    if token in contractions_dict.keys():
        return contractions_dict[token]
    else:
        return token

In [19]:
def contractions_expansion(list_of_tokens):
    return map(contracted_word_expansion,list_of_tokens)

In [20]:
from contractions import contractions_dict

In [21]:
train_data['text'] = train_data['text'].apply(lambda x: contractions_expansion(x))
train_data['text'] = train_data['text'].apply(lambda x: list(x))

### regex to remove unnecessary things from text

In [22]:
regex = r'^@[a-zA-z0-9]|^#[a-zA-Z0-9]|\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*|\W+|\d+|<("[^"]*"|\'[^\']*\'|[^\'">])*>|_+|[^\u0000-\u007f]+'

In [23]:
def waste_word_or_not(token):
    return re.search(regex,token)

In [24]:
def filter_waste_words(list_of_tokens):
    return filterfalse(waste_word_or_not,list_of_tokens)

In [25]:
train_data['text'] = train_data['text'].apply(lambda x: filter_waste_words(x))
train_data['text'] = train_data['text'].apply(lambda x: list(x))

In [26]:
def split(list_of_tokens):
    return map(lambda x: re.split(regex,x)[0],list_of_tokens)

In [27]:
train_data['text'] = train_data['text'].apply(lambda x: split(x))
train_data['text'] = train_data['text'].apply(lambda x: list(x))


### let's remove stopwards

In [28]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
en_stop_words = list(set(stopwords.words('english')).union(set(STOP_WORDS)))

In [30]:
def is_stopword(token):
    return not(token in en_stop_words or re.search(r'\b\w\b|[^\u0000-\u007f]+|_+|\W+',token))

In [31]:
def stopwords_removal(list_of_tokens):
    return filter(is_stopword,list_of_tokens)

In [32]:
train_data['text'] = train_data['text'].apply(lambda x: stopwords_removal(x))
train_data['text'] = train_data['text'].apply(lambda x: list(x))

### function to apply stemming



In [33]:
from nltk.stem import PorterStemmer
porter = PorterStemmer()

def stemmer(stem_text):
    stem_text = [porter.stem(word) for word in stem_text]
    return " ".join(stem_text)

In [34]:
train_data['text'] = train_data['text'].apply(stemmer)


### again tokenize the lines into words


In [35]:

train_data['text'] = train_data['text'].apply(lambda x: word_tokenize(x))

### let's collect 10 most occuring words from each string

In [36]:
from collections import Counter

In [37]:
train_data['text']=train_data['text'].apply(lambda x:[k for k, v in Counter(x).most_common(5)])


In [38]:
train_data=train_data.drop(columns='BRAND',axis=1)


### convert list into string

In [39]:
def listtostring(s):

  text = ", "
  return (text.join(s))

In [40]:
train_data['text']=train_data['text'].apply(lambda x:listtostring(x))


To run machine learning algorithms we need to convert the text files into numerical feature vectors. We will use bag of words model for our analysis. In general we segment each text file into words and count of times each word occurs in each document and finally assign each word an integer id. Each unique word in our dictionary will correspond to a feature (descriptive feature).

More precisely we will convert our text documents to a matrix of token counts (CountVectorizer), then transform a count matrix to a normalized tf-idf representation (tf-idf transformer).



In [41]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline


###countervectorizer

In [None]:
vectorizer = CountVectorizer()
vectorizer.fit(train_data['text'])
vector = vectorizer.transform(train_data['text']).toarray()

###tf-idf

In [45]:
tfidf_converter = TfidfTransformer()
X_tfidf = tfidf_converter.fit_transform(vector).toarray()
X_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

###Spliting the data into train and test sets:



In [42]:
from sklearn.model_selection import train_test_split


In [43]:
X = train_data['text']
y = train_data['BROWSE_NODE_ID']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [44]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((120000,), (30000,), (120000,), (30000,))

###train the model 



In [45]:
from sklearn.svm import LinearSVC

svc = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
               ('tfidf', TfidfTransformer()),
               ('model',LinearSVC()),
               ])

svc.fit(X_train, y_train)

ytest = np.array(y_test)
y_pred = svc.predict(X_test)

###accuracy score

In [46]:
from sklearn.metrics import accuracy_score, classification_report
print('accuracy %s' % accuracy_score(y_pred, y_test))


accuracy 0.5532333333333334


###let's  save our model as a pickle object in Python.

In [47]:
import pickle
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(svc,picklefile)

###load the model

In [1]:
import pickle
with open('/content/text_classifier', 'rb') as training_model:
    model = pickle.load(training_model)


###reading  the testing data

In [2]:
import pandas as pd
import csv
test_data =pd.read_csv("/content/dataset/test.csv",escapechar="\\",quoting=csv.QUOTE_NONE)


In [3]:

cols = ['TITLE','DESCRIPTION','BULLET_POINTS']
test_data["text"] =test_data["TITLE"].astype(str) + test_data["DESCRIPTION"].astype(str) + test_data["BULLET_POINTS"].astype(str)
test_data=test_data.drop(cols, axis=1)

In [4]:
test_data.head()

Unnamed: 0,PRODUCT_ID,BRAND,text
0,1,Command,"Command 3M Small Kitchen Hooks, White, Decorat..."
1,2,O'Neal,O'Neal Jump Hardware JAG Unisex-Adult Glove (B...
2,3,Boelter Brands,"NFL Detroit Lions Portable Party Fridge, 15.8 ..."
3,4,Panasonic,Panasonic Single Line KX-TS880MX Corded Phone ...
4,5,Zero,Zero Baby Girl's 100% Cotton Innerwear Bloomer...


In [5]:
X_test=test_data['text']

In [6]:
prediction= model.predict(X_test)


###prediction for test data

In [7]:
prediction

array([101900,  15772,    180, ...,      5,    800,      5])

### defining the browse_node_id according to their product id as per the hackathon

In [9]:
submission= pd.DataFrame(prediction, columns=['BROWSE_NODE_ID'])
submission = pd.concat([test_data['PRODUCT_ID'], submission], axis=1)
submission.columns =['PRODUCT_ID','BROWSE_NODE_ID']
submission

Unnamed: 0,PRODUCT_ID,BROWSE_NODE_ID
0,1,101900
1,2,15772
2,3,180
3,4,604
4,5,1138
...,...,...
110770,110771,2888
110771,110772,1194
110772,110773,5
110773,110774,800


In [10]:
submission.to_csv('sub.csv', index=False)

In [11]:
submission_df=pd.read_csv('sub.csv')

In [12]:
submission_df

Unnamed: 0,PRODUCT_ID,BROWSE_NODE_ID
0,1,101900
1,2,15772
2,3,180
3,4,604
4,5,1138
...,...,...
110770,110771,2888
110771,110772,1194
110772,110773,5
110773,110774,800
