In [39]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import csv
import nltk
import os
import string

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [43]:
np.random.seed(500)

#### Importing Datasets


In [2]:
train_data = pd.read_csv("dataset/train.csv", escapechar = "\\", quoting = csv.QUOTE_NONE)
test_data = pd.read_csv("dataset/test.csv", escapechar = "\\", quoting = csv.QUOTE_NONE)

In [3]:
train_data.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1
2,The Ultimate Self-Sufficiency Handbook: A Comp...,,Skyhorse Publishing,imusti,2
3,Amway Nutrilite Kids Chewable Iron Tablets (100),,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,3
4,Teacher Planner Company A4 6 Lesson Academic T...,,,,4


In [4]:
test_data.head()

Unnamed: 0,PRODUCT_ID,TITLE,DESCRIPTION,BULLET_POINTS,BRAND
0,1,"Command 3M Small Kitchen Hooks, White, Decorat...",Sale Unit: PACK,[INCLUDES - 9 hooks and 12 small indoor strips...,Command
1,2,O'Neal Jump Hardware JAG Unisex-Adult Glove (B...,Synthetic leather palm with double-layer thumb...,[Silicone printing for a better grip. Long las...,O'Neal
2,3,"NFL Detroit Lions Portable Party Fridge, 15.8 ...",Boelter Brands lets you celebrate your favorit...,[Runs on 12 Volt DC Power or 110 Volt AC Power...,Boelter Brands
3,4,Panasonic Single Line KX-TS880MX Corded Phone ...,Features: 50 Station Phonebook Corded Phone Al...,Panasonic Landline Phones doesn't come with a ...,Panasonic
4,5,Zero Baby Girl's 100% Cotton Innerwear Bloomer...,"Zero Baby Girl Panties Set. 100% Cotton, Breat...","[Zero Baby Girl Panties, Pack of 6, 100% Cotto...",Zero


In [5]:
train_data.isnull().sum()

TITLE                 71
DESCRIPTION       723664
BULLET_POINTS     166263
BRAND              56737
BROWSE_NODE_ID         0
dtype: int64

In [6]:
new_train_df = train_data.dropna()
new_train_df.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1
5,Men'S Full Sleeve Raglan T-Shirts Denim T-Shir...,Men'S Full Sleeve Raglan T-Shirts Denim T-Shir...,"[Color: Blue,Sleeve: Full Sleeve,Material: Cot...",Bhavya Enterprise,5
6,Glance Women's Wallet (Black) (LW-21),This Black wallet by Glance will be a treasure...,[The Most Comfortable Women's Wallet That You ...,Glance,6
7,Wild Animals Hungry Brain Educational Flash Ca...,Wild Animals are the animals that mostly stays...,[Playful learning: Flash cards develops the lo...,hungry brain,7


In [7]:
new_train_df.isnull().sum()

TITLE             0
DESCRIPTION       0
BULLET_POINTS     0
BRAND             0
BROWSE_NODE_ID    0
dtype: int64

In [8]:
new_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2110586 entries, 0 to 2903022
Data columns (total 5 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   TITLE           object
 1   DESCRIPTION     object
 2   BULLET_POINTS   object
 3   BRAND           object
 4   BROWSE_NODE_ID  int64 
dtypes: int64(1), object(4)
memory usage: 96.6+ MB


In [9]:
new_train_df.nunique()

TITLE             2003527
DESCRIPTION       1221892
BULLET_POINTS     1316867
BRAND              227539
BROWSE_NODE_ID       9636
dtype: int64

In [10]:
new_train_df.shape

(2110586, 5)

In [11]:
new_train_df = new_train_df.drop_duplicates(keep = "first")
new_train_df.shape

(2026638, 5)

In [44]:
df = new_train_df.sample(n=10000)
df.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
1852416,Zeyu Automatic Air Freshener Dispenser with On...,<p><b>Application:</b></p><p>Office House Hosp...,[Colour : White || Material : Plastic || Size ...,zeyu,6696
1831625,Premium 36cm Round Chafing Dish with Full Glas...,Enhance your Buffet presentation with this Pre...,"[Constructed of Heavy Gauge, Quality Stainless...",Avon Appliances,73418
2246596,Genron Ultra Slim 360 Matte Velvet Feel Thin A...,<p><b>Elegant and Unique</b><br>1. Exquisite S...,"[Extremely thin, Perfect Fit, and Light Weight...",Genron,1045
1981537,Generic Gift Handmade Car Key Cover Case Holde...,Brand Name:Easwraih; Material Type:Top Layer L...,"[Car Key Case,Brand Name:Easwraih; Material,Ty...",Generic,2282
2849389,WALFRONT Canvas Pliers Clamp for Stretching Oi...,Features used for properly stretching a canvas...,[Canvas paint framing pliers for homemade oil ...,WALFRONT,117


In [59]:
df["DESCRIPTION"] = df["DESCRIPTION"].apply(str)

In [60]:
import re
def clean_html(text):
    
    clean = re.compile('<.*?>')
    return re.sub(clean, '',text)

def remove_special(text):
        x=''
        for i in text:
            if i.isalnum():
                x=x+i
            else:
                x=x+' '
        return x

In [None]:
# Step - a : Remove blank rows if any.
df['DESCRIPTION'].dropna(inplace=True)
# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
df['DESCRIPTION']= [clean_html(entry) for entry in df['DESCRIPTION']]
df['DESCRIPTION'] = [entry.lower() for entry in df['DESCRIPTION']]
#df['DESCRIPTION']= [remove_special(entry) for entry in df['DESCRIPTION']]
# Step - c : Tokenization : In this each entry in the df will be broken into set of words
df['DESCRIPTION']= [word_tokenize(entry) for entry in df['DESCRIPTION']]
# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(df['DESCRIPTION']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'DESCRIPTION'_final'
    df.loc[index,'DESCRIPTION_final'] = str(Final_words)

In [57]:
df

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID,DESCRIPTION_final
1852416,Zeyu Automatic Air Freshener Dispenser with On...,"[application, office, house, hospital, factory...",[Colour : White || Material : Plastic || Size ...,zeyu,6696.0,
1831625,Premium 36cm Round Chafing Dish with Full Glas...,"[enhance, your, buffet, presentation, with, th...","[Constructed of Heavy Gauge, Quality Stainless...",Avon Appliances,73418.0,
2246596,Genron Ultra Slim 360 Matte Velvet Feel Thin A...,"[elegant, and, unique, 1, exquisite, smoothly,...","[Extremely thin, Perfect Fit, and Light Weight...",Genron,1045.0,
1981537,Generic Gift Handmade Car Key Cover Case Holde...,"[brand, name, easwraih, material, type, top, l...","[Car Key Case,Brand Name:Easwraih; Material,Ty...",Generic,2282.0,
2849389,WALFRONT Canvas Pliers Clamp for Stretching Oi...,"[features, used, for, properly, stretching, a,...",[Canvas paint framing pliers for homemade oil ...,WALFRONT,117.0,
...,...,...,...,...,...,...
19948,,,,,,['nan']
19949,,,,,,['nan']
19950,,,,,,['nan']
19951,,,,,,['nan']


In [20]:
X = df["DESCRIPTION"]
X.shape

(10000,)

In [21]:
y = df["BROWSE_NODE_ID"]
y.shape

(10000,)

In [22]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.25, random_state = 123)

In [23]:
X_train.shape

(7500,)

In [32]:
tfidf = TfidfVectorizer()
tfidf.fit(X)

TfidfVectorizer()

In [36]:
Train_X_Tfidf = tfidf.transform(X_train)
Test_X_Tfidf = tfidf.transform(X_test)

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [26]:
print(tfidf.vocabulary_)



In [None]:
from sklearn.naive_bayes import MultinomialNB
# fit the training dataset on the NB classifier
Naive = MultinomialNB()
Naive.fit(X_train,y_train)
# predict the labels on validation dataset
predictions_NB = Naive.predict(X_test)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, y_test)*100)