In [None]:
from google.colab import drive
# Attach Google Drive to the Colab VM so the files stored there can be read.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

In [None]:
import os
import numpy as np
import pandas as pd
from chardet import detect
import string

from sklearn import model_selection, naive_bayes, svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK resources requested by this notebook (tokenizer model, WordNet, POS tagger).
for nltk_resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import spacy

In [None]:
# Make the EM folder on the mounted Drive the working directory; the relative paths used
# later in the notebook ('train.csv', 'test.csv', 'res/...') resolve against it.
%cd /content/gdrive/My Drive/EM

In [None]:
# Display the entries of the current working directory.
os.listdir(os.curdir)

In [None]:
# Seed NumPy's global random generator so runs are repeatable.
SEED = 42
np.random.seed(SEED)

# Read data

In [None]:
# Dataset files. The paths are relative, so they resolve against the current working directory.
train_filepath = 'train.csv'
test_filepath = 'test.csv'

def get_encoding(file):
    """Guess the text encoding of `file`.

    The whole file is read as bytes and handed to chardet's `detect`; the value stored
    under the 'encoding' key of its result is returned.
    """
    with open(file, 'rb') as handle:
        detection = detect(handle.read())
    return detection['encoding']

def _read_detected_csv(path):
    """Read a comma-separated file, using the encoding reported by get_encoding."""
    return pd.read_csv(path, sep=",", encoding=get_encoding(path))

data_train = _read_detected_csv(train_filepath)
data_test = _read_detected_csv(test_filepath)

In [None]:
# Quick look at the training set: dimensions, columns, first rows, dtypes and label counts.
print(data_train.shape)
print()
print(data_train.columns)
print()
print(data_train.head())
print()
# DataFrame.info() prints its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
data_train.info()
print()
print(data_train.label_id.value_counts())

In [None]:
# Same overview for the unlabeled set: dimensions, columns, then the first rows.
for test_summary in (data_test.shape, data_test.columns):
    print(test_summary)
    print()
print(data_test.head())

# Tokenization

In [None]:
# Split every product name into word tokens with NLTK and keep the lists in a new column.
data_train['Splitted_sentence'] = data_train['name'].map(word_tokenize)

In [None]:
# Sanity check of the tokenized column: container type, row count, first row, whole column.
split_sentences = data_train['Splitted_sentence']
print(type(data_train))
print(len(data_train))
print(split_sentences[0])
print(split_sentences)

# Lemmatisation

In [None]:
# WordNetLemmatizer needs to know whether a word is a noun, verb, adjective or adverb.
# The first letter of the tag returned by pos_tag selects the WordNet constant; every other
# tag falls back to noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# The lemmatizer does not depend on the row being processed, so it is created once
# instead of once per sentence.
word_Lemmatized = WordNetLemmatizer()

# Build all rows first and assign the column in a single step. This avoids one `.loc` write
# per row and does not rely on the enumerate position matching the DataFrame's index labels.
# Each row is stored as the string form of its list of lemmas (e.g. "['red', 'shoe']"),
# exactly as before, because the vectorizer is fed these strings.
lemmatized_rows = [
    str([word_Lemmatized.lemmatize(word, tag_map[tag[0]]) for word, tag in pos_tag(sentence)])
    for sentence in data_train['Splitted_sentence']
]
data_train['Tokens'] = lemmatized_rows

# Data split

In [None]:
# Hold out 30% of the labeled rows for validation; the fixed random_state keeps the split stable.
features = data_train['Tokens']
labels = data_train['label_id']
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

# Vectorization

In [None]:
# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
# The model is loaded by its package name because the bare 'en' shortcut is not available
# on spaCy 3.
# NOTE(review): `nlp` and `stop_words` are not used by any other visible cell (the stop-word
# filter is commented out) — confirm whether loading the model is still needed.
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Bare English pipeline used by spacy_tokenizer to split text into tokens. Unlike the model
# loaded above it is constructed without trained components (tagger, parser, NER, vectors).
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Tokenize `sentence` with the module-level spaCy `parser` and normalize the tokens.

    Each token is replaced by its lower-cased lemma with surrounding whitespace stripped.
    The lower-cased surface form is used instead when the lemma is the "-PRON-" placeholder
    or when no lemma is available (empty string); otherwise a pipeline without a lemmatizer
    would turn every token into "" and lose it. Tokens found in `punctuations` are dropped.

    Returns:
        list[str]: the normalized tokens, in their original order.
    """
    mytokens = []
    for word in parser(sentence):
        lemma = word.lemma_
        # Fall back to the surface form when the lemma carries no information.
        text = word.lower_ if lemma in ("", "-PRON-") else lemma.lower()
        token = text.strip()
        # `punctuations` is a str, so `in` is a substring test: it rejects punctuation
        # characters and also the empty string left after stripping whitespace-only tokens.
        if token not in punctuations:
            mytokens.append(token)

    # return preprocessed list of tokens
    return mytokens

In [None]:
# vectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,2), min_df=1, max_df=1.0)
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,2), min_df=1, max_df=1.0)

# Learn the vocabulary from the training split only. Fitting on all of data_train['Tokens']
# also exposed the held-out rows to the vectorizer, so the validation scores were not
# computed on truly unseen data.
Train_X_vectorized = vectorizer.fit_transform(Train_X)
Test_X_vectorized = vectorizer.transform(Test_X)

In [None]:
# Vocabulary size, then a small sample of (term, column index) pairs. Printing the whole
# mapping floods the cell output.
print(len(vectorizer.vocabulary_))
print(list(vectorizer.vocabulary_.items())[:20])

# Naive Bayes

In [None]:
# Multinomial Naive Bayes: fit on the training split, score on the held-out split.
classifier = naive_bayes.MultinomialNB()
classifier.fit(Train_X_vectorized, Train_Y)
prediction_NB = classifier.predict(Test_X_vectorized)
nb_accuracy = accuracy_score(Test_Y, prediction_NB)
print("Naive Bayes Accuracy Score: {}%".format(round(nb_accuracy*100, 2)))

# SVM

In [None]:
# Linear-kernel SVM: fit on the training split, score on the held-out split.
svm_params = dict(C=1.0, kernel='linear', degree=3, gamma='auto')
classifier = svm.SVC(**svm_params)
classifier.fit(Train_X_vectorized, Train_Y)
prediction_SVM = classifier.predict(Test_X_vectorized)
svm_accuracy = accuracy_score(Test_Y, prediction_SVM)
print("SVM Accuracy Score: {}%".format(round(svm_accuracy*100, 2)))

# Logistic Regression

In [None]:
# Logistic regression (lbfgs, up to 2000 iterations): fit on the training split,
# score on the held-out split.
classifier = LogisticRegression(solver='lbfgs', max_iter=2000)
classifier.fit(Train_X_vectorized, Train_Y)
prediction_logreg = classifier.predict(Test_X_vectorized)
logreg_accuracy = accuracy_score(Test_Y, prediction_logreg)
print("Logistic Regression Accuracy Score: {}%".format(round(logreg_accuracy*100, 2)))

# Random Forest

In [None]:
# Random forest with 10 trees: fit on the training split, score on the held-out split.
# verbose=3 (anything above 1) makes scikit-learn report progress.
classifier = RandomForestClassifier(n_estimators=10, random_state=42, verbose=3)
classifier.fit(Train_X_vectorized, Train_Y)
prediction_randomforest = classifier.predict(Test_X_vectorized)
forest_accuracy = accuracy_score(Test_Y, prediction_randomforest)
print("Random Forest Accuracy Score: {}%".format(round(forest_accuracy*100, 2)))

# Accuracies

In [None]:
# Side-by-side validation accuracy of the four models, plus the SVM confusion matrix.
nb_pct = round(accuracy_score(Test_Y, prediction_NB)*100, 2)
svm_pct = round(accuracy_score(Test_Y, prediction_SVM)*100, 2)
logreg_pct = round(accuracy_score(Test_Y, prediction_logreg)*100, 2)
forest_pct = round(accuracy_score(Test_Y, prediction_randomforest)*100, 2)

print("Naive Bayes: {}%".format(nb_pct))
print()
print("SVM: {}%".format(svm_pct))
print(confusion_matrix(Test_Y, prediction_SVM))
print()
print("Logistic Regression: {}%".format(logreg_pct))
print("Random Forest: {}%".format(forest_pct))

# Predict

In [None]:
# The vectorizer was fitted on the 'Tokens' column: the string form of the list of lemmas
# produced by word_tokenize + pos_tag + WordNetLemmatizer. Run the unlabeled names through
# the same steps so they are vectorized from the same kind of text as the training rows,
# instead of from the raw 'name' strings.
test_lemmatizer = WordNetLemmatizer()
ser = pd.Series(
    [
        str([test_lemmatizer.lemmatize(word, tag_map[tag[0]])
             for word, tag in pos_tag(word_tokenize(name))])
        for name in data_test['name']
    ],
    index=data_test.index,
)

gt_test_X_vectorized = vectorizer.transform(ser)

In [None]:
# Shape of the vectorized unlabeled set; as the last expression it is displayed by the notebook.
gt_test_X_vectorized.shape

In [None]:
# `classifier` was last rebound to the random forest, so calling it here would store
# random-forest output under the name prediction_SVM. Fit an SVM with the same settings as
# the SVM cell and predict with that instead.
# Note: from here on prediction_SVM holds predictions for the unlabeled set, no longer the
# validation-split predictions.
svm_classifier = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
svm_classifier.fit(Train_X_vectorized, Train_Y)
prediction_SVM = svm_classifier.predict(gt_test_X_vectorized)

In [None]:
# Show how many predictions were produced, then the predicted labels themselves.
for prediction_summary in (prediction_SVM.shape, prediction_SVM):
    print(prediction_summary)

In [None]:
# Start the submission table from the sku_id column of the unlabeled set.
df = data_test['sku_id'].to_frame()

In [None]:
# Use the predictions made for the unlabeled set (prediction_SVM, one per row of data_test).
# prediction_NB holds predictions for the validation split of the training data, so it has
# a different length and does not correspond to these sku_ids.
df['label_id'] = prediction_SVM

In [None]:
# Display the submission table (sku_id plus label_id) before it is written to disk.
df

In [None]:
# to_csv does not create missing directories, so make sure 'res' exists before writing.
os.makedirs('res', exist_ok=True)
df.to_csv(r'res/res.csv',index=False)

In [None]:
# Display the entries of the current working directory again, after writing the results.
os.listdir(os.curdir)

# Random baseline submission

In [None]:
# Separate table for the random baseline, again keyed by sku_id.
df_4fun = data_test['sku_id'].to_frame()

In [None]:
# One uniformly random integer label per row; `high` is exclusive, so the values are 1..6.
df_4fun['label_id'] = np.random.randint(low=1, high=7, size=len(df_4fun))

In [None]:
# Display the random-baseline table (sku_id plus a random label_id).
df_4fun

In [None]:
# Write the random baseline (df_4fun). The original call saved `df`, the model submission,
# under the baseline's file name.
# to_csv does not create missing directories, so make sure 'res' exists before writing.
os.makedirs('res', exist_ok=True)
df_4fun.to_csv(r'res/res_3_df4fun.csv',index=False)