# MIDAS NLP


# Product categorisation Problem
This problem can be solved in many ways, product can be categorised on the basis of brands, ratings, discounts, and the type of product. I would move forward with a approach to identify the product category on the basis of its type classification. This is a case of multi classification problem.


In [2]:
import numpy as np 
import pandas as pd 
import re 
import nltk 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

A function to 

In [3]:
def preprocess_string(str_arg):
    '''
    input: str_arg --> Takes string to clean
    output: cleaned_str --> Gives back cleaned string
    This fuction cleans the text in the mentioned ways as comments after the line.This has been copied from some other kernel.

    '''
    cleaned_str=re.sub('[^a-z\s]+',' ',str_arg,flags=re.IGNORECASE) #every char except alphabets is replaced
    cleaned_str=re.sub('(\s+)',' ',cleaned_str) #multiple spaces are replaced by single space
    cleaned_str=cleaned_str.lower() #converting the cleaned string to lower case
    
    return cleaned_str # Returning the preprocessed string in tokenized form

In [4]:
import_df = pd.read_csv('flipkart_com-ecommerce_sample - flipkart_com-ecommerce_sample - flipkart_com-ecommerce_sample.csv')
# Reading relevant data
import_df['product_category_tree'] = import_df['product_category_tree'].apply(lambda x : x.split('>>')[0][2:].strip())
# Category processing. (Check data to understand)
top_fiv_gen = list(import_df.groupby('product_category_tree').count().sort_values(by='uniq_id',ascending=False).head(10).index)
# Taking only top 5 categories for example sake
processed_df = import_df[import_df['product_category_tree'].isin(top_fiv_gen)][['product_category_tree','description']]
# Selecting only relevant columns
processed_df['description'] = processed_df['description'].astype('str').apply(preprocess_string)
# Cleaning strings
cat_list = list(processed_df['product_category_tree'].unique())
# Creating a list of categories for later use
print(cat_list)
# Printing the list of top 5 categories
le = preprocessing.LabelEncoder()
category_encoded=le.fit_transform(processed_df['product_category_tree'])
processed_df['product_category_tree'] = category_encoded
# Encoding the product categor

['Clothing', 'Footwear', 'Beauty and Personal Care', 'Home Decor & Festive Needs', 'Automotive', 'Home Furnishing', 'Mobiles & Accessories', 'Jewellery', 'Kitchen & Dining', 'Computers']


In [5]:
X_train, X_test, y_train, y_test = train_test_split(processed_df['description'],processed_df['product_category_tree'],test_size=0.2)


In [6]:
vect = CountVectorizer(stop_words = 'english')
# Removing stop words
X_train_matrix = vect.fit_transform(X_train) 
# Converting the train data

In [7]:
clf=MultinomialNB()
# Defining model
clf.fit(X_train_matrix, y_train)
# Fitting to multinomial NB model 
print(clf.score(X_train_matrix, y_train))
# Scoring the trained model (Expected to be above 95 percent)
X_test_matrix = vect.transform(X_test) 
# Converting the test data
print (clf.score(X_test_matrix, y_test))
# Scoring for the test data
predicted_result=clf.predict(X_test_matrix)
print(classification_report(y_test,predicted_result))
# Printing score

0.9800811785929044
0.9744514577697625
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       204
           1       0.92      0.95      0.93       152
           2       1.00      0.99      1.00      1245
           3       0.86      0.80      0.83       101
           4       0.99      0.98      0.99       250
           5       0.97      0.99      0.98       191
           6       0.98      0.98      0.98       129
           7       0.96      1.00      0.98       722
           8       0.97      0.91      0.94       112
           9       0.99      0.87      0.93       221

    accuracy                           0.97      3327
   macro avg       0.96      0.95      0.95      3327
weighted avg       0.97      0.97      0.97      3327



In [8]:
vectorizer = TfidfVectorizer(stop_words = 'english')
# Removing stop words
X_train_tfidf = vectorizer.fit_transform(X_train)
# Converting the train data

In [9]:
clf2=MultinomialNB()
# Defining model
clf2.fit(X_train_tfidf, y_train)
# Fitting to multinomial NB model 
print(clf2.score(X_train_tfidf, y_train))
# Scoring the trained model (Expected to be above 95 percent)
X_test_tfidf = vectorizer.transform(X_test) 
# Converting the test data
print (clf2.score(X_test_tfidf, y_test))
# Printing score

0.9627931449188214
0.9495040577096483


In [10]:
le.inverse_transform(clf.predict(vect.transform(['car'])))

array(['Automotive'], dtype=object)