In [0]:
import pandas as pd
import numpy as np

#### Read the data Directly from the Drive ####

In [0]:
!pip install -U -q PyDrive

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
def readURL(sharedPath, filename):
    fluff, id = sharedPath.split('=')
    downloaded = drive.CreateFile({'id':id}) 
    downloaded.GetContentFile(filename)  
    df = pd.read_csv(filename)

    return df

In [0]:
trainLink = 'link_to_the_data'
evalLink = 'link_to_the_data'

traindf = readURL(trainLink, 'train.csv')
evaldf = readURL(evalLink, 'test.csv')

#### Summary information  and Meta Data####

In [13]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 575907 entries, 0 to 575906
Data columns (total 2 columns):
titles    575907 non-null object
labels    575907 non-null object
dtypes: object(2)
memory usage: 8.8+ MB


In [14]:
evaldf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143977 entries, 0 to 143976
Data columns (total 1 columns):
titles    143977 non-null object
dtypes: object(1)
memory usage: 1.1+ MB


#### Problem Statement ####

Build a classifier to classify product titles. We want to classify an input product title into categories.
Training data to classify this product into categories is provided with the problem. The categories
assigned in the training data are the only categories to be considered.
Also provided is a set of product titles where your classifier should predict the relevant labels. We
will internally evaluate the classifier on the prediction on the test data set.
Please provide all the relevant pre-processing, model development and tuning code.
Few points
1. It’s a tab separated file. The first column is the title, the second column contains labels.
2. There are multiple labels in the 2 nd column but for this exercise you can consider only one
label. If you want you can set it up as a multi label classification problem but its ok if you do
not.
3. Simplify the problem if need be.
4. In evaluation please generate a probability score for each label as well.

#### work on the dataset ####

the column label schema is Array. We need to separate the arrays as product Categories and Labels. There will be one categories each product titles but multiple labels (or) no labels at all for the product.

#### custom function to separate the lables ####

In [0]:
def separateLabels(df):
    #copy the dataset
    df_copy = df.copy()
    #first of all remove all square brackes
    df_copy['labels'] = df_copy['labels'].apply(lambda x: x.replace('[','').replace(']',''))
    df['category'] = df_copy['labels'].apply(lambda x: x.split(",")[0].replace("'",''))
    df['macro_category'] = df_copy['labels'].apply(lambda x: x.split(",")[1:])
    return df

In [0]:
process_traindf = separateLabels(traindf) #dataframe separated into category and macro_category.


#### Now convert the macro category into multilabel classification problem ####

In [0]:
def convertIntoMultilabel(df):
    df_copy = df.copy()
    df_copy['macro_category'] = df_copy['macro_category'].apply(lambda x: "no_label" if len(x) == 0 else x)
    visited_label = ['no_label'] #list to store macro_labels
    for macro_lables in df['macro_category']:
        if macro_lables == 'no_label':
          pass
        else:
          for label in macro_lables:
              if label in visited_label:
                  continue
              else:
                  visited_label.append(label)
    
    print("total macro category length is:{}".format(len(visited_label)))

    one_hot_value = [] #list to append after doing one-hot encode

    for nums, labels in enumerate(df_copy['macro_category']):
      #console log
      if nums % 50000 == 0 and nums != 0:
          print("finished one hot encoding for n={}".format(nums))

      one_hot_encode = [0]*len(visited_label)
      if labels == 'no_label':
        one_hot_encode[0] = 1
        one_hot_value.append(one_hot_encode)
      else:
          for indi_labels in labels:
              #get the index of the label and assign true
              index = visited_label.index(indi_labels)
              one_hot_encode[index]=1
          
          one_hot_value.append(one_hot_encode)
      

    return one_hot_value, visited_label

In [19]:
one_hot_encoding, columns = convertIntoMultilabel(process_traindf)

total macro category length is:177
finished one hot encoding for n=50000
finished one hot encoding for n=100000
finished one hot encoding for n=150000
finished one hot encoding for n=200000
finished one hot encoding for n=250000
finished one hot encoding for n=300000
finished one hot encoding for n=350000
finished one hot encoding for n=400000
finished one hot encoding for n=450000
finished one hot encoding for n=500000
finished one hot encoding for n=550000


In [0]:
column_rename = [x.replace("'",'').replace(" ",'') for x in columns]
one_hot_df = pd.DataFrame(one_hot_encoding, columns=column_rename)

In [21]:
print("Shape of the one hot encoding dataframe:{}".format(one_hot_df.shape))

Shape of the one hot encoding dataframe:(575907, 177)


#### concat one hot encoding dataframe + product dataframe ####

In [22]:
product_df = process_traindf[['titles', 'category']]

##now concatenate via axis 1
product_df = pd.concat([product_df, one_hot_df], axis=1)

## dataset size is more than 786.5+ MB
## as the column size increases w.r.t to same number of rows
## size of the dataset will also increases
product_df.shape 

(575907, 179)

In [23]:
product_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 575907 entries, 0 to 575906
Columns: 179 entries, titles to tiloil
dtypes: int64(177), object(2)
memory usage: 786.5+ MB


#### Processing the Titles ####

Removing unwanted words from the dataset.

  1. Our text preprocessing will include the following steps:
  2. Convert all text to lower case.
  3. Replace REPLACE_BY_SPACE_RE symbols by space in text.
  4. Remove symbols that are in BAD_SYMBOLS_RE from text.
  5. Remove stop words.

In [24]:
import nltk
nltk.download("stopwords")
import re
from nltk.corpus import stopwords

def cleanInputData(text):
    REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile('[^a-z #+_]') #removing numbers from the dataset
    STOPWORDS = set(stopwords.words('english'))

    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
temp_df = pd.DataFrame()
temp_df['title'] = process_traindf['titles'].apply(cleanInputData)

#### Problem Approach ####

1. For now, consider only multi-class problem.
2. Check what is the maximum number of words in the titles column.
3. Check the class count in the dataset. If the dataset is highly imbalanced for any of the particular label, do necessary steps to ensure that the model is stable.
4. Convert the data into vectors (or) sequence of integers. Truncate and the pad the input sequence all to be in the same length for modeling
5. Convert the targetLabel(categorical column) into numerical records for training
6.  Do Train and Test Split.
7.  Start Training a model, store all callbacks and return the probability metrics for evaluation dataset


For reference, consider these two articles

https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f - For Skipgram, reasoning for specific keywords for specific categories

https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17 - For LSTM Model

https://medium.com/@nitinpanwar98/text-classification-using-machine-learning-with-code-65a8491d389f - Majority Voting Scheme

https://machinelearningmastery.com/how-to-develop-a-convolutional-neural-network-to-classify-satellite-photos-of-the-amazon-rainforest/ - Training neural net on Multi label Classification Problem

Steps to do
 1. Vectorize consumer complaints text, by turning each text into either a sequence of integers or into a vector.
 2. Limit the data set to the top 50000 words. (for now, we are doing top 50000 words)
 3. Set the max number of words in each complaint at 316.

In [27]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

Using TensorFlow backend.


In [28]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 24000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 316
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(product_df['titles'].values)
word_index = tokenizer.word_index
print('Found {} unique tokens.'.format(len(word_index)))

Found 20109 unique tokens.


In [29]:
total_target_var = product_df['category'].unique().tolist()
print("total number of target variable:{}".format(len(total_target_var)))

total number of target variable:254


### Check if the classes are lineary separable ###
The classifiers and learning algorithms can not directly process the text documents in their original form, as most of them expect numerical feature vectors with a fixed size rather than the raw text documents with variable length. Therefore, during the preprocessing step, the texts are converted to a more manageable representation.

One common approach for extracting features from text is to use the bag of words model: a model where for each document, a complaint narrative in our case, the presence (and often the frequency) of words is taken into consideration, but the order in which they occur is ignored.

Checking the words that are most important to the category.

Warning: feeding the entire dataset to TfIdfVectorizer won't fit in the memory. So,feeding the subset of the each class and check the most common words for each category

In [0]:
def getSubset(df):
    #get the copy of the dataframe
    df_copy = df
    subsetdf = pd.DataFrame()
    for num, label in enumerate(df['category'].unique().tolist()):
        #get the dataset
        label_df = df_copy[df_copy['category'] == label]
        label_df = label_df[['titles', 'category']]
        #get only 10000 from each dataset
        if label_df.shape[0] <= 500:
            subsetdf = pd.concat([subsetdf, label_df], axis=0)
        else:
            label_df = label_df.iloc[:500]
            subsetdf = pd.concat([subsetdf, label_df], axis=0)
        
        if num % 50 == 0 and num != 0:
            print("finished getting subset for '{}' category".format(num))
    
    return  subsetdf

In [31]:
subset_pdf = getSubset(product_df)
subset_pdf = subset_pdf.drop_duplicates()
subset_pdf.shape

finished getting subset for '50' category
finished getting subset for '100' category
finished getting subset for '150' category
finished getting subset for '200' category
finished getting subset for '250' category


(21125, 2)

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2), stop_words='english')
feature_vector = tfidf_vectorizer.fit_transform(subset_pdf.titles).toarray()
labels = subset_pdf.category
feature_vector.shape

(21125, 5716)

In [0]:
np.save("featureVector.npy", feature_vector)

In [35]:
!ls

adc.json	       clustr_train_data.csv  sample_data
cluster_eval_data.csv  featureVector.npy


### finding most correlated words ###
Now, each of 25351 titles is represented by 7666 features, representing the tf-idf score for different unigrams and bigrams. We can use sklearn.feature_selection.chi2 to find the terms that are the most correlated with each of the category

In [36]:
subset_pdf['category_id'] = subset_pdf['category'].factorize()[0]
subset_pdf_required = subset_pdf[['category','category_id']]
print("shape of the dataset:{}".format(subset_pdf_required.shape))
#category to id values
category_to_id = dict(subset_pdf_required.values)

shape of the dataset:(21125, 2)


In [37]:
(list(category_to_id.keys())[list(category_to_id.values()).index(48)])

'canned food'

In [38]:
from sklearn.feature_selection import chi2
import numpy as np
labels = subset_pdf_required['category_id']
N = 2
num = 1

for Product, category_id in sorted(category_to_id.items()):

    if num % 50 == 0 and num != 0:
        print("Finished Writing Unigrams and Bigrams for {} categories".format(num))

    features_ch2 = chi2(feature_vector, labels == category_id)
    indices = np.argsort(features_ch2[0])
    feature_names = np.array(tfidf_vectorizer.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    with open("words_grams_to_category.txt", "a") as fp:
        fp.write("## Product:{}".format(list(category_to_id.keys())[list(category_to_id.values()).index(category_id)]))
        fp.write("\n")
        fp.write(".......... Mosts Correlated Unigrams:\n{}".format('\n'.join(unigrams[-N:])))
        fp.write(".......... Most Correlated Bigrams: \n {}".format('\n'.join(bigrams[-N:])))
        fp.write("\n\n")
        fp.write('****************************************')
        fp.write("\n")
        fp.close()
    
    num += 1 #increment

Finished Writing Unigrams and Bigrams for 50 categories
Finished Writing Unigrams and Bigrams for 100 categories
Finished Writing Unigrams and Bigrams for 150 categories
Finished Writing Unigrams and Bigrams for 200 categories
Finished Writing Unigrams and Bigrams for 250 categories


In [39]:
!ls

adc.json	       clustr_train_data.csv  sample_data
cluster_eval_data.csv  featureVector.npy      words_grams_to_category.txt


In [0]:
from google.colab import files
files.download("words_grams_to_category.txt")

#### checking some conditions ####

If we check the most correlated words for the categories in file **words_grams_to_category.txt** we can see for some class, the words are correctly captured. For Example - 

```
basmati rice
daawat
basmati.......... Most Correlated Bigrams: 
 kohinoor basmati
basmati rice
```

If the bigrams are unique to identify these specific keywords then traditional Machine Learning Algorithms are able to capture these sepecific patterns and able to identify the classes!

#### Train and Test Split ####

In [42]:
#convert category into id's
product_df_copy = product_df.copy()
product_df_copy = product_df_copy.drop_duplicates()
product_df_copy['category_id'] = product_df_copy['category'].factorize()[0]
product_df_copy.columns

Index(['titles', 'category', 'no_label', 'beverages', 'insectrepellent',
       'pestrepellents', 'dairy', 'make-up', 'papad', 'appalam',
       ...
       'kitchenware', 'nipple', 'pista', 'masurdal', 'lentils', 'flossers',
       'luggagebags', 'gingellyoil', 'tiloil', 'category_id'],
      dtype='object', length=180)

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

X_train, X_test, y_train, y_test = train_test_split(product_df_copy['titles'], product_df_copy['category_id'], random_state = 0)

In [44]:
print("Train shape: Train Data{}, Train Label{}".format(X_train.shape, y_train.shape))
print("Test Shape: Test Data{}, Test Label{}".format(X_test.shape, y_test.shape))

Train shape: Train Data(66473,), Train Label(66473,)
Test Shape: Test Data(22158,), Test Label(22158,)


In [0]:
count_vect = CountVectorizer() #count Vectorizer
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer() #tfidf vectorizer
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

#create a classifier
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [0]:
#make a prediction
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [47]:
X_test.iloc[0:10]

2265             cream kakao wafer biscuit loacker g
12800      pedigree daily food adult dogs vegetarian
65063              heracles olive oil pomace ltr tin
28503                   danone milk toned ltr carton
195510    nestle cerelac wheat apple stage gm carton
320766      udupi ruchi kerala parata instant mix gm
359448                               glucon kg plain
410174            danish premium mutton curry cut gm
418491                      yeos hot chilli sauce ml
315688                  hamdard rooh afza sharbat ml
Name: titles, dtype: object

In [0]:
predictions = []

for test_instance in X_test:
    preds = clf.predict(count_vect.transform([test_instance]))
    predictions.append(preds)

In [49]:
# calculate accuracy of class predictions
from sklearn import metrics
acc = metrics.accuracy_score(predictions, y_test)
print("MNB accuracy:{}".format(acc))

MNB accuracy:0.5482895568192075


#### Observation ####

Now, if we try to print metrics for all the classes, it will throw an error. Because in our dataset there are some class where instances are very low.(eg.., 1-10). To know the model stability we need equal distribution of the all classes in the train and test datasets. 

**Removing classes** that are very low in number. 

In [0]:
def removeClassFromDataset(df, classes):
    df = df[df['category'] != classes]
    return df

#removing classes that are lesser then 10 instances
def removingLessClass(uniqueClass, df, n=10):
    #removing class
    classes_to_be_removed = []
    for classes in uniqueClass:
        class_df = df[df['category'] == classes]
        if class_df.shape[0] <= 10:
            classes_to_be_removed.append(classes)
    
    print("number of classes that have lesser than 10 instances:N={}".format(len(classes_to_be_removed)))

    for classes in classes_to_be_removed:
        df = removeClassFromDataset(df, classes)
    
    return df

In [0]:
unique_brand = product_df_copy['category'].unique().tolist()

In [67]:
major_classes_df = removingLessClass(unique_brand, product_df_copy)

number of classes that have lesser than 10 instances:N=108


In [73]:
print("Remaining class after removing minority class:{}".format(len(major_classes_df['category'].unique().tolist())))

Remaining class after removing minority class:146


### Splitting Train and Test ###

Splitting the dataset into train and test set with equal manner. 70% of data in train data and 30% of data in test set for each class

In [0]:
def splitTrainAndTest(df):
    #train and test dataframe
    traindf, testdf = pd.DataFrame(), pd.DataFrame()
    unique_class = df.category.unique().tolist()
    
    #iterate
    for indi_class in unique_class:
        subdf = df[df['category'] == indi_class]
        n_rows = subdf.shape[0]
        #split 70, 30 percent
        train_percentage = int(float(n_rows * 70)/100)
        train_subdf = subdf.iloc[:train_percentage]
        test_subdf = subdf.iloc[train_percentage:]

        #append to the dataset
        traindf = pd.concat([traindf, train_subdf], axis=0)
        testdf = pd.concat([testdf, test_subdf], axis=0)
    

    return traindf, testdf

In [0]:
train, test = splitTrainAndTest(major_classes_df)

In [77]:
print("train dataframe shape:{}".format(train.shape))
print("test dataframe shape:{}".format(test.shape))

train dataframe shape:(61642, 180)
test dataframe shape:(26511, 180)


In [78]:
#check all class are present in the dataset
print("total number of classes in the train dataset:{}".format(len(train['category'].unique().tolist())))
print("total number of classes in the test dataset:{}".format(len(test['category'].unique().tolist())))

total number of classes in the train dataset:146
total number of classes in the test dataset:146


#### Model Function ####

In [0]:
def modelTrain(model, train, test):
    count_vect = CountVectorizer() #count Vectorizer
    tfidf_transformer = TfidfTransformer() #tfidf vectorizer

    X_train_counts = count_vect.fit_transform(train['titles'])
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    y_train = train['category_id']
    y_test = test['category_id']

    #create a classifier
    clf = model.fit(X_train_tfidf, y_train)

    print("Model finished Training.......................")

    #predictions
    predictions = []

    for test_instance in test['titles']:
        preds = clf.predict(count_vect.transform([test_instance]))
        predictions.append(preds) #append to the prediction
    
    return clf, predictions

In [83]:
mnb_clf, mnb_preds = modelTrain(MultinomialNB(), train, test)

Model finished Training.......................


In [85]:
mnb_acc = metrics.accuracy_score(mnb_preds, test['category_id'])
print("MNB Accuracy:{}".format(mnb_acc))

MNB Accuracy:0.5381162536305685


In [86]:
from sklearn import metrics
print(metrics.classification_report(test['category_id'], mnb_preds, target_names=test['category'].unique()))

                                      precision    recall  f1-score   support

                     branded grocery       0.39      0.83      0.53      4023
                              snacks       0.68      0.45      0.54       912
                           home care       0.65      0.72      0.68      2276
                   breakfast cereals       0.73      0.36      0.48       680
              sweets & confectionery       0.27      0.04      0.07       674
                           cosmetics       0.71      0.25      0.37      1056
                        ready-to-eat       0.00      0.00      0.00        67
                 fruits & vegetables       0.91      0.79      0.84       907
                              fryums       0.00      0.00      0.00         6
                           beverages       0.50      0.58      0.54       790
                        instant food       1.00      0.00      0.01       236
                             staples       0.75      0.54      

  'precision', 'predicted', average, warn_for)


#### observation ####

From the above results, the models is not at all performing in some classes. we need to fine tune in a better way. And removing the minority classes doesn't help at all in this cases!