In [13]:
import dataset_utils
import nltk
import re
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [14]:
import sys  
sys.path.insert(0, 'C:\\Users\\ARosa\\Documents\\spain-ai-nlp')

from sorting_utils import *
import dataset_utils

In [15]:
def tokenize(text):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", text).lower().split()
    porter_stemmer=nltk.PorterStemmer()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [16]:
# method to get the corpus from the dataframe
def get_corpus(df):
    """
    Compute the text corpus from the dataframe after completing
    a full clearning process, removing all except alphanumeric text
    """
    corpus = []
    for index, row in df.iterrows():
        corpus.append(clean_and_tokenize(row['name'].lower()))
        corpus.append(clean_and_tokenize(row['description'].lower()))
    #corpus = ' '.join(corpus)
    return corpus

In [17]:
def get_corpus2(df):
    """
    Compute the text corpus from the dataframe after completing
    a soft clearning process, just removing extra whitespaces and html tags
    """
    corpus = []
    for index, row in df.iterrows():
        corpus.append(dataset_utils.sanitize_input(row['name'].lower()))
        corpus.append(dataset_utils.sanitize_input(row['description'].lower()))
    #corpus = ' '.join(corpus)
    return corpus

In [52]:
column_names = ['name', 'description']
train_data = pd.read_csv('../data/train.csv', names=column_names, header=None)[1:]
train_data.head()

Unnamed: 0,name,description
1,CROPPED JACKET TRF,Jacket made of a technical fabric with texture...
2,OVERSIZED SHIRT WITH POCKET TRF,Oversized long sleeve shirt with a round colla...
3,TECHNICAL TROUSERS TRF,High-waist trousers with a matching elastic wa...
4,SHIRT DRESS,Collared dress featuring sleeves falling below...
5,PUFF SLEEVE DRESS WITH PLEATS TRF,Loose-fitting midi dress with a round neckline...


Let´s use first the more pruned corpus

In [19]:
corpus = get_corpus(train_data)
print(len(corpus))

67226


# Method 1: using CountVectorizer

In [20]:
cv=CountVectorizer()
word_count_vector=cv.fit_transform(corpus)
word_count_vector.shape

(67226, 7819)

In [21]:
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [22]:
# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),columns=["idf_weights"]) 
 
# sort ascending 
df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
with,1.682803
and,2.030730
the,2.547326
front,2.685391
of,2.760784
...,...
payment,11.422683
peacefully,11.422683
bygone,11.422683
patchouliinstructions,11.422683


In [46]:
# count matrix 
count_vector=cv.transform(corpus) 
 
# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

#get tfidf vector for the nth document 
n_document_vector=tf_idf_vector[10] 
 
#print the scores 
df = pd.DataFrame(n_document_vector.T.todense(), 
                      index=cv.get_feature_names(), 
                      columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
floral,0.581224
trf,0.529517
dress,0.456109
print,0.416838
00,0.000000
...,...
era,0.000000
equivalency,0.000000
equipped,0.000000
equipment,0.000000


# Method 2: Using Tfidf_vectorizer

Performing the same analysis but using the Tfidf_vectorizer

In [25]:
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfidf_vectors = tfidf.fit_transform(corpus)
feature_names = tfidf.get_feature_names()

In [27]:
# get the first vector out (for the first document) 
first_vector_tfidfvectorizer=tfidf_vectors[0] 
 
# place tf-idf values in a pandas data frame 
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=feature_names, 
                  columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
crop,0.626230
jacket,0.569223
trf,0.532749
-,0.000000
penguin,0.000000
...,...
embroid,0.000000
embrac,0.000000
embossed-effect,0.000000
emboss,0.000000


This is the tfidf vector of the first document in the training data, which defines the words with higher frequency and more relevance in the document. We can use the tfidf score to evaluate the words of an answer, if we use the testing text as a corpus to train the vectorizer.

# Analyzing test data

In [53]:
column_names = ['description']
test_data = pd.read_csv('../data/test_descriptions.csv', names=column_names, header=None)[1:]
test_data.head()

Unnamed: 0,description
1,"Knit midi dress with a V-neckline, straps and ..."
2,"Loose-fitting dress with a round neckline, lon..."
3,Nautical cap with peak.<br/><br/>This item mus...
4,Nautical cap with peak. Adjustable inner strap...
5,Nautical cap with side button detail.<br/><br/...


In [54]:
# method to get the corpus from the dataframe
def get_corpus3(df):
    """
    Compute the text corpus from the dataframe after completing
    a full clearning process, removing all except alphanumeric text
    """
    corpus = []
    for index, row in df.iterrows():
        corpus.append(clean_and_tokenize(row['description'].lower()))
    #corpus = ' '.join(corpus)
    return corpus

In [55]:
corpus = get_corpus3(test_data)
print(len(corpus))

1441


We have 1441 documents, our answers document should have the same length.

In [58]:
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfidf_vectors = tfidf.fit_transform(corpus)
feature_names = tfidf.get_feature_names()
# get the first vector out (for the first document) 
first_vector_tfidfvectorizer=tfidf_vectors[0] 
 
# place tf-idf values in a pandas data frame 
df0 = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=feature_names, 
                  columns=["tfidf"]) 
df0 = df0.sort_values(by=["tfidf"],ascending=False)
df0

  'stop_words.' % sorted(inconsistent))


Unnamed: 0,tfidf
midi,0.451433
v-necklin,0.386010
dress,0.314312
knit,0.314312
match,0.314312
...,...
eyestay,0.000000
eyelet,0.000000
eye,0.000000
extralight,0.000000


In [80]:
df0.tfidf[:10].index.values.tolist()

['midi',
 'v-necklin',
 'dress',
 'knit',
 'match',
 '69',
 'lace',
 '177',
 'model',
 '6']

We are going to build a dataframe with the same length as the number of documents and for each document we will give a list of the relevant words, taking the top 10 words. 

In [82]:
top_words = pd.DataFrame(columns=["top_words"], index = range(len(corpus)))
for i in range(len(corpus)):
    i_vector_tfidfvectorizer=tfidf_vectors[i]
    dfi = pd.DataFrame(i_vector_tfidfvectorizer.T.todense(), index=feature_names, 
                  columns=["tfidf"]) 
    dfi = dfi.sort_values(by=["tfidf"],ascending=False)
    top_words.loc[i] = [dfi.tfidf[:10].index.values.tolist()]

In [85]:
pd.set_option('display.max_colwidth', -1)
top_words.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,top_words
0,"[midi, v-necklin, dress, knit, match, 69, lace, 177, model, 6]"
1,"[loose-fit, pleat, dress, necklin, 69, open, 177, long, button, sleev]"
2,"[peak, nautic, cardboard, intact, return, cap, origin, item, packag, thi]"
3,"[peak, nautic, cap, inner, adjust, strap, plate, pocket, plush, plimsoll-styl]"
4,"[nautic, cardboard, intact, return, cap, origin, item, packag, thi, button]"


In [86]:
top_words.tail()

Unnamed: 0,top_words
1436,"[cushion, stripe, includ, cover, print, cotton, platform, polka, point, pocket]"
1437,"[gnome, rectangular, cushion, print, featur, -, plastic, pocket, plush, plimsoll-styl]"
1438,"[eye, mask, band, jersey, fit, comfort, elast, cotton, featur, pleasant]"
1439,"[chipboard, iron, jacquard, hanger, hook, set, paisley, pad, sold, fabric]"
1440,"[hanger, ultra-thin, littl, kind, cloth, suitabl, iron, wardrob, lightweight, room]"


## Sources
https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76
https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/
