# Get the key words for the testing descriptions


In [1]:
import nltk
import re
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
import sys  
sys.path.insert(0, 'C:\\Users\\ARosa\\Documents\\spain-ai-nlp')

from sorting_utils import *
from dataset_utils import *

In [3]:
column_names = ['description']
test_data = pd.read_csv('../data/test_descriptions.csv', names=column_names, header=None)[1:]
test_data.head()

Unnamed: 0,description
1,"Knit midi dress with a V-neckline, straps and ..."
2,"Loose-fitting dress with a round neckline, lon..."
3,Nautical cap with peak.<br/><br/>This item mus...
4,Nautical cap with peak. Adjustable inner strap...
5,Nautical cap with side button detail.<br/><br/...


In [4]:
def clean_and_tokenize2(input):
    """
    Cleans the input and returns a list of tokens (words).
    Step needed to perform a tfidf analysis
    """
    if input.startswith('"') and input.endswith('"'):
        input = input[1:-1]

    input = input.replace('<br>', '') \
        .replace('</br>', '') \
        .replace('<br/>', '') \
        .replace('|', '') \
        .upper()
    words = re.sub(r"[^A-Za-z0-9\-]", " ", input).lower()
    return words

In [5]:
# method to get the corpus from the dataframe
def get_corpus3(df):
    """
    Compute the text corpus from the dataframe after completing
    a full clearning process, removing all except alphanumeric text
    """
    corpus = []
    for index, row in df.iterrows():
        corpus.append(clean_and_tokenize2(row['description']))
    #corpus = ' '.join(corpus)
    return corpus

In [65]:
corpus = get_corpus3(test_data)
print(len(corpus))

1441


In [35]:
corpus[:10]

['knit midi dress with a v-neckline  straps and matching lace detail height of model  177 cm    69 6 ',
 'loose-fitting dress with a round neckline  long sleeves  pleat details and a buttoned opening at the back height of model  177 cm   69 6 ',
 'nautical cap with peak this item must be returned with the original cardboard packaging intact ',
 'nautical cap with peak  adjustable inner strap detail ',
 'nautical cap with side button detail this item must be returned with the original cardboard packaging intact ',
 'faded short sleeve t-shirt with a round neckline and a front print due to the dyeing process  the print on each t-shirt is unique and may differ from what is shown in the photo height of model  177 cm    69 6 ',
 'coat with a round collar and long sleeves  featuring front welt pockets  faux suede interior and button fastening on the front  height of model  177 cm    69 6 ',
 'ripped t-shirt  round neck and short sleevesheight of model 176',
 'fitted top made from a polyamide

In [7]:
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfidf_vectors = tfidf.fit_transform(corpus)
feature_names = tfidf.get_feature_names()
# get the first vector out (for the first document) 
first_vector_tfidfvectorizer=tfidf_vectors[0] 
 
# place tf-idf values in a pandas data frame 
df0 = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=feature_names, 
                  columns=["tfidf"]) 
df0 = df0.sort_values(by=["tfidf"],ascending=False)
df0

  'stop_words.' % sorted(inconsistent))


Unnamed: 0,tfidf
midi,0.451433
v-necklin,0.386010
dress,0.314312
knit,0.314312
match,0.314312
...,...
eyestay,0.000000
eyelet,0.000000
eye,0.000000
extralight,0.000000


In [8]:
def get_top_words(tfidf_vectors, feature_names, position, n):
    '''
    It extracts the tifidf vector corresponding to the given position and returns the top n words 
    '''
    vector_tfidfvectorizer=tfidf_vectors[position]
    df0 = pd.DataFrame(vector_tfidfvectorizer.T.todense(), index=feature_names, 
                  columns=["tfidf"]) 
    df0 = df0.sort_values(by=["tfidf"],ascending=False)
    return df0[:n].index.tolist()

In [78]:
print(get_top_words(tfidf_vectors, feature_names, 10, 5))

['polyamid', 'reinforc', '178', 'chest', 'blend']


# Get the submission file

In [9]:
import pandas as pd
df = pd.read_csv('../submission/model_4_submission.csv', sep='\n')
df.head()

Unnamed: 0,name
0,"KNIT MIDI DRESS WITH A V-PRINT, V-NECK KNIT DR..."
1,"HEARTIC SHIRT DRESS TRF, LACE MINI DRESS, DOTT..."
2,FAUXCE BLANKET WITH A CONTRAST HERRINGBONE DES...
3,FAUX FUR CUSHION WITH SNOOPY PRINT.<BR/><|ENDO...
4,"ACIDONA WITH SIDE BUTTON, PACK OF FOUR BASIC R..."


In [50]:
len(df['name'])

1441

In [51]:
df['row_num'] = df.reset_index().index

In [52]:
df.head()

Unnamed: 0,name,row_num
0,"KNIT MIDI DRESS WITH A V-PRINT, V-NECK KNIT DR...",0
1,"HEARTIC SHIRT DRESS TRF, LACE MINI DRESS, DOTT...",1
2,FAUXCE BLANKET WITH A CONTRAST HERRINGBONE DES...,2
3,FAUX FUR CUSHION WITH SNOOPY PRINT.<BR/><|ENDO...,3
4,"ACIDONA WITH SIDE BUTTON, PACK OF FOUR BASIC R...",4


In [22]:
def clean_tokenize_answer(answer):
    '''
    Method to remove non alphanumeric characters, to transform to lowercase
    and to stem the words. It returns the list of words per answer.
    '''
    list_answers = answer.split(',')
    clean_answers = []
    for a in list_answers:
        ca = clean_and_tokenize2(a)
        ca = tokenize(ca)
        clean_answers.append(ca)
    return clean_answers

In [57]:
def get_top5_score(top_5, answer):
    '''
    Give the answer the score using one components:
    1. the matches with top_5 words
    Max. score = 1 (if there are 5 similar words)
    :param top_5: top 5 words for that answer
    :param answer: list of words of the answer
    '''
    answer = [item for sublist in answer for item in sublist]
    #print(type(answer))
    unique_words = set(answer)
    score = 0.0
    for word in unique_words:
        if word in top_5:
            score += 0.25
    #print("unique words {} and score {}".format(unique_words, score))
    return score

In [58]:
def get_tfidf_score(answer, top_5):
    '''
    This method will return a new column with the scores given for each
    answer in the list.
    :param answer: list of candidates
    :param top_5: top five words for that description in the test file
    '''
    # clean the answer the same way we did for the tfidf model
    #print(len(answer))
    #print("answer {}".format(answer))
    c_answer = clean_tokenize_answer(answer)

    # score only with similar words
    return get_top5_score(top_5, c_answer)
    

In [59]:
def get_global_score(top_5, answer):
    '''
    Give the answer the total score using two components:
    1. the matches with top_5 words
    2. the length/words_count score
    Max. score = 2
    :param top_5: top 5 words for that answer
    :param answer: list of words of the answer
    '''
    unique_words = set(answer)
    score = 0.0
    for word in unique_words:
        if word in top_5:
            score += 0.25
    score += length_wc_scoring(answer,0.3,0.7)       
    return score

In [60]:
def new_sorted_list(row, tfidf_vectors, feature_names):
    scores = []
    candidates = row['name']
    row_index = row['row_num']
    row_list = candidates.split(",")
    # for each answer we have already a list of words
    # get the top 5 words for that answer
    top_5 = get_top_words(tfidf_vectors, feature_names, row_index, 5)
    #print("top_5 {}".format(top_5))
    for i in range(len(row_list)):
        r = row_list[i]

        # compute the score
        s = get_tfidf_score(r.strip(), top_5)
        #print("answer{} and score{}".format(r,s))
        scores.append(s)
    df = pd.DataFrame()
    df['answer'] = row_list
    df['score'] = scores
    #print(df)
    df = df.sort_values(by='score', ascending=False)
    #print(df)
    return ",".join(df.answer)

In [67]:
df['sorted'] = df.apply(lambda row: new_sorted_list(row,tfidf_vectors, feature_names), axis=1)
df.head()

Unnamed: 0,name,row_num,sorted
0,"KNIT MIDI DRESS WITH A V-PRINT, V-NECK KNIT DR...",0,"KNIT MIDI DRESS WITH A V-NECKLINE, KNIT MIDI ..."
1,"HEARTIC SHIRT DRESS TRF, LACE MINI DRESS, DOTT...",1,IFT WHEEL LOOSE-FITTING DRESS WITH PLEATS TRF...
2,FAUXCE BLANKET WITH A CONTRAST HERRINGBONE DES...,2,FADED ROUND NECK TOP WITH SHORT SLEEVES.<|END...
3,FAUX FUR CUSHION WITH SNOOPY PRINT.<BR/><|ENDO...,3,PEAK CAP WITH A CONTRAST PATCH ON THE FRONT A...
4,"ACIDONA WITH SIDE BUTTON, PACK OF FOUR BASIC R...",4,"THEALED NAUTICAL CAP WITH SIDE BUTTON DETAIL,..."


In [62]:
df_top5 = df[['sorted']]

In [63]:
df_top5.rename(columns = {'sorted':'name'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [64]:
df_top5.head()

Unnamed: 0,name
0,"KNIT MIDI DRESS WITH A V-NECKLINE, KNIT MIDI ..."
1,IFT WHEEL LOOSE-FITTING DRESS WITH PLEATS TRF...
2,FADED ROUND NECK TOP WITH SHORT SLEEVES.<|END...
3,PEAK CAP WITH A CONTRAST PATCH ON THE FRONT A...
4,"THEALED NAUTICAL CAP WITH SIDE BUTTON DETAIL,..."


In [None]:
df.to_csv('../submission/model_4_submission_sorted_selected.csv', sep='\n',index=False)