# Text mining on articles descriptions
This notebook contains some text mining performed on earticles description, including most common words, nouns and adjectives, and exploration of bigrams and trigrams. 
This was the first step preceding the looking for fabric and material names. 

## Import of libraries and dataset

In [1]:
# import libraries
import pandas as pd

In [2]:
# import dataset
articoli = pd.read_csv("../data/articles.csv")

In [3]:
#remove nan values from descriptions
articoli.dropna(subset=['detail_desc'], inplace=True)

In [4]:
articoli.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


## Frequent words

In [5]:
#create a list with all the sentences
frasi = articoli["detail_desc"].tolist()
len(frasi)

105126

In [6]:
#!pip install nltk

In [7]:
#import nltk and english stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lucre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
#transform all sentences to lower case
articoli["detail_desc"] = articoli["detail_desc"].str.lower()

In [9]:
articoli

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","microfibre t-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","microfibre t-shirt bra with underwired, moulde..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105537,953450001,953450,5pk regular Placement1,302,Socks,Socks & Tights,1010014,Placement print,9,Black,...,Socks Bin,F,Menswear,3,Menswear,26,Men Underwear,1021,Socks and Tights,socks in a fine-knit cotton blend with a small...
105538,953763001,953763,SPORT Malaga tank,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey,A,Ladieswear,1,Ladieswear,2,H&M+,1005,Jersey Fancy,loose-fitting sports vest top in ribbed fast-d...
105539,956217002,956217,Cartwheel dress,265,Dress,Garment Full body,1010016,Solid,9,Black,...,Jersey,A,Ladieswear,1,Ladieswear,18,Womens Trend,1005,Jersey Fancy,"short, a-line dress in jersey with a round nec..."
105540,957375001,957375,CLAIRE HAIR CLAW,72,Hair clip,Accessories,1010016,Solid,9,Black,...,Small Accessories,D,Divided,2,Divided,52,Divided Accessories,1019,Accessories,large plastic hair claw.


In [12]:
#finding the most common words

from collections import Counter

# join all sentences together
all_desc = ' '.join(articoli["detail_desc"])

# tokenize
desc_words = all_desc.split()

# count freq of every token
word_count = Counter(desc_words)

# 20 most common token
top_words = word_count.most_common(30)

for word, count in top_words:
    print(f"{word}: {count}")

and: 160061
a: 151693
with: 150693
the: 135045
in: 105010
at: 80687
cotton: 33706
an: 24557
top: 24128
soft: 24063
jersey: 23023
hem.: 22381
front: 21169
sleeves: 19997
elasticated: 18343
back: 17938
fabric: 17532
long: 17181
zip: 15958
back.: 15909
waist: 15870
pockets: 14882
cuffs: 13819
side: 13543
weave: 13164
ribbing: 13063
short: 12828
of: 12678
down: 12580
waist,: 12241


In [13]:
#download pos tagger and punkt
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lucre\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lucre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
#import tokenizer and pos tagger
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag

## Frequent pos

In [15]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\lucre\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [16]:
#30 most frequent nouns and adjectives with universal tagger

words_with_noun = []
words_with_adj = []
for frase in articoli["detail_desc"]:
    tokens = word_tokenize(frase)
    pos_tags = pos_tag(tokens, tagset='universal')

    proper_nouns = [word for word, pos in pos_tags if pos == 'NOUN']
    adjectives = [ad for ad, pos in pos_tags if pos == 'ADJ']

    words_with_noun.extend(proper_nouns)
    words_with_adj.extend(adjectives)

# Find most frequent words
word_counts = Counter(words_with_noun)
most_common_nouns = word_counts.most_common(30)

adj_counts = Counter(words_with_adj)
most_common_adj = adj_counts.most_common(30)

# Print most frwquent words
print(" 30 Most frequent nouns")
for word, count in most_common_nouns:
    print(f"{word}: {count} volte")

print()
print(" 30 Most frequent Adjectives")
for ad, count in most_common_adj:
    print(f"{ad}: {count} volte")


 30 Most frequent nouns
cotton: 34229 volte
waist: 34074 volte
front: 30375 volte
sleeves: 27571 volte
jersey: 27128 volte
back: 25213 volte
hem: 25032 volte
pockets: 25014 volte
cuffs: 23650 volte
top: 18598 volte
fabric: 17668 volte
zip: 14058 volte
neckline: 12569 volte
button: 11790 volte
collar: 11283 volte
dress: 11124 volte
shoulder: 10755 volte
buttons: 10465 volte
legs: 10403 volte
fly: 9257 volte
straps: 9255 volte
weave: 9092 volte
seam: 8722 volte
blend: 8703 volte
elastication: 8515 volte
shoulders: 8492 volte
cm: 8415 volte
hems: 8226 volte
pocket: 7784 volte
sides: 7464 volte

 30 Most frequent Adjectives
soft: 35585 volte
short: 17455 volte
long: 16044 volte
elasticated: 15034 volte
wide: 12905 volte
top: 12020 volte
narrow: 11910 volte
adjustable: 11676 volte
side: 8806 volte
long-sleeved: 6645 volte
organic: 6619 volte
ribbed: 6556 volte
unlined: 6458 volte
decorative: 6319 volte
concealed: 6047 volte
woven: 5696 volte
front: 5679 volte
high: 5346 volte
small: 5050 vo

In [17]:
#30 most frequent nouns and adj without universal tagger

words_with_noun = []
words_with_adj = []
for frase in articoli["detail_desc"]:
    tokens = word_tokenize(frase)
    pos_tags = pos_tag(tokens)

    proper_nouns = [word for word, pos in pos_tags if pos == 'NN']
    adjectives = [ad for ad, pos in pos_tags if pos == 'JJ']

    words_with_noun.extend(proper_nouns)
    words_with_adj.extend(adjectives)

# Find most frequent words
word_counts = Counter(words_with_noun)
most_common_nouns = word_counts.most_common(30)

adj_counts = Counter(words_with_adj)
most_common_adj = adj_counts.most_common(30)

# Print most frequent words
print(" 30 Most frequent nouns")
for word, count in most_common_nouns:
    print(f"{word}: {count} volte")

print()
print(" 30 Most frequent Adjectives")
for ad, count in most_common_adj:
    print(f"{ad}: {count} volte")

 30 Most frequent nouns
cotton: 34229 volte
waist: 34074 volte
front: 30375 volte
jersey: 26425 volte
back: 25213 volte
hem: 25004 volte
top: 18598 volte
fabric: 17668 volte
zip: 14058 volte
neckline: 12569 volte
button: 11790 volte
collar: 11283 volte
dress: 11124 volte
shoulder: 10755 volte
fly: 9237 volte
weave: 9090 volte
seam: 8722 volte
blend: 8703 volte
elastication: 8515 volte
cm: 8410 volte
pocket: 7784 volte
sweatshirt: 7411 volte
v-neck: 6838 volte
skirt: 6751 volte
drawstring: 6689 volte
side: 6616 volte
imitation: 6208 volte
neck: 5984 volte
jacket: 5715 volte
jumper: 5658 volte

 30 Most frequent Adjectives
soft: 35585 volte
short: 17455 volte
long: 16044 volte
elasticated: 15034 volte
wide: 12905 volte
top: 12020 volte
narrow: 11910 volte
adjustable: 11676 volte
side: 8802 volte
long-sleeved: 6645 volte
organic: 6619 volte
ribbed: 6556 volte
unlined: 6458 volte
decorative: 6319 volte
concealed: 6047 volte
woven: 5696 volte
front: 5679 volte
high: 5346 volte
small: 5050 v

In [18]:
#remove stopwords from sentences
articoli["detail_desc"] = articoli["detail_desc"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [19]:
import re
# Remove non ascii characters
articoli["detail_desc"] = articoli["detail_desc"].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))

In [20]:
import string

def remove_punctuation(text):
    # Use a regular expression to remove all punctuation characters
    return re.sub(r'[^\w\s]', '', text)

articoli["detail_desc"] = articoli["detail_desc"].apply(remove_punctuation)


In [21]:
#import lemmatizer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lucre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
#lemmatization
articoli["detail_desc"] = articoli["detail_desc"].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

## Bigrams and Trigrams

In [23]:
from nltk.collocations import *
#most frequent bigrams
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

In [24]:
joined_sent = ' '.join(articoli["detail_desc"])
joined_tok = word_tokenize(joined_sent)

In [25]:
joined_tok[:20]

['jersey',
 'top',
 'narrow',
 'shoulder',
 'strap',
 'jersey',
 'top',
 'narrow',
 'shoulder',
 'strap',
 'jersey',
 'top',
 'narrow',
 'shoulder',
 'strap',
 'microfibre',
 'tshirt',
 'bra',
 'underwired',
 'moulded']

In [26]:
#find bigrams
finder_bi = BigramCollocationFinder.from_words(joined_tok)
common_big = finder_bi.nbest(bigram_measures.raw_freq, 1000)

In [27]:
#filter the bigrams

for bigram in common_big:
  if bigram[0] == "top":
    print(bigram)
  if bigram[1] == "top":
    print(bigram)

('top', 'soft')
('longsleeved', 'top')
('vest', 'top')
('top', 'sweatshirt')
('top', 'short')
('trim', 'top')
('zip', 'top')
('hem', 'top')
('top', 'fastdrying')
('top', 'long')
('jersey', 'top')
('top', 'narrow')
('elasticated', 'top')
('bikini', 'top')
('fitted', 'top')
('button', 'top')
('top', 'ribbed')
('shortsleeved', 'top')
('sport', 'top')
('top', 'lightweight')
('top', 'cotton')
('inside', 'top')
('top', 'organic')
('sleeveless', 'top')
('flounce', 'top')
('top', 'printed')
('top', 'jersey')
('top', 'airy')
('elastication', 'top')
('content', 'top')
('pleat', 'top')
('cut', 'top')
('short', 'top')
('top', 'pair')
('vneck', 'top')
('lined', 'top')
('cropped', 'top')
('top', 'concealed')
('wrapover', 'top')
('offtheshoulder', 'top')
('top', 'wide')
('top', 'sock')
('top', 'fineknit')
('top', 'elasticated')
('top', 'sturdy')
('top', 'lined')
('opening', 'top')
('tie', 'top')
('pompom', 'top')


In [28]:
#find trigrams
finder_tri = TrigramCollocationFinder.from_words(joined_tok)
common_tri = finder_tri.nbest(trigram_measures.raw_freq, 1000)

In [29]:
#filter trigrams

for trigram in common_tri:
  if trigram[0] == "top":
    print(trigram)
  if trigram[1] == "top":
    print(trigram)

('longsleeved', 'top', 'soft')
('top', 'sweatshirt', 'fabric')
('top', 'soft', 'cotton')
('top', 'soft', 'jersey')
('top', 'soft', 'sweatshirt')
('top', 'fastdrying', 'functional')
('top', 'soft', 'organic')
('top', 'long', 'sleeve')
('sport', 'top', 'fastdrying')
('vest', 'top', 'soft')
('top', 'cotton', 'jersey')
('top', 'soft', 'printed')
('top', 'lightweight', 'sweatshirt')
('longsleeved', 'top', 'sweatshirt')
('top', 'organic', 'cotton')
('top', 'narrow', 'shoulder')
('top', 'ribbed', 'jersey')
('top', 'narrow', 'adjustable')
('top', 'printed', 'sweatshirt')
('hem', 'top', 'soft')
('elasticated', 'top', 'sock')
('top', 'soft', 'viscose')
('top', 'sock', 'soft')
('fitted', 'top', 'soft')
('inside', 'top', 'sweatshirt')
('shortsleeved', 'top', 'soft')
('vest', 'top', 'fastdrying')
('top', 'soft', 'ribbed')
('top', 'short', 'sleeve')
('elasticated', 'top', 'fineknit')
('content', 'top', 'organic')
('top', 'dropped', 'shoulder')
('hem', 'top', 'lightweight')
('padded', 'top', 'edge')


In [30]:
finder_tri.apply_word_filter(lambda w: w not in ('top'))
sorted(finder_tri.score_ngrams(trigram_measures.raw_freq))

[]