In [0]:
# Mount Google Drive at /content/drive; the CSV files read and written by
# this notebook all live under that mount point (Colab-only dependency).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import re
from datetime import datetime

import nltk
import numpy as np
import pandas as pd
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords

In [0]:
# Wall-clock start of the notebook run; the last cell subtracts it from the
# end time. Uses the standard-library datetime directly because the
# `pd.datetime` alias was deprecated in pandas 1.0 and removed in pandas 2.0.
start = datetime.now()

  """Entry point for launching an IPython kernel.


## Data Cleaning

In [0]:
# Load the raw product table, discard bookkeeping columns, lower-case every
# remaining column and keep a single row per product_id (the first one seen).
df_SQL = pd.read_csv('/content/drive/My Drive/NLP/full_data.csv')
df_SQL = df_SQL.drop(columns=['created_at', 'updated_at', 'deleted_at', 'bc_product_id'])
for column_name in df_SQL.columns:
    # NOTE(review): the .str accessor presumably requires every remaining
    # column to hold strings — confirm against the CSV schema.
    df_SQL[column_name] = df_SQL[column_name].str.lower()
df_SQL = df_SQL.drop_duplicates(subset=['product_id'], keep='first')
df_SQL.shape

(48072, 9)

In [0]:
# Load the tagged attribute table and normalise it: lower-case every column,
# strip separator/punctuation characters from attribute names and values,
# drop the `file` column and remove fully duplicated rows.
tag_SQL = pd.read_csv('/content/drive/My Drive/NLP/tagged_product_attributes.csv')

for columns in tag_SQL.columns:
    tag_SQL[columns] = tag_SQL[columns].str.lower()

# Characters removed from attribute names/values so that variants such as
# "business casual" and "business-casual" collapse to the same token.
remove = [' ', '_', '(', ')', '-', ',', '&', '"', '"', '/']
for i in remove:
    # regex=False makes the literal match explicit: '(' and ')' are regex
    # metacharacters and the default of `regex` differs between pandas
    # versions, so relying on the default is fragile.
    tag_SQL['attribute_name'] = tag_SQL['attribute_name'].str.replace(i, '', regex=False)
    tag_SQL['attribute_value'] = tag_SQL['attribute_value'].str.replace(i, '', regex=False)

tag_SQL.drop(columns='file', inplace=True)
# Only rows that are identical in every remaining column count as duplicates.
tag_SQL.drop_duplicates(keep='first', inplace=True)
tag_SQL.shape

(97950, 4)

In [0]:
# Attach the tagged attributes to each product (inner join on product_id);
# the colour id is not needed for the join result.
tags_without_color = tag_SQL.drop(columns='product_color_id')
df_join = df_SQL.merge(tags_without_color, how='inner', on='product_id')

# Keep only the rows whose attribute is one of the four we model.
focus_attribute = ['style', 'occasion', 'category', 'fit']
is_focus_row = df_join['attribute_name'].isin(focus_attribute)
df_clean = df_join[is_focus_row].reset_index(drop=True)

# One column per focus attribute: holds the row's attribute_value when the
# row carries that attribute, otherwise None (blanked to '' just below).
for att in focus_attribute:
    df_clean[att] = np.where(df_clean['attribute_name'] == att, df_clean['attribute_value'], None)
df_clean.replace(np.nan, '', regex=True, inplace=True)
df_clean.drop_duplicates(inplace=True)
# NOTE(review): reset_index without drop=True keeps the previous index as a
# new 'index' column (it shows up in the head() output later in the notebook);
# left as-is because consumers of the saved CSV may expect that column.
df_clean.reset_index(inplace=True)
df_clean.shape

(26584, 16)

## Preprocessing with regex

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

# Corpus-wide token frequencies of the product descriptions, used below to
# inspect the most common words before any text clean-up.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_clean.description)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both so the cell runs on old and new versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Sum the columns of the sparse matrix directly instead of materialising a
# dense (documents x vocabulary) array just to add it up.
temp = pd.Series(np.asarray(X.sum(axis=0)).ravel(), index=feature_names)

In [0]:
# Fifty most frequent tokens across all descriptions.
temp.sort_values(ascending=False).iloc[:50]

and           21434
the           20116
with          16651
in            10961
this          10331
to             9654
of             8717
for            8451
is             7910
from           6269
that           4958
on             4743
an             4351
fit            4303
leather        4066
cotton         4038
style          3808
made           3555
your           3521
it             3233
designed       2958
cut            2946
100            2824
waist          2804
at             2795
high           2675
silhouette     2635
you            2540
silk           2509
look           2489
size           2456
front          2407
classic        2381
denim          2295
these          2273
length         2189
sleeves        2142
polyester      2129
top            2097
soft           2084
stretch        2041
back           2008
jeans          1971
dress          1924
are            1914
button         1900
features       1870
black          1860
body           1831
heel           1822


In [0]:
# Next part of the frequency ranking: positions 50 through 100 inclusive
# (the slice end is exclusive, so this shows 51 tokens).
ranked = temp.sort_values(ascending=False)
ranked.iloc[50:101]

up            1749
or            1742
wool          1710
inspired      1676
model         1664
pants         1635
perfect       1560
sweater       1546
cropped       1545
by            1544
wash          1530
relaxed       1492
blend         1462
shirt         1438
clean         1431
crafted       1430
leg           1388
knit          1372
tall          1365
skirt         1352
wears         1351
cm            1337
toe           1335
pair          1312
slim          1265
hem           1253
cashmere      1230
print         1229
neck          1225
wide          1219
pockets       1201
easy          1184
wear          1181
belt          1177
white         1166
ankle         1142
jacket        1139
rise          1123
lining        1120
our           1118
has           1108
flattering    1104
tee           1090
imported      1086
its           1074
as            1068
bag           1067
dry           1056
oversized     1029
into          1022
chic          1010
dtype: int64

In [0]:
# Ordered rewrite rules for product descriptions: digits are removed, then
# specific garment words are collapsed onto one token per category
# ('bottom', 'top', 'shoe', 'onepiece'). Patterns are compiled once here
# rather than on every row.
_DESCRIPTION_RULES = [
    (re.compile(r'([0-9]+)'), ''),
    (re.compile(r'\b(jeans|pants|skirt|shorts|leggings|trousers)\b'), 'bottom'),
    (re.compile(r'\b(sweater|shirt|jacket|tshirt|coat|blazer|cardigan|hoodie)\b'), 'top'),
    (re.compile(r'\b(sneakers|boots|flats|heels|slippers|sandals)\b'), 'shoe'),
    (re.compile(r'\b(dress|one piece|jumpsuit)\b'), 'onepiece'),
]


def normalize_description(text):
    """Apply every rule in _DESCRIPTION_RULES to `text`, in order.

    Parameters
    ----------
    text : str
        A single product description.

    Returns
    -------
    str
        The description with digits removed and garment words replaced by
        their category token.
    """
    for pattern, replacement in _DESCRIPTION_RULES:
        text = pattern.sub(replacement, text)
    return text


# Column-wise apply replaces the original per-row .loc read/write loop: same
# result, but it does not depend on the index being exactly 0..n-1 and avoids
# one label-based lookup and assignment per row.
df_clean['description'] = df_clean['description'].apply(normalize_description)

In [0]:
# Remove digit runs from the remaining free-text columns.
# regex=True is passed explicitly: from pandas 2.0 the default is regex=False,
# under which the pattern would be treated as a literal string and nothing
# would be removed.
for text_column in ['product_full_name', 'details', 'brand_category']:
    df_clean[text_column] = df_clean[text_column].str.replace(r'([0-9]+)', '', regex=True)

## Removing stopwords, lemmatization

In [0]:
# WordNetLemmatizer is used by lemmatize() below.
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK data packages this notebook relies on: the stop-word lists
# and the WordNet corpus (already-present packages are reported as up-to-date).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
# Build one regex alternation covering the gensim and NLTK English stop words.
stopwords_gensim = list(STOPWORDS)
stopwords_NLTK = list(stopwords.words("english"))

# Words that must survive stop-word removal: the negations are kept for a more
# accurate analysis, and 'bottom' / 'top' are the category tokens that the
# description rewrite substitutes for garment words.
negatives = ['not','nor','no','neither', 'never', 'bottom', 'top']

# The set removes words present in both lists; sorting gives a stable order.
stopwords_combined = sorted(
    word for word in set(stopwords_gensim + stopwords_NLTK) if word not in negatives
)
stopwords_expression = '|'.join(stopwords_combined)
stopwords_pattern = '(' + stopwords_expression + ')'

In [0]:
def lemmatize(text):
    """Lemmatize each whitespace-separated token of `text`.

    Tokens are passed to WordNetLemmatizer.lemmatize one at a time with no
    part-of-speech argument, then re-joined with single spaces (so runs of
    whitespace in the input collapse to one space).

    Parameters
    ----------
    text : str
        Text to lemmatize.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    wordnet = WordNetLemmatizer()
    return ' '.join(wordnet.lemmatize(word) for word in text.split())

def stem_text(text):
    """Stem each whitespace-separated token of `text` with the Porter stemmer.

    Parameters
    ----------
    text : str
        Text to stem.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    # PorterStemmer is not imported anywhere else in the notebook, so the
    # original function raised NameError when called; import it locally.
    from nltk.stem import PorterStemmer

    porter = PorterStemmer()
    return ' '.join(porter.stem(token) for token in text.split())

In [0]:
# Final text normalisation of the free-text columns: punctuation -> space,
# newlines -> space, stop words removed, then every token lemmatized.
# regex=True is passed explicitly on each replace: from pandas 2.0 the default
# is regex=False, which would treat these patterns as literal strings and
# silently skip all three clean-up steps.
col = ['product_full_name','description','brand_category','details']
for i in col:
    cleaned = df_clean[i].astype(str)
    cleaned = cleaned.str.replace(r'[^\w\s]', ' ', regex=True)
    cleaned = cleaned.str.replace(r'\n', ' ', regex=True)
    cleaned = cleaned.str.replace(rf'\b{stopwords_pattern}\b', '', regex=True)
    # lemmatize() splits on whitespace and re-joins with single spaces, which
    # also collapses the gaps left by the removals above.
    df_clean[i] = cleaned.apply(lemmatize)

In [0]:
# Spot-check the first rows after cleaning.
df_clean.head()

Unnamed: 0,index,product_id,brand,mpn,product_full_name,description,brand_category,brand_canonical_url,details,labels,attribute_name,attribute_value,style,occasion,category,fit
0,0,01e5zxp5h0btezt9qd2hrzj47a,a.l.c.,5529544,lennox high waist cotton linen pant,high rise bottom tailored cool italian cotton ...,unknown,https://shop.nordstrom.com/s/a-l-c-lennox-high...,true size high rise inseam leg opening rise ri...,[],style,modern,modern,,,
1,1,01e5zxp5h0btezt9qd2hrzj47a,a.l.c.,5529544,lennox high waist cotton linen pant,high rise bottom tailored cool italian cotton ...,unknown,https://shop.nordstrom.com/s/a-l-c-lennox-high...,true size high rise inseam leg opening rise ri...,[],style,businesscasual,businesscasual,,,
2,2,01e5zxp5h0btezt9qd2hrzj47a,a.l.c.,5529544,lennox high waist cotton linen pant,high rise bottom tailored cool italian cotton ...,unknown,https://shop.nordstrom.com/s/a-l-c-lennox-high...,true size high rise inseam leg opening rise ri...,[],style,classic,classic,,,
3,3,01e5zxp5h0btezt9qd2hrzj47a,a.l.c.,5529544,lennox high waist cotton linen pant,high rise bottom tailored cool italian cotton ...,unknown,https://shop.nordstrom.com/s/a-l-c-lennox-high...,true size high rise inseam leg opening rise ri...,[],occasion,work,,work,,
4,4,01e5zxp5h0btezt9qd2hrzj47a,a.l.c.,5529544,lennox high waist cotton linen pant,high rise bottom tailored cool italian cotton ...,unknown,https://shop.nordstrom.com/s/a-l-c-lennox-high...,true size high rise inseam leg opening rise ri...,[],category,bottom,,,bottom,


In [0]:
# Persist the cleaned table to Drive.
# NOTE(review): no index=False is passed, so the row index is written as an
# extra leading column — confirm whether readers of this file rely on it.
df_clean.to_csv('/content/drive/My Drive/NLP/df_clean.csv')

In [0]:
# Report total wall-clock time since `start` was recorded at the top of the
# notebook. Uses the standard-library datetime because the `pd.datetime`
# alias was deprecated in pandas 1.0 and removed in pandas 2.0.
end = datetime.now()
print(end-start)

0:00:38.742760


  """Entry point for launching an IPython kernel.
