# Cleaning the product description data

In [13]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk

## Download the NLTK English stopwords and WordNet data, and create the stemmer and lemmatizer

In [24]:
# Fetch the NLTK corpora needed below: the stopword list and WordNet
# (used by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')
# Keep the stopwords in a set: remove_stopwords tests membership once per
# word, and a set lookup is O(1) where the original list was a linear scan.
stop = set(stopwords.words('english'))
# Shared stemmer / lemmatizer instances used by stemmer() and lemmatizer().
porter = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hritvik.patwa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hritvik.patwa\AppData\Roaming\nltk_data...


##### Data extraction and initial cleaning are done in the Create_data_api file; the products_description.csv file contains only the product type and its description.

In [22]:
# Location of the input CSV that read_data() loads.
# NOTE(review): despite the URL_ prefix this is a local, machine-specific
# absolute Windows path; point it at your own copy of the file before running.
URL_DATA = r'C:\Users\hritvik.patwa\Downloads\Projects\Product-Categorization-NLP-master\data\products_description.csv'

Group the data into a smaller number of categories

In [7]:
def grouping_data(df):
    """Collapse fine-grained product types into broader categories.

    The ``product_type`` column of ``df`` is rewritten in place and the
    same DataFrame object is returned. Product types that are not listed
    below are left unchanged.
    """
    # (target category, product types folded into it), applied in this order.
    merged_categories = (
        ('lipstick', ['lipstick', 'lip_liner']),
        ('contour', ['blush', 'bronzer']),
        ('eye_makeup', ['eyeliner', 'eyeshadow', 'mascara', 'eyebrow']),
    )
    for target, members in merged_categories:
        in_group = df['product_type'].isin(members)
        df.loc[in_group, 'product_type'] = target
    return df

Function to remove punctuation
The `str.maketrans()` method returns a mapping table that can be used with the `translate()` method to replace specified characters. Here the first two parameters are empty strings, so no characters are replaced, and every character listed in the third parameter (`string.punctuation`) is removed from the string.

In [9]:
def remove_punctuation(description):
    """Return ``description`` with every ``string.punctuation`` character deleted.

    Characters are removed, not replaced by spaces, so words joined only by
    punctuation are merged (e.g. ``long-lasting`` becomes ``longlasting``).
    """
    # Empty first two arguments: nothing is substituted; the third argument
    # lists the characters that translate() drops.
    return description.translate(str.maketrans('', '', string.punctuation))

Function to remove stopwords

In [12]:
def remove_stopwords(text):
    """Lowercase ``text`` and drop every word found in the ``stop`` collection.

    The text is split on whitespace; the surviving lowercased words are
    joined back together with single spaces.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered not in stop:
            kept.append(lowered)
    return " ".join(kept)

Function to apply stemming (https://www.guru99.com/stemming-lemmatization-python-nltk.html)

In [11]:
def stemmer(stem_text):
    """Stem each whitespace-separated word of ``stem_text`` with ``porter``.

    Returns the stemmed words joined by single spaces.
    """
    stems = []
    for token in stem_text.split():
        stems.append(porter.stem(token))
    return " ".join(stems)

Function to apply lemmatization

In [16]:
def lemmatizer(lem_text):
    """Lemmatize each whitespace-separated word of ``lem_text``.

    Every word is passed to ``wordnet_lemmatizer.lemmatize`` and the results
    are joined by single spaces.
    """
    return " ".join(
        wordnet_lemmatizer.lemmatize(token) for token in lem_text.split()
    )

In [27]:
def read_data(path, normalizer=None):
    """Read the product CSV and clean its ``description`` column.

    Parameters
    ----------
    path : str or file-like
        Location of the CSV, passed straight to ``pd.read_csv``. The first
        row is the header and the first column is used as the index.
    normalizer : callable, optional
        Function applied to each description after punctuation and stopword
        removal. Defaults to ``stemmer``; pass ``lemmatizer`` to lemmatize
        instead.

    Returns
    -------
    pandas.DataFrame
        The frame with grouped ``product_type`` values and cleaned
        ``description`` strings.
    """
    if normalizer is None:
        normalizer = stemmer
    df = pd.read_csv(path, header=0, index_col=0)
    data = grouping_data(df)
    data['description'] = (
        data['description']
        # Missing descriptions would otherwise become the literal token 'nan'
        # once cast to str and leak into the cleaned text.
        .fillna('')
        .astype(str)
        .apply(remove_punctuation)
        .apply(remove_stopwords)
        .apply(normalizer)
    )
    return data

In [25]:
# Load and clean the product descriptions. As defined in this notebook,
# read_data() stems the text, so `dataset` holds stemmed descriptions.
dataset = read_data(URL_DATA)

In [30]:
# Write the cleaned frame without its index column.
# NOTE(review): the file name says "lemmatized", but `dataset` comes from
# read_data(), which as defined in this notebook applies stemmer() and never
# lemmatizer() -- confirm the file really contains lemmatized text.
dataset.to_csv('lemmatized_product.csv', index= False, encoding='utf-8')

In [28]:
# Second load of the same file through the same read_data() call used for
# `dataset`; the result is the stemmed version of the descriptions.
dataset2 = read_data(URL_DATA)

In [31]:
# Write the stemmed descriptions to CSV without the index column.
dataset2.to_csv('stemmed_product.csv', index= False, encoding='utf-8')