# 2. Corpus Preprocessing

## 2.1. Load data

In [None]:
from google.colab import drive
# Attach Google Drive to this runtime so files stored there can be read and written
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd

# Project root inside the mounted Drive; every input and output path hangs from it
base_path = '/content/drive/MyDrive/nlp/amazon'

# Resolve the locations of the raw train / test splits, then load each into a DataFrame
train_csv_path = base_path + '/data/pet_supplies_train.csv'
test_csv_path = base_path + '/data/pet_supplies_test.csv'
reviews_train_df = pd.read_csv(train_csv_path)
reviews_test_df = pd.read_csv(test_csv_path)

## 2.2. Preprocess data

In [None]:
!pip install num2words

Collecting num2words
  Downloading num2words-0.5.12-py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt>=0.6.2 (from num2words)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13705 sha256=9d59934a334ea760437eb6e7ad24ea39a69fea4ae4a8f390c7bbfcdda745bffc
  Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac
Successfully built docopt
Installing collected packages: docopt, num2words
Successfully installed docopt-0.6.2 num2words-0.5.12


In [None]:
from string import punctuation
from num2words import num2words
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# Resources used by word_tokenize, pos_tag, the English stopword list and the WordNet
# lemmatizer. NLTK 3.9+ loads 'punkt_tab' and 'averaged_perceptron_tagger_eng' in place
# of the older pickled 'punkt' / 'averaged_perceptron_tagger' models, so both generations
# are fetched to keep the notebook runnable on old and new runtimes alike.
for nltk_resource in ('punkt', 'punkt_tab',
                      'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
                      'stopwords', 'wordnet'):
  nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Lookup from Treebank tag prefixes to WordNet POS constants.
# Keys are two characters long because wordnet_pos_tag looks tags up by their first two characters.
tag_map = {
  # Looked up as nouns
  'CD': wordnet.NOUN,
  'NN': wordnet.NOUN,
  # Looked up as verbs
  'VB': wordnet.VERB,
  # Looked up as adjectives
  'JJ': wordnet.ADJ,
  'PD': wordnet.ADJ,
  'RP': wordnet.ADJ,
  # Looked up as adverbs
  'EX': wordnet.ADV,
  'IN': wordnet.ADV,
  'RB': wordnet.ADV,
}

def wordnet_pos_tag(tokens, tag_map=tag_map):
  '''Pair every token with a WordNet POS tag.

  The tokens are tagged with nltk.pos_tag and the first two characters of each
  resulting tag are looked up in tag_map.

  Returns:
    A list of (token, wordnet_tag) tuples; wordnet_tag is None when the tag
    prefix has no entry in tag_map.
  '''
  tagged_tokens = []
  for word, penn_tag in nltk.pos_tag(tokens):
    wordnet_tag = tag_map.get(penn_tag[:2])
    tagged_tokens.append((word, wordnet_tag))
  return tagged_tokens

In [None]:
def lemmatization(token, tag, lemmatizer):
  '''Return the lemma of token for the given POS tag.

  When tag is None the token is handed back untouched; otherwise the work is
  delegated to lemmatizer.lemmatize(token, tag).
  '''
  if tag is None:
    return token
  return lemmatizer.lemmatize(token, tag)

In [None]:
def num_to_words(token):
  '''Spell out a token made only of decimal digits; return any other token unchanged.

  str.isdecimal() is used rather than str.isdigit() because isdigit() also accepts
  digit-like characters such as superscript two (U+00B2), which cannot be converted
  to a number and would make the num2words call fail.
  '''
  return num2words(token) if token.isdecimal() else token

In [None]:
# Preprocessing pipeline
def clean_text(text,
               tokenizer=nltk.word_tokenize,
               punctuation=punctuation,
               stopwords=stopwords.words('english'),
               lemmatizer=WordNetLemmatizer()):
  '''Normalise a raw review into a space-separated string of cleaned tokens.

  Steps: lowercase -> tokenize -> POS-tag -> strip punctuation -> lemmatize ->
  drop stopwords -> spell out digit-only tokens.

  Args:
    text: Raw review text. Anything that is not a str (e.g. a NaN coming from a
      missing value) is treated as an empty review.
    tokenizer: Callable turning a string into a list of tokens.
    punctuation: String whose characters are deleted from every token.
    stopwords: Iterable of words discarded after lemmatization.
    lemmatizer: Object exposing lemmatize(word, pos).

  Returns:
    The surviving tokens joined by single spaces; '' when nothing survives.
  '''
  # Missing values have no .lower(); return '' so callers can filter them out
  if not isinstance(text, str):
    return ''

  # Loop-invariant work: build the deletion table and a constant-time stopword lookup once
  strip_punctuation = str.maketrans('', '', punctuation)
  stopword_set = set(stopwords)

  words = []
  # Convert text to lowercase, tokenize and tag text
  for token, tag in wordnet_pos_tag(tokenizer(text.lower())):
    # Remove punctuation marks; tokens made only of punctuation end up empty and are skipped
    stripped = token.translate(strip_punctuation)
    if not stripped:
      continue
    # Lemmatize word
    lemma = lemmatization(stripped, tag, lemmatizer)
    # Remove stopwords
    if lemma and lemma not in stopword_set:
      # Convert numbers to words
      words.append(num_to_words(lemma))
  return ' '.join(words)

In [None]:
# Train split: clean every review, then discard rows whose cleaned text came out empty
reviews_train_df['preprocReviewText'] = reviews_train_df['reviewText'].map(clean_text)
empty_train_index = reviews_train_df.index[reviews_train_df['preprocReviewText'] == '']
reviews_train_df.drop(index=empty_train_index, inplace=True)

# Test split: same treatment
reviews_test_df['preprocReviewText'] = reviews_test_df['reviewText'].map(clean_text)
empty_test_index = reviews_test_df.index[reviews_test_df['preprocReviewText'] == '']
reviews_test_df.drop(index=empty_test_index, inplace=True)

In [None]:
# Keep only the label, the cleaned text and the raw text, then summarise the frame
train_columns = ['sentiment', 'preprocReviewText', 'reviewText']
reviews_train_df = reviews_train_df[train_columns]
reviews_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16226 entries, 0 to 16232
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   sentiment          16226 non-null  int64 
 1   preprocReviewText  16226 non-null  object
 2   reviewText         16226 non-null  object
dtypes: int64(1), object(2)
memory usage: 507.1+ KB


In [None]:
# Keep only the label, the cleaned text and the raw text, then summarise the frame
test_columns = ['sentiment', 'preprocReviewText', 'reviewText']
reviews_test_df = reviews_test_df[test_columns]
reviews_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4057 entries, 0 to 4058
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   sentiment          4057 non-null   int64 
 1   preprocReviewText  4057 non-null   object
 2   reviewText         4057 non-null   object
dtypes: int64(1), object(2)
memory usage: 126.8+ KB


## 2.3. Save preprocessed data

In [None]:
# Write both preprocessed splits next to the raw data, without the index column
train_output_path = base_path + '/data/preproc_pet_supplies_train.csv'
test_output_path = base_path + '/data/preproc_pet_supplies_test.csv'
reviews_train_df.to_csv(train_output_path, index=False)
reviews_test_df.to_csv(test_output_path, index=False)

## 2.4. Comments

Resumen del preprocesamiento del corpus:
1. Transformación a minúsculas.
2. Tokenización con `nltk.word_tokenize` (basado en `TreebankWordTokenizer`).
3. Etiquetado gramatical (POS) de los tokens con el objetivo de hacer una mejor lematización; para ello ha habido que transformar las etiquetas por defecto (Treebank) a las etiquetas compatibles con el lematizador (WordNet).
4. Eliminación de signos de puntuación.
5. Lematización con `WordNetLemmatizer`.
6. Eliminación de stopwords.
7. Transformación de números a palabras.