# NLP Pre-Processing Demo

In [None]:
import numpy as np
import pandas as pd

Load some data. IMDB data available [here.](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?resource=download)

In [None]:
imdb_df = pd.read_csv("file_path_here")

First glance, define some printing functions for easier viewing.

In [None]:
# print the first n items from the series
def print_first_n(series, n=10):
    """Print the first ``n`` items of ``series`` as a numbered list.

    A header line is printed first; the header and every item are each
    followed by one blank line.
    """
    print(f"First n sample (n = {n})", end="\n\n")
    for position, text in enumerate(series[:n], start=1):
        # end="\n\n" terminates the line and adds the blank separator line
        print(f"{position}) {text}", end="\n\n")

# print a random sample from the series
def print_sample(series, n=10, random_state=None):
    """Print ``n`` randomly chosen items of ``series`` as a numbered list.

    Parameters
    ----------
    series : pandas.Series
        Items to draw from; must provide a pandas-style ``.sample`` method.
    n : int, default 10
        Number of items to draw (without replacement).
    random_state : optional
        Forwarded to ``series.sample``. Pass a fixed seed to print the same
        sample on every run; the default (``None``) keeps the draw random.
    """
    print(f"Random sample (n = {n})", end="\n\n")
    drawn = series.sample(n, replace=False, random_state=random_state)
    for i, sent in enumerate(drawn, start=1):
        print(f"{i}) {sent}")
        print()

In [None]:
# make a copy of the reviews
reviews = imdb_df['review'].copy()

In [None]:
# look at the first 10 reviews
print_first_n(reviews, 10)

## 1) Clean Data

In [None]:
import re

# remove the random breaks: <br /><br />
def remove_random_breaks(s):
    """Return ``s`` with every ``<br /><br />`` sequence replaced by a single space."""
    # substitute on the parameter `s` (the function has no other input)
    return re.sub("<br /><br />", " ", s)

# apply the function to all the reviews
reviews = reviews.apply(remove_random_breaks)

In [None]:
# print a random sample of 5 reviews
print_sample(reviews, 5)

## 2) Tokenize
Splitting a string up into words or meaningful/useful parts

In [None]:
# NLTK (Natural Language ToolKit) is a common package for doing language processing
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

#### For more info on NLTK, see [their documentation.](https://www.nltk.org/)

In [None]:
s = "I do not like green eggs and ham."

In [None]:
word_list = word_tokenize(s)

In [None]:
word_list

## 3) Remove stopwords (and punctuation)

In [None]:
# NLTK's basic stopwords lists
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# get the English stopwords
stop_words = set(stopwords.words('english'))

Customize the stopwords list

In [None]:
# for sentiment analysis, we may want to keep all the negative words
# i.e. we want to keep all the words below
not_stop_words = [
    "aren't",
    'below',
    'can',
    "couldn't",
    'did',
    "didn't",
    "doesn't",
    "don't",
    'few',
    "hadn't",
    "hasn't",
    "haven't",
    "isn't",
    'more',
    'most',
    "mustn't",
    "needn't",
    'no',
    'not',
    'once',
    'only',
    'should',
    "should've",
    "shouldn't",
    'so',
    "wasn't",
    "weren't",
    "won't",
    "wouldn't"
]

# remove the words above from the stop-words set
# (difference_update skips words that are not in the set, so this cell does not
# raise KeyError when it is re-run or when a word is missing from the list)
stop_words.difference_update(not_stop_words)

Add a set to identify punctuation

In [None]:
import string
# every single punctuation character, plus the ellipsis token '...'
punctuation = set(string.punctuation) | {'...'}

In [None]:
# only keep words that aren't stop words and not punctuation
# (compare in lower case: the stop words are lower case, so a capitalised
# token such as "I" would otherwise never match and slip through)
word_list = [w for w in word_list if (w.lower() not in stop_words and w not in punctuation)]

In [None]:
word_list

## 4) Lemmatize

In [None]:
nltk.download('omw-1.4')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

In [None]:
# initialize one of the lemmatizers
lemmatizer = WordNetLemmatizer()

In [None]:
lemmatizer.lemmatize('eggs')

In [None]:
[lemmatizer.lemmatize(word) for word in word_list]

## 5) Create a Representation

I.e. convert the words to numbers somehow.

### a) TF-IDF

In [None]:
sentence_list = [
    "I do not like green eggs and ham.",
    "I do not like them Sam-I-am.",
    "I do not like them here or there.",
    "I do not like them anywhere."
]

The scikit-learn package has a bunch of really good resources for ML in general, and it also has a TF-IDF implementation.

See the [scikit-learn docs](https://scikit-learn.org/stable/index.html) for details.

In [1]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

For more details on `TfidfVectorizer` see [here](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer).

In [2]:
# initialize the vectorizer
vectorizer = TfidfVectorizer()

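`TfidfVectorizer` does some text pre-processing by default: it lower-cases the text and tokenizes it, keeping only tokens of two or more alphanumeric characters (so punctuation is dropped). It does not remove stopwords or lemmatize unless you ask it to. See the docs for more details.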

In [None]:
# fit and transform the data
X = vectorizer.fit_transform(sentence_list)

In [None]:
# tell us what the vocabulary is
vectorizer.get_feature_names_out()

In [None]:
# X is a sparse array by default, so we want a dense one to see it properly.
X.todense()

Specify stop words.

You can give `TfidfVectorizer` a list of stopwords, or tell it to use its default ones for a given language, which is what we do here.

In [None]:
vectorizer = TfidfVectorizer(stop_words = 'english')

In [None]:
X = vectorizer.fit_transform(sentence_list)

In [None]:
vectorizer.get_feature_names_out()

#### Specify an entire pre-processing routine.

Sometimes you want more control over the pre-processing (specify your own stopwords, or lemmatization, or anything else).

Passing a function as the `analyzer` parameter of `TfidfVectorizer` tells it to skip its built-in pre-processing; the function you provide is applied to each raw document instead.

That's what we'll do here.

In [None]:
# tokenize, lemmatize, and remove stopwords
def tok_lem_stop(sentence):
    """Tokenize, lower-case, filter and lemmatize one sentence.

    Relies on the notebook-level ``word_tokenize``, ``stop_words``,
    ``punctuation`` and ``lemmatizer`` objects.

    Parameters
    ----------
    sentence : str
        Raw text to pre-process.

    Returns
    -------
    list of str
        Lemmatized, lower-case tokens with stopwords and punctuation removed.
    """
    # tokenize the sentence and convert everything to lower case
    tokens = [token.lower() for token in word_tokenize(sentence)]

    # remove stopwords and punctuation *before* lemmatizing: the lemmatizer
    # treats every word as a noun and can mangle a stopword into something
    # that is no longer in the stop-word set (e.g. "was" -> "wa")
    kept = [
        token for token in tokens
        if token not in stop_words and token not in punctuation
    ]

    # lemmatize the remaining words (as nouns only)
    return [lemmatizer.lemmatize(token) for token in kept]

In [None]:
# tell TfidfVectorizer to use our tok_lem_stop function for pre-processing instead of its default
vectorizer = TfidfVectorizer(analyzer=tok_lem_stop)

In [None]:
X = vectorizer.fit_transform(sentence_list)

In [None]:
vectorizer.get_feature_names_out()

In [None]:
X.todense()

### b) Word Embeddings

Converting words into vectors!

#### i) Custom trained

In [None]:
# number of reviews (taken from the start of `reviews`) used for the embedding demo
n = 50
# pre-process each of those reviews with tok_lem_stop; note that `word_list` is
# re-bound here: it is now a list of token lists (one per review), not the flat
# token list used in the earlier sections
word_list = [tok_lem_stop(sent) for sent in reviews[:n]]

In [None]:
# gensim is a topic modelling library for NLP and word2vec is a common word embedding algorithm
# gensim has both pre-trained and custom-trainable word2vec options
import gensim
from gensim.models import Word2Vec

In [None]:
# we can train a custom word2vec embedding based on our dataset (or a part of it, in this case, although feel free to include the whole dataset!)
model = Word2Vec(sentences=word_list, window=5, min_count=1, workers=4)

In [None]:
# get the embedding (i.e. vector) for a given word
model.wv['good']

In [None]:
# to get the embedding (vector) for a sentence, you can average over all the embeddings 
# of the words in the sentence
def get_average_embeddings(word_list, model):
    """Return one averaged word vector per sentence.

    Parameters
    ----------
    word_list : list of list of str
        Tokenized sentences.
    model : object with ``.wv`` and ``.vector_size``
        ``model.wv[word]`` must return the vector for ``word`` and
        ``word in model.wv`` must report whether a vector exists.

    Returns
    -------
    numpy.ndarray of shape ``(len(word_list), model.vector_size)``
        Row ``i`` is the mean of the vectors of the words in sentence ``i``.
        Words without a vector are skipped; a sentence with no known words
        (including an empty one) gets an all-zero row instead of NaN.
    """
    X = np.zeros((len(word_list), model.vector_size))

    for i, sent in enumerate(word_list):
        # skip out-of-vocabulary words rather than raising KeyError
        vectors = [model.wv[word] for word in sent if word in model.wv]

        # leave the row at zero when there is nothing to average
        # (the mean of zero vectors would be NaN)
        if vectors:
            # accumulate in float64, matching the dtype of X
            X[i] = np.mean(vectors, axis=0, dtype=np.float64)

    return X

In [None]:
avg_embeddings = get_average_embeddings(word_list, model)

#### ii) Pre-trained

Gensim has a bunch of pre-trained word embedding models available including word2vec and GloVe models.

In [None]:
import gensim.downloader

# Show all available models in gensim-data
print(list(gensim.downloader.info()['models'].keys()))

# Download the "glove-twitter-25" embeddings
# Note that running the below statement may take a minute or two.
# glove_vectors = gensim.downloader.load('glove-twitter-25')

### SpaCy

An out-of-the-box general purpose NLP library. 

For more info on SpaCy, see [their documentation](https://spacy.io/). I recommend doing the [**Spacy 101**](https://spacy.io/usage/spacy-101) part of their docs to familiarize yourself with it if you're just starting out. It is very easy to follow, and is a great introduction. Overall, SpaCy's docs are really good.

In [None]:
import spacy

In [None]:
# download spacy's core English model
!python -m spacy download en_core_web_sm

In [None]:
# load spacy's core English model
nlp = spacy.load("en_core_web_sm")

In [None]:
# apply spacy's NLP model to the example sentence
doc = nlp("This is an example.")

In [None]:
# print each token followed by a few of its attributes
# (printed label, name of the attribute read from the token)
fields = (
    ("POS", "pos_"),
    ("Lemma", "lemma_"),
    ("Is Stopword", "is_stop"),
    ("Is Punctuation", "is_punct"),
)
for tok in doc:
    print(tok)
    for label, attr in fields:
        print(f"\t{label}: {getattr(tok, attr)}")

### Differences between NLTK and SpaCy
- tokenizer is slightly different
- stopword lists are slightly different (SpaCy has more stopwords)
- NLTK is more flexible, but SpaCy is easier to use

In [None]:
# tokenizing using NLTK
word_tokenize("I do not like them Sam-I-am.")

In [None]:
# tokenizing using SpaCy: iterating over the processed text yields its tokens
list(nlp("I do not like them Sam-I-am."))