<a href="https://colab.research.google.com/github/Jiaye39/TimeSeriesAnalysis/blob/main/Pre_processing_5steps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre_processing_5steps

In [1]:
import requests

# Download the IMDB sentiment archive into memory.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of handing an error
# page to tarfile in the next cell.
r = requests.get(
    "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    timeout=60,
)
r.raise_for_status()
imdb_tgz = r.content

In [2]:
import io
import re
import tarfile

from tqdm.notebook import tqdm

# Only the labelled review files (train/test x pos/neg, *.txt) are extracted.
good_file = re.compile(r"^aclImdb/(test|train)/(pos|neg)/.*\.txt$")

with tarfile.open(fileobj=io.BytesIO(imdb_tgz), mode="r:gz") as tgz:
    all_members = tgz.getmembers()
    data_files = [m for m in all_members if m.isfile() and good_file.match(m.name) is not None]
    for f in tqdm(data_files):
        # filter="data" rejects unsafe members (absolute paths, links escaping the
        # destination, device files) and avoids the Python 3.12 DeprecationWarning
        # raised when no extraction filter is given.
        tgz.extract(f, filter="data")

  0%|          | 0/50000 [00:00<?, ?it/s]

  tgz.extract(f)


In [3]:
from sklearn.datasets import load_files
# Read every review of each split from disk as UTF-8 text.
train_data = load_files("./aclImdb/train", encoding="utf-8")
test_data = load_files("./aclImdb/test", encoding="utf-8")

In [4]:
# Integer class id -> class name, and the reverse lookup.
label2txt = dict(enumerate(train_data.target_names))
txt2label = {name: idx for idx, name in label2txt.items()}

In [5]:
type(train_data)

In [6]:
# Texts (X) and labels (y) for each split.
X_train = train_data.data
y_train = train_data.target
X_test = test_data.data
y_test = test_data.target

# The data
* `X_train` and `X_test` are `list` of 25000 `str` texts
* `y_train` and `y_test` are NumPy arrays of 25000 `int` labels, either `0` (negative review) or `1` (positive review)


In [7]:
import numpy as np

print("TRAIN data:")
print("class balance: ", np.bincount(y_train))
print()
print("TEST data:")
# Report the test split's own labels, not the training labels.
print("class balance: ", np.bincount(y_test))



TRAIN data:
class balance:  [12500 12500]

TEST data:
class balance:  [12500 12500]


In [8]:
# NOTE(review): eli5 is installed here but is not imported in any of the cells shown; confirm it is still needed.
!pip install eli5

Collecting eli5
  Downloading eli5-0.16.0-py2.py3-none-any.whl.metadata (18 kB)
Downloading eli5-0.16.0-py2.py3-none-any.whl (108 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.4/108.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: eli5
Successfully installed eli5-0.16.0


In [9]:
import nltk
# Fetch the NLTK tokenizer data packages (unzipped under tokenizers/);
# presumably these are what word_tokenize relies on.
nltk.download("punkt")
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [10]:
from nltk.tokenize import word_tokenize

In [11]:
import nltk
# Stop-word lists used by the stop-word removal steps later in the notebook.
nltk.download("stopwords")

# gensim provides the Dictionary class used for the vocabulary statistics.
!pip install gensim
from gensim.corpora import Dictionary

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [12]:
import rich

from rich.console import Console
from rich.table import Table

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocessing

In [14]:
def create_table(dictionary: Dictionary, n: int = 10) -> Table:
    """Build a rich table of the ``n`` most frequent tokens in ``dictionary``.

    All statistics are read from the ``dictionary`` argument, not from a
    notebook-level global, so the table always describes the dictionary
    that was passed in.

    Args:
        dictionary: vocabulary whose ``num_docs``, ``num_pos``, ``dfs``,
            ``token2id`` and ``most_common`` are used for the statistics.
        n: number of rows, i.e. how many of the most frequent tokens to list.

    Returns:
        A ``Table`` with one row per token: corpus frequency, share of all
        tokens, document frequency and share of all documents.
    """
    table = Table(
        title=(
            f"Top-{n} Most Frequent Tokens ({dictionary.num_docs} documents, "
            f"{dictionary.num_pos} tokens, {len(dictionary)} words in dictionary)"
        )
    )

    table.add_column("Token", justify="left", no_wrap=True)
    table.add_column("Corpus Frequency", justify="right")
    table.add_column("% of Tokens", justify="right")
    table.add_column("Document Frequency", justify="right")
    table.add_column("% of Documents", justify="right")

    for token, frequency in dictionary.most_common(n=n):
        percent_tokens = frequency / dictionary.num_pos
        # dfs is keyed by token id, so translate the token string first.
        doc_frequency = dictionary.dfs[dictionary.token2id[token]]
        percent_doc = doc_frequency / dictionary.num_docs
        table.add_row(token, str(frequency), f"{percent_tokens:.1%}", str(doc_frequency), f"{percent_doc:.1%}")

    return table

## 1st Pre-Processing: Lowercase

In [15]:
def lowercase(text: str) -> list[str]:
    """Split ``text`` into tokens and lowercase each of them.

    Args:
        text: raw text to tokenize.

    Returns:
        tokens: the lowercased tokens, in their original order
    """
    lowered = []
    for token in word_tokenize(text):
        lowered.append(token.lower())
    return lowered

In [16]:
# Step 1: tokenize and lowercase every training review (tqdm shows progress).
X_train_tokenized = [lowercase(x) for x in tqdm(X_train)]

  0%|          | 0/25000 [00:00<?, ?it/s]

In [17]:
# Build the vocabulary and its frequency statistics from the tokenized reviews.
d = Dictionary(X_train_tokenized)

In [18]:
# Print the 20 most frequent tokens and export the rendered table as an SVG file.
# record=True is set so that save_svg can export what was printed.
console = Console(record=True, width=80)
t = create_table(d, 20)
console.print(t, justify="center")
console.save_svg("all_vocab.svg", title="Vocabulary")

## 2nd Preprocessing: lowercase + stopping

In [19]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
from nltk.corpus import stopwords as nltk_stopwords

# Import the corpus reader under its own name so that `stopwords` only ever
# refers to the set of English stop words used by the preprocessing functions.
stopwords = set(nltk_stopwords.words("english"))

In [21]:
print(stopwords)

{'mightn', 'me', 'these', "don't", 'if', "we're", 'was', "won't", 'the', "that'll", "we've", "you're", 'all', "they're", 'own', 'yours', 'by', 'it', 'ourselves', "doesn't", 'won', 'she', "needn't", 'whom', 'up', 'does', "couldn't", 'itself', 'below', "she'd", 'those', "i'll", "wasn't", 'in', 'to', 'having', 'him', 'your', 'are', "he'll", 'were', 'against', 'or', 'further', 'before', 'at', "he'd", 'ours', 'hers', "it'd", 'here', 'i', 'nor', 'doesn', 'm', 'yourselves', "weren't", 'themselves', 'needn', 'because', "they've", 'above', 'them', "it'll", 'his', 'other', "they'll", 'have', 'do', 'been', 're', "i'm", 'that', "you've", 'wouldn', 'some', 'very', 'herself', 'isn', 'doing', 'through', 'an', 'but', 'ma', 'can', 'for', 'more', 'on', 'myself', 'so', 'shan', 'about', 'be', 'did', 'a', 'no', "you'd", "she'll", 'd', 'any', 'ain', 'our', "you'll", 'each', 'my', "i'd", 'hasn', 'as', 't', "he's", 'their', 'you', 'only', 'hadn', "aren't", 'too', 'y', "mustn't", 'this', 'couldn', 'll', 'off',

In [22]:
def lower_stop(text: str, stopwords: set[str]) -> list[str]:
    """Tokenize ``text``, lowercase it and filter out stop words.

    Args:
        text: raw text to tokenize.
        stopwords: lowercase tokens to discard.

    Returns:
        tokens: the lowercased tokens that are not in ``stopwords``
    """
    kept = []
    for token in lowercase(text):
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [23]:
# Step 2: lowercase + stop-word removal on every training review.
X_train_tokenized = [lower_stop(text=x, stopwords=stopwords) for x in tqdm(X_train)]

  0%|          | 0/25000 [00:00<?, ?it/s]

In [24]:
# Rebuild the vocabulary after stop-word removal and print its 20 most frequent tokens.
console = Console(record=True, width=120)
d = Dictionary(X_train_tokenized)
t = create_table(d, 20)
console.print(t)


## 3rd Preprocessing: accept only alphabetical tokens

In [25]:
def lower_stop_only_alpha(text: str, stopwords: set[str]) -> list[str]:
    """Tokenize ``text`` and keep only clean alphabetic tokens.

    The text is lowercased and stop words are removed (the given set plus
    the extra token "br"); tokens containing non-letters and tokens of a
    single character are then dropped.

    Args:
        text: raw text to tokenize.
        stopwords: lowercase tokens to discard.

    Returns:
        tokens: the surviving tokens, in their original order
    """
    extended_stopwords = stopwords | {"br"}
    result = []
    for token in lower_stop(text=text, stopwords=extended_stopwords):
        if len(token) > 1 and token.isalpha():
            result.append(token)
    return result

In [26]:
# Step 3: lowercase + stop-word removal + alphabetic-only filter on every training review.
X_train_tokenized = [lower_stop_only_alpha(text=x, stopwords=stopwords) for x in tqdm(X_train)]

  0%|          | 0/25000 [00:00<?, ?it/s]

In [27]:
# Rebuild the vocabulary from the alphabetic-only tokens.
d = Dictionary(X_train_tokenized)

In [28]:
# Print the 20 most frequent alphabetic tokens and export the table as an SVG file.
console = Console(record=True, width=120)
t = create_table(d, 20)
console.print(t)
console.save_svg("alpha_vocab.svg", title="Vocabulary")

## 4th Preprocessing - Stemming

* Reduces the vocabulary by reducing plurals, conjugations to their root form
* "making" and "makes" are transformed into "make" (irregular forms such as "made" are left unchanged)
* "cats", "cat" transform into "cat"
* **but** "movie" is transformed into "movi"

In [29]:
from nltk.stem import PorterStemmer


In [30]:
porter = PorterStemmer()

In [31]:
porter.stem("cats")

'cat'

In [32]:
porter.stem("making")

'make'

In [33]:
porter.stem("movie")

'movi'

In [34]:
def lower_stop_only_alpha_stem(text: str, stopwords: set[str]) -> list[str]:
    """Tokenize and preprocess the text, then stem every token.

    Applies ``lower_stop_only_alpha`` (lowercase, stop-word removal,
    alphabetic tokens longer than one character) and reduces each remaining
    token with the Porter stemmer.

    Args:
        text: raw text to tokenize.
        stopwords: lowercase tokens to discard; must be a set because it is
            combined with another set by ``lower_stop_only_alpha``.

    Returns:
        tokens: a list of stemmed tokens
    """
    tokens = lower_stop_only_alpha(text, stopwords)
    return [porter.stem(x) for x in tokens]

In [35]:
# Step 4: previous filters + Porter stemming on every training review.
X_train_tokenized = [lower_stop_only_alpha_stem(text=x, stopwords=stopwords) for x in tqdm(X_train)]

  0%|          | 0/25000 [00:00<?, ?it/s]

In [36]:
# Rebuild the vocabulary from the stemmed tokens.
d = Dictionary(X_train_tokenized)


In [37]:
# Print the 20 most frequent stems and export the table as an SVG file.
console = Console(record=True, width=120)
t = create_table(d, 20)
console.print(t)
console.save_svg("stem.svg", title="Vocabulary")

## 5th Preprocessing - NGrams

* 2-grams are made of 2 consecutive tokens in the text
* "the cat is blue" has three 2-grams: `"the cat"`, `"cat is"`, `"is blue"`

In [38]:
from more_itertools import windowed

In [39]:
def get_ngrams(tokens: list[str], n: int) -> list[str]:
    """Return the n-grams of ``tokens`` as space-joined strings.

    An n-gram is a run of ``n`` consecutive tokens. When there are fewer
    than ``n`` tokens no n-gram exists and an empty list is returned;
    a padded sliding window would instead yield a window containing
    ``None``, which cannot be joined into a string.

    Args:
        tokens: the tokens of one document, in order.
        n: size of each n-gram; must be at least 1.

    Returns:
        The n-grams in order of appearance, each joined with single spaces.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be >= 1")
    tokens = list(tokens)  # accept any iterable, as the sliding-window version did
    # The range is empty when len(tokens) < n, so short documents give [].
    return [" ".join(tokens[start:start + n]) for start in range(len(tokens) - n + 1)]

In [40]:
# No stop words are removed here, so the n-gram examples keep every token.
tokens = lower_stop(text="I live in new york city", stopwords=set())
print(tokens)

['i', 'live', 'in', 'new', 'york', 'city']


In [41]:
print(get_ngrams(tokens, n=2))

['i live', 'live in', 'in new', 'new york', 'york city']


In [42]:
print(get_ngrams(tokens, n=3))

['i live in', 'live in new', 'in new york', 'new york city']


In [43]:
def lower_stop_only_alpha_stem_ngrams(text: str, stopwords: set[str], ngrams: int) -> list[str]:
    """Tokenize and preprocess the text, then build n-grams from the stems.

    Applies ``lower_stop_only_alpha_stem`` and combines the resulting stems
    into n-grams with ``get_ngrams``.

    Args:
        text: raw text to tokenize.
        stopwords: lowercase tokens to discard; must be a set because it is
            combined with another set further down the preprocessing chain.
        ngrams: number of consecutive stems per n-gram.

    Returns:
        n_grams: a list of space-joined n-grams
    """
    tokens = lower_stop_only_alpha_stem(text, stopwords)
    return get_ngrams(tokens, ngrams)


In [44]:
lower_stop_only_alpha_stem_ngrams("I live in the middle of new york city.", stopwords=stopwords, ngrams=3)

['live middl new', 'middl new york', 'new york citi']

In [45]:
# Step 5: previous filters + stemming, then 2-grams of the stems for every training review.
X_train_tokenized = [lower_stop_only_alpha_stem_ngrams(x, stopwords, 2) for x in tqdm(X_train)]

  0%|          | 0/25000 [00:00<?, ?it/s]

In [46]:
# Rebuild the vocabulary; each entry is now a 2-gram string.
d = Dictionary(X_train_tokenized)

In [47]:
# Print the 20 most frequent 2-grams and export the table as an SVG file.
console = Console(record=True, width=120)
t = create_table(d, 20)
console.print(t)
console.save_svg("bigrams.svg", title="Vocabulary")

In [48]:
# Same table, extended to the 200 most frequent 2-grams.
t = create_table(d, 200)
console.print(t)

# Machine Learning

* Turn the 2-step process into a Pipeline
* Hyperparameters:
  * `C` of logistic regression
  * `analyzer` of vectorizer, to select the text pre-processing
  * `ngram_range` of vectorizer, to select ngrams
* Use GridSearchCV to identify the best hyperparameters

In [49]:
import nltk
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [50]:
import numpy as np

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

In [51]:
# English stop-word set and Porter stemmer shared by stop() and stop_stem() below.
stop_words = set(stopwords.words("english"))
porter = PorterStemmer()

def stop(text: str) -> list[str]:
    """Tokenize ``text``, lowercase the tokens and drop English stop words.

    The tokens are lowercased here because CountVectorizer skips its own
    ``lowercase`` preprocessing when a callable is passed as ``analyzer``,
    and the stop-word list is lowercase: without this, capitalized stop
    words such as "The" would be kept.

    Args:
        text: raw document text.

    Returns:
        The lowercased tokens that are not in ``stop_words``.
    """
    tokens = (token.lower() for token in word_tokenize(text))
    return [x for x in tokens if x not in stop_words]

def stop_stem(text: str) -> list[str]:
    """Tokenize ``text`` and drop stop words via ``stop``, then Porter-stem what is left."""
    stems = []
    for token in stop(text):
        stems.append(porter.stem(token))
    return stems

In [52]:
# The preprocessing callables are searched over as `tokenizer`, not `analyzer`.
# A callable analyzer replaces the vectorizer's whole analysis step, so
# `lowercase=True` and `ngram_range` would be silently ignored and every
# ngram_range value in the grid would fit an identical model. A tokenizer
# only replaces the token-splitting step, so both settings take effect.
pipe = make_pipeline(
    # token_pattern=None: the default pattern is unused once a tokenizer is
    # supplied, and None silences the corresponding warning.
    CountVectorizer(lowercase=True, token_pattern=None),
    LogisticRegression(max_iter=10000),
)

param_grid = {
    "logisticregression__C": np.logspace(-2, 2, 5),
    "countvectorizer__ngram_range": [(1, 1), (1, 2)],
    "countvectorizer__tokenizer": [stop, stop_stem],
}

grid = GridSearchCV(pipe, param_grid=param_grid, verbose=4)

In [None]:
grid.fit(X_train, y_train)