<a href="https://colab.research.google.com/github/Jiaye39/TimeSeriesAnalysis/blob/main/Pre_processing_5steps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre_processing_5steps

In [None]:
import requests

# Download the IMDB review archive and keep the raw bytes in memory.
# The timeout prevents the cell from hanging forever on a stalled connection.
r = requests.get("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", timeout=60)
# Fail here on an HTTP error instead of handing an error page to tarfile later.
r.raise_for_status()
imdb_tgz = r.content

In [None]:
import io
import re
import tarfile

from tqdm.notebook import tqdm

# Only the labelled reviews are extracted: text files under
# aclImdb/{test,train}/{pos,neg}/.
good_file = re.compile(r"^aclImdb/(test|train)/(pos|neg)/.*\.txt$")

# Use tarfile's "data" extraction filter when this Python version provides it,
# so that archive members cannot be written outside the working directory.
extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

# Read the archive from the bytes downloaded earlier (no second download).
with tarfile.open(fileobj=io.BytesIO(imdb_tgz), mode="r:gz") as tgz:
    all_members = tgz.getmembers()
    data_files = [
        member
        for member in all_members
        if member.isfile() and good_file.match(member.name) is not None
    ]
    for f in tqdm(data_files):
        tgz.extract(f, **extract_kwargs)

In [None]:
from sklearn.datasets import load_files
# Load the extracted train and test reviews, decoding every file as UTF-8.
train_data = load_files("./aclImdb/train", encoding="utf-8")
test_data = load_files("./aclImdb/test", encoding="utf-8")

In [None]:
# Two-way mapping between integer class ids and class names.
label2txt = dict(enumerate(train_data.target_names))
txt2label = {name: idx for idx, name in label2txt.items()}

In [None]:
# Show which container type `load_files` returned for the training split.
type(train_data)

In [None]:
# Expose the texts and labels of each split under the usual X / y names.
X_train = train_data.data
y_train = train_data.target
X_test = test_data.data
y_test = test_data.target

# The data
* `X_train` and `X_test` are `list`s of 25000 `str` texts
* `y_train` and `y_test` are NumPy arrays of 25000 `int` labels, either `0` (negative review) or `1` (positive review)


In [None]:
import numpy as np

# Count how many reviews fall in each class, separately for each split.
print("TRAIN data:")
print("class balance: ", np.bincount(y_train))
print()
print("TEST data:")
# The test section must count the test labels, not the training labels again.
print("class balance: ", np.bincount(y_test))



In [None]:
!pip install eli5

In [None]:
import nltk
# Fetch the NLTK "punkt" and "punkt_tab" resources;
# presumably required by `word_tokenize`, which is used below — TODO confirm.
nltk.download("punkt")
nltk.download('punkt_tab')

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
import nltk
# Fetch the NLTK stop word lists, read later via `stopwords.words("english")`.
nltk.download("stopwords")

!pip install gensim
from gensim.corpora import Dictionary

In [None]:
import rich

from rich.console import Console
from rich.table import Table

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocessing

In [None]:
def create_table(dictionary: Dictionary, n: int = 10) -> Table:
    """Build a table of the ``n`` most frequent tokens in ``dictionary``.

    Args:
        dictionary: the dictionary whose token statistics are reported.
        n: how many of the most frequent tokens to list.

    Returns:
        A table with one row per token: corpus frequency, share of all
        tokens, document frequency and share of all documents.
    """
    # Every statistic is read from the argument, never from a global, so the
    # table always describes the dictionary that was actually passed in.
    num_docs = dictionary.num_docs
    num_tokens = dictionary.num_pos

    table = Table(
        title=f"Top-{n} Most Frequent Tokens ({num_docs} documents, {num_tokens} tokens, {len(dictionary)} words in dictionary)"
    )

    table.add_column("Token", justify="left", no_wrap=True)
    table.add_column("Corpus Frequency", justify="right")
    table.add_column("% of Tokens", justify="right")
    table.add_column("Document Frequency", justify="right")
    table.add_column("% of Documents", justify="right")

    for token, frequency in dictionary.most_common(n=n):
        percent_tokens = frequency / num_tokens
        # `dfs` is keyed by token id, so translate the token first.
        doc_frequency = dictionary.dfs[dictionary.token2id[token]]
        percent_doc = doc_frequency / num_docs
        table.add_row(token, str(frequency), f"{percent_tokens:.1%}", str(doc_frequency), f"{percent_doc:.1%}")

    return table

## 1st Pre-Processing: Lowercase

In [None]:
def lowercase(text: str) -> list[str]:
    """Split the text into tokens and lowercase each of them.

    Returns:
        tokens: the lowercased tokens, in their original order
    """
    lowered = []
    for token in word_tokenize(text):
        lowered.append(token.lower())
    return lowered

In [17]:
# Tokenize and lowercase every training review; tqdm shows the progress.
X_train_tokenized = [lowercase(review) for review in tqdm(X_train)]

In [18]:
# Build the vocabulary statistics over the lowercased training tokens.
d = Dictionary(X_train_tokenized)

In [19]:
# Print the 20 most frequent tokens, centred in an 80-column recording console,
# then export what was printed as an SVG file.
console = Console(width=80, record=True)
t = create_table(dictionary=d, n=20)
console.print(t, justify="center")
console.save_svg("all_vocab.svg", title="Vocabulary")

## 2nd Preprocessing: lowercase + stopping

In [20]:
# Make sure the NLTK stop word lists are available before they are read.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Import the corpus reader under an alias, so that binding the name
# `stopwords` to the word set does not overwrite the imported reader.
from nltk.corpus import stopwords as nltk_stopwords

# A set gives fast membership tests when filtering tokens.
stopwords = set(nltk_stopwords.words("english"))

In [None]:
# Display the stop word set for inspection.
print(stopwords)

In [None]:
def lower_stop(text: str, stopwords: set[str]) -> list[str]:
    """Tokenize the text, lowercase it and filter out stopwords.

    Tokens are produced by `lowercase`; every token contained in
    `stopwords` is dropped and the order of the rest is kept.

    Returns:
        tokens: the remaining tokens
    """
    kept = []
    for token in lowercase(text):
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [None]:
# Re-tokenize the training reviews, this time also dropping stopwords.
X_train_tokenized = [lower_stop(review, stopwords) for review in tqdm(X_train)]

In [None]:
# Rebuild the vocabulary from the stopword-free tokens and print the
# 20 most frequent ones in a 120-column recording console.
d = Dictionary(X_train_tokenized)
console = Console(width=120, record=True)
t = create_table(dictionary=d, n=20)
console.print(t)


## 3rd Preprocessing: accept only alphabetical tokens

In [None]:
def lower_stop_only_alpha(text: str, stopwords: set[str]) -> list[str]:
    """Tokenize and preprocess the text.

    Normalize the text to lowercase, remove stopwords, remove tokens
    that are not entirely made of letters, remove tokens with only 1 character.

    Args:
        text: the raw text
        stopwords: tokens to discard; a set is expected, but any iterable
            of strings is accepted

    Returns:
        tokens: a list of tokens
    """
    # "br" is always filtered as well (presumably left over from HTML line
    # breaks in the reviews). `set.union` accepts any iterable, so a list of
    # stopwords works too, and the caller's collection is never modified.
    all_stopwords = {"br"}.union(stopwords)
    tokens = lower_stop(text=text, stopwords=all_stopwords)
    return [x for x in tokens if x.isalpha() and len(x) > 1]

In [None]:
# Re-tokenize the training reviews, keeping only alphabetical tokens.
X_train_tokenized = [lower_stop_only_alpha(review, stopwords) for review in tqdm(X_train)]

In [None]:
# Rebuild the vocabulary statistics over the alphabetical-only tokens.
d = Dictionary(X_train_tokenized)

In [None]:
# Print the 20 most frequent alphabetical tokens and export the output as SVG.
console = Console(width=120, record=True)
t = create_table(dictionary=d, n=20)
console.print(t)
console.save_svg("alpha_vocab.svg", title="Vocabulary")

## 4th Preprocessing - Stemming

* Reduces the vocabulary by reducing plurals, conjugations to their root form
* "making" and "makes" are transformed into "make" (irregular forms such as "made" are left unchanged)
* "cats" and "cat" are both transformed into "cat"
* **but** "movie" is transformed into "movi"

In [None]:
from nltk.stem import PorterStemmer


In [None]:
# One shared stemmer instance, reused by the stemming functions below.
porter = PorterStemmer()

In [None]:
# Plural example: expected to give "cat".
porter.stem("cats")

In [None]:
# Conjugation example: expected to give "make".
porter.stem("making")

In [None]:
# Example of a stem that is not a real word: expected to give "movi".
porter.stem("movie")

In [None]:
def lower_stop_only_alpha_stem(text: str, stopwords: set[str]) -> list[str]:
    """Tokenize and preprocess the text, then stem every token.

    Applies `lower_stop_only_alpha` and reduces each remaining token
    to its stem with the module-level Porter stemmer `porter`.

    Args:
        text: the raw text
        stopwords: tokens to discard, passed on to `lower_stop_only_alpha`

    Returns:
        tokens: a list of stemmed tokens
    """
    tokens = lower_stop_only_alpha(text, stopwords)
    return [porter.stem(x) for x in tokens]

In [None]:
# Re-tokenize the training reviews with stemming added as the last step.
X_train_tokenized = [lower_stop_only_alpha_stem(review, stopwords) for review in tqdm(X_train)]

In [None]:
# Rebuild the vocabulary statistics over the stemmed tokens.
d = Dictionary(X_train_tokenized)


In [None]:
# Print the 20 most frequent stems and export the output as SVG.
console = Console(width=120, record=True)
t = create_table(dictionary=d, n=20)
console.print(t)
console.save_svg("stem.svg", title="Vocabulary")

## 5th Preprocessing - NGrams

* 2-grams are made of 2 consecutive tokens in the text
* "the cat is blue" has three 2-grams: `["the", "cat"], ["cat", "is"], ["is", "blue"]`

In [None]:
from more_itertools import windowed

In [None]:
def get_ngrams(tokens: list[str], n: int) -> list[str]:
    """Return the n-grams of ``tokens``, each joined with a single space.

    An n-gram is a run of ``n`` consecutive tokens. When there are fewer
    than ``n`` tokens there is no complete n-gram, so the result is empty
    (no padded or partial window is produced).

    Args:
        tokens: the tokens of one document, in order
        n: size of each n-gram, at least 1

    Returns:
        The n-grams in order of appearance.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be >= 1")
    tokens = list(tokens)
    # The range is empty when len(tokens) < n, which covers short documents.
    return [" ".join(tokens[start:start + n]) for start in range(len(tokens) - n + 1)]

In [None]:
# Demo sentence, tokenized and lowercased with an empty stopword collection
# so that no token is filtered out.
tokens = lower_stop(text="I live in new york city", stopwords=[])
print(tokens)

In [None]:
# Show the 2-grams of the demo tokens.
print(get_ngrams(tokens, n=2))

In [None]:
# Show the 3-grams of the demo tokens.
print(get_ngrams(tokens, n=3))

In [None]:
def lower_stop_only_alpha_stem_ngrams(text: str, stopwords: set[str], ngrams: int) -> list[str]:
    """Tokenize and preprocess the text, then combine the tokens into n-grams.

    Applies `lower_stop_only_alpha_stem` and passes the resulting tokens
    to `get_ngrams`.

    Args:
        text: the raw text
        stopwords: tokens to discard, passed on to `lower_stop_only_alpha_stem`
        ngrams: size of the n-grams to build

    Returns:
        n_grams: the n-grams returned by `get_ngrams`
    """
    tokens = lower_stop_only_alpha_stem(text, stopwords)
    return get_ngrams(tokens, ngrams)


In [None]:
# Run the complete preprocessing chain on one sentence, building 3-grams.
lower_stop_only_alpha_stem_ngrams("I live in the middle of new york city.", stopwords=stopwords, ngrams=3)

In [None]:
# Turn every training review into its list of 2-grams of stemmed tokens.
X_train_tokenized = [
    lower_stop_only_alpha_stem_ngrams(text=review, stopwords=stopwords, ngrams=2)
    for review in tqdm(X_train)
]

In [None]:
# Rebuild the vocabulary statistics, now with 2-grams as the dictionary entries.
d = Dictionary(X_train_tokenized)

In [None]:
# Print the 20 most frequent 2-grams and export the output as SVG.
console = Console(width=120, record=True)
t = create_table(dictionary=d, n=20)
console.print(t)
console.save_svg("bigrams.svg", title="Vocabulary")

In [None]:
# Same table, extended to the 200 most frequent 2-grams.
t = create_table(dictionary=d, n=200)
console.print(t)

# Machine Learning

* Turn the 2-step process into a Pipeline
* Hyperparameters:
  * `C` of logistic regression
  * `analyzer` of vectorizer, to select the text pre-processing
  * `ngram_range` of vectorizer, to select ngrams
* Use GridSearchCV to identify the best hyperparameters

In [None]:
import nltk
# Make sure the tokenizer and stop word resources are present, so this
# section can also run without the preprocessing cells above.
nltk.download("punkt")
nltk.download("stopwords")

In [None]:
import numpy as np

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

In [None]:
# English stop words as a set, plus one shared stemmer instance;
# both are read by the tokenizing functions defined below.
stop_words = set(stopwords.words("english"))
porter = PorterStemmer()

def stop(text: str) -> list[str]:
    """Tokenize the text and drop every token found in ``stop_words``.

    Tokens are compared as they come out of the tokenizer; this function
    does no case normalization of its own.
    """
    kept = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def stop_stem(text: str) -> list[str]:
    """Tokenize the text, drop stop words via `stop`, then stem each token."""
    stemmed = []
    for token in stop(text):
        stemmed.append(porter.stem(token))
    return stemmed

In [None]:
# Vectorizer followed by a classifier, tuned jointly by the grid search below.
#
# The preprocessing functions are plugged in as `tokenizer`, not `analyzer`:
# with a callable analyzer CountVectorizer skips its own lowercasing and n-gram
# generation, so `lowercase=True` and the `ngram_range` grid would have no
# effect (every configuration would be fitted twice, and capitalised stop words
# would survive). As a tokenizer, the function receives already lowercased text
# and its tokens are combined into n-grams afterwards.
# `token_pattern=None` tells the vectorizer that the default pattern is
# intentionally unused, which avoids a warning on every fit.
pipe = make_pipeline(
    CountVectorizer(lowercase=True, token_pattern=None),
    LogisticRegression(max_iter=10000),
)

param_grid = {
    "logisticregression__C": np.logspace(-2, 2, 5),
    "countvectorizer__ngram_range": [(1, 1), (1, 2)],
    "countvectorizer__tokenizer": [stop, stop_stem],
}

grid = GridSearchCV(pipe, param_grid=param_grid, verbose=4)

In [None]:
# Run the cross-validated search over `param_grid` on the raw training texts.
grid.fit(X_train, y_train)