<a href="https://colab.research.google.com/github/HassAlli/nlp_practice/blob/main/TextCorpus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building a Text Corpus

In [None]:
corpus = [
    "The cat is sitting on the mat.",
    "She plays the piano beautifully.",
    "The sun sets in the west.",
    "Football is a popular sport worldwide.",
    "I enjoy reading books in my free time.",
    "The conference will be held next week.",
    "The company announced its new product launch.",
    "He went for a jog in the morning.",
    "Football is a popular sport worldwide."
]

labels = [
    "animal",
    "weather",
    "food",
    "animal",
    "weather",
    "food",
    "animal",
    "weather",
    "sports"
]

In [None]:
import numpy as np
import pandas as pd

corpus = np.array(corpus)
corpus_df = pd.DataFrame({
    'Document': corpus,
    'Category': labels
})

corpus_df = corpus_df[['Document', 'Category']]

corpus_df

# Preprocessing the Text Corpus


In [None]:
import nltk
import re
nltk.download('stopwords')

In [None]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')


def normalize_document(doc):
  # lower case and remove special characters & whitescpaces
  doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
  doc = doc.lower()
  doc = doc.strip() # remove any leading or trailing whitespace
  # tekonize document
  tokens = wpt.tokenize(doc)
  # filter stopwords out of the document
  filtered_tokens = [token for token in tokens if token not in stop_words]
  # re-create document from filtered tokens
  doc = ' '.join(filtered_tokens)

  return doc

normalize_corpus = np.vectorize(normalize_document)

In [None]:
norm_docs = normalize_corpus(corpus)
print(len(' '.join(norm_docs).split(' ')))
pd.DataFrame(norm_docs)

34


Unnamed: 0,0
0,cat sitting mat
1,plays piano beautifully
2,sun sets west
3,football popular sport worldwide
4,enjoy reading books free time
5,conference held next week
6,company announced new product launch
7,went jog morning
8,football popular sport worldwide


# Bag of Words Model

In [55]:
from sklearn.feature_extraction.text import CountVectorizer

# get bag of words features in sparse format
cv = CountVectorizer(min_df=0, max_df=1)
cv_matrix = cv.fit_transform(norm_docs)
cv_matrix = cv_matrix.toarray()
# get all unique words in the document/corpus
vocab = cv.get_feature_names_out()
# show document feature vector
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,announced,beautifully,books,cat,company,conference,enjoy,free,held,jog,...,plays,product,reading,sets,sitting,sun,time,week,went,west
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,1,1,0,0,...,0,0,1,0,0,0,1,0,0,0
5,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
6,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Bag of N-Grams Model

In [None]:
# set the n-gram range to 1,2
bv = CountVectorizer(ngram_range=(2,2))
bv_matrix = bv.fit_transform(norm_docs)
bv_matrix = bv_matrix.toarray()

vocab = bv.get_feature_names_out()
pd.DataFrame(bv_matrix, columns=vocab)

Unnamed: 0,announced new,books free,cat sitting,company announced,conference held,enjoy reading,football popular,free time,held next,jog morning,...,piano beautifully,plays piano,popular sport,product launch,reading books,sets west,sitting mat,sport worldwide,sun sets,went jog
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,0,1,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
8,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0


# TF-IDF Transformer

In [56]:
from sklearn.feature_extraction.text import TfidfTransformer

tt = TfidfTransformer(use_idf=True)
tt_matrix = tt.fit_transform(cv_matrix)

tt_matrix = tt_matrix.toarray()
vocab = cv.get_feature_names_out()
pd.DataFrame(np.round(tt_matrix, 2), columns=vocab)

Unnamed: 0,announced,beautifully,books,cat,company,conference,enjoy,free,held,jog,...,plays,product,reading,sets,sitting,sun,time,week,went,west
0,0.0,0.0,0.0,0.58,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.58,0.0,0.0,0.0,0.0,0.0
1,0.0,0.58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.58,0.0,0.58,0.0,0.0,0.0,0.58
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.45,0.0,0.0,0.0,0.45,0.45,0.0,0.0,...,0.0,0.0,0.45,0.0,0.0,0.0,0.45,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0
6,0.45,0.0,0.0,0.0,0.45,0.0,0.0,0.0,0.0,0.0,...,0.0,0.45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.58,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.58,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# TF-IDF Vectorizer

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0, max_df=1, use_idf=True)
tv_matrix = tv.fit_transform(norm_docs)

tv_matrix = tv_matrix.toarray()
vocab = tv.get_feature_names_out()
pd.DataFrame(np.round(tt_matrix, 2), columns=vocab)

Unnamed: 0,announced,beautifully,books,cat,company,conference,enjoy,free,held,jog,...,plays,product,reading,sets,sitting,sun,time,week,went,west
0,0.0,0.0,0.0,0.58,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.58,0.0,0.0,0.0,0.0,0.0
1,0.0,0.58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.58,0.0,0.58,0.0,0.0,0.0,0.58
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.45,0.0,0.0,0.0,0.45,0.45,0.0,0.0,...,0.0,0.0,0.45,0.0,0.0,0.0,0.45,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0
6,0.45,0.0,0.0,0.0,0.45,0.0,0.0,0.0,0.0,0.0,...,0.0,0.45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.58,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.58,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
