In [29]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from transformers import BertModel, BertTokenizer
from transformers import BertTokenizer
import nltk
import torch

In [6]:
df = pd.read_csv("bbc_articles.csv")
df['text'] = df['title'] + ". " + df['brief']

In [9]:
# Noise Removal
def remove_noise(text):
    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)
    # Remove special characters except periods and commas
    text = re.sub(r'[^a-zA-Z0-9\s,.]', '', text)
    # Convert to lowercase
    return text.lower()

df['cleaned_text'] = df['text'].apply(remove_noise)
df['cleaned_text']

0       uk considering recognising palestine state, lo...
1       ny police probe alleged attack on propalestine...
2       propalestine group targets twickenham stadium ...
3       palestine action says it vandalised haulage fi...
4       israelgaza london school warns it may close ov...
                              ...                        
1492    ks3  ks4  gcse history un partition plan for p...
1493    two men admit verbally abusing propalestine pr...
1494    palestine action arms factory rooftop proteste...
1495    birmingham colmore row building targeted by pr...
1496    tories suspend oldham councillors who went on ...
Name: cleaned_text, Length: 1497, dtype: object

In [16]:
# Stemming and Lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_lemma_stop(text):
    tokens = word_tokenize(text)
    stemmed = [stemmer.stem(token) for token in tokens]
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
    stop_words_set = set(stopwords.words('english'))
    no_stop_words_lemma = [word for word in lemmatized if word not in stop_words_set]
    no_stop_words_stem = [word for word in stemmed if word not in stop_words_set]
    return (' '.join(no_stop_words_lemma),' '.join(no_stop_words_stem))

df[['normalized_text_lemma', 'normalized_text_stem']] = df['cleaned_text'].apply(stem_lemma_stop).apply(pd.Series)
print(df['normalized_text_lemma'],"\n\n\n",df['normalized_text_stem'])

0       uk considering recognising palestine state , l...
1       ny police probe alleged attack propalestine ma...
2       propalestine group target twickenham stadium r...
3       palestine action say vandalised haulage firm ....
4       israelgaza london school warns may close pales...
                              ...                        
1492    ks3 ks4 gcse history un partition plan palesti...
1493    two men admit verbally abusing propalestine pr...
1494    palestine action arm factory rooftop protester...
1495    birmingham colmore row building targeted propa...
1496    tory suspend oldham councillor went propalesti...
Name: normalized_text_lemma, Length: 1497, dtype: object 


 0       uk consid recognis palestin state , lord camer...
1       ny polic probe alleg attack propalestin march ...
2       propalestin group target twickenham stadium re...
3       palestin action say vandalis haulag firm . red...
4       israelgaza london school warn may close palest...
           

In [17]:
#we'll be going with lemmatized text instead of stemmed text as lemmatization clumps similar text to a single value

# N-grams
def extract_ngrams(text, num):
    n_grams = ngrams(word_tokenize(text), num)
    return [' '.join(grams) for grams in n_grams]

# Combine all texts to a single corpus for n-gram analysis
corpus = ' '.join(df['normalized_text_lemma'].tolist())
# Extracting N-grams
unigrams = extract_ngrams(corpus, 1)
bigrams = extract_ngrams(corpus, 2)
trigrams = extract_ngrams(corpus, 3)

In [19]:
# Frequency analysis
def get_top_ngrams(ngram_list, n=10):
    freq_dist = nltk.FreqDist(ngram_list)
    return freq_dist.most_common(n)

print("Top 10 Unigrams:", get_top_ngrams(unigrams),"\n")
print("Top 10 Bigrams:", get_top_ngrams(bigrams),"\n")
print("Top 10 Trigrams:", get_top_ngrams(trigrams))

Top 10 Unigrams: [('.', 3063), ('palestine', 1245), (',', 1110), ('propalestine', 627), ('say', 526), ('action', 494), ('state', 397), ('protester', 379), ('london', 378), ('israel', 352)] 

Top 10 Bigrams: [('. palestine', 396), ('palestine action', 370), ('israel .', 257), ('palestine state', 248), ('say .', 247), ('propalestine march', 245), ('red paint', 245), ('partition plan', 245), ('march .', 244), ('propalestine group', 244)] 

Top 10 Trigrams: [('. palestine action', 368), ('propalestine march .', 244), ('un partition plan', 244), ('partition plan palestine', 244), ('recognise palestine state', 125), ('recognising palestine state', 123), ('red paint .', 123), ('. ks3 ks4', 123), ('ks3 ks4 gcse', 123), ('ks4 gcse history', 123)]


In [23]:
# Word-level Tokenization
df['word_tokens'] = df['normalized_text'].apply(word_tokenize)
# Sentence-level Tokenization
df['sentence_tokens'] = df['text'].apply(sent_tokenize)

print(df['word_tokens'],"\n\n\n",df['sentence_tokens'])

0       [uk, considering, recognising, palestine, stat...
1       [ny, police, probe, alleged, attack, propalest...
2       [propalestine, group, target, twickenham, stad...
3       [palestine, action, say, vandalised, haulage, ...
4       [israelgaza, london, school, warns, may, close...
                              ...                        
1492    [ks3, ks4, gcse, history, un, partition, plan,...
1493    [two, men, admit, verbally, abusing, propalest...
1494    [palestine, action, arm, factory, rooftop, pro...
1495    [birmingham, colmore, row, building, targeted,...
1496    [tory, suspend, oldham, councillor, went, prop...
Name: word_tokens, Length: 1497, dtype: object 


 0       [UK considering recognising Palestine state, L...
1       [NY police probe alleged attack on pro-Palesti...
2       [Pro-Palestine group targets Twickenham Stadiu...
3       [Palestine Action says it vandalised haulage f...
4       [Israel-Gaza: London school warns it may close...
                     

In [28]:
# Modern NLP techniques (BERT etc)

# Subword Tokenization

# tokenize a piece of text using BERT tokenizer
def tokenize_text(text):
    return tokenizer.tokenize(text)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# BERT tokenize the text
df['subword_tokens_bert'] = df['text'].apply(tokenize_text)
df['subword_tokens_bert']

0       [uk, considering, rec, ##og, ##nis, ##ing, pal...
1       [ny, police, probe, alleged, attack, on, pro, ...
2       [pro, -, palestine, group, targets, t, ##wick,...
3       [palestine, action, says, it, van, ##dal, ##is...
4       [israel, -, gaza, :, london, school, warns, it...
                              ...                        
1492    [ks, ##3, /, ks, ##4, /, g, ##cs, ##e, history...
1493    [two, men, admit, verbal, ##ly, abu, ##sing, p...
1494    [palestine, action, arms, factory, rooftop, pr...
1495    [birmingham, :, col, ##more, row, building, ta...
1496    [tori, ##es, suspend, oldham, councillors, who...
Name: subword_tokens_bert, Length: 1497, dtype: object

In [31]:
# Contextualized Tokenization

#basically generating vector embeddings for the text corpus

# BERT model and tokenizer
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# encode text and get contextualized embeddings
def get_contextualized_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    # Get embeddings
    embeddings = outputs.last_hidden_state
    # Convert embeddings to np array
    return embeddings.cpu().numpy()

# Get embeddings
df['contextualized_embeddings'] = df['text'].apply(get_contextualized_embeddings)
df['contextualized_embeddings']

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0       [[[-0.85742253, 0.18792677, -0.24154675, -0.40...
1       [[[-0.5532259, -0.15368998, -0.14605772, -0.36...
2       [[[-1.0767837, -0.076922745, -0.015406564, -0....
3       [[[-0.58144146, -0.2473306, -0.3095854, -0.676...
4       [[[-0.50207776, -0.34874398, -0.083566695, -0....
                              ...                        
1492    [[[-0.4664716, -0.063210584, -0.670615, -0.185...
1493    [[[-0.82238173, -0.20111781, -0.26917163, -1.0...
1494    [[[-0.5953143, -0.24393675, 0.16158737, -0.611...
1495    [[[-0.5854618, -0.2701547, -0.20922962, -0.624...
1496    [[[-0.8234787, 0.06940951, 0.10707852, -0.6584...
Name: contextualized_embeddings, Length: 1497, dtype: object

In [32]:
# This tokenization generates word embeddings in a vector format, keeping the context of the whole corpus in view.
# This preprocessing step is useful for tasks such as feeding this data into ML/DL models for performing downstream
# tasks such as text classification, sentiment classification, clustering, or anything related to text/NLP.