In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords') # Downloading stopwords corpus if not present
nltk.download('punkt')

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the articles workbook into a DataFrame; the location is kept in one
# named constant so it is easy to spot and change.
DATA_PATH = '/content/Assignment.xlsx'
txt_data = pd.read_excel(DATA_PATH)

In [None]:
# Preview the first five rows of the loaded data
txt_data.head(n=5)

Unnamed: 0,Article
0,"Retailers, the makers of foods marketed for we..."
1,"Move over, Ozempic — there’s a new drug in tow..."
2,Sept 14 (Reuters) - Bristol Myers Squibb (BMY....
3,Austin Wolcott was 18 years old and pretty sur...
4,"Cancer, often referred to as the “emperor of a..."


In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple
txt_data.shape

(25, 1)

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame
txt_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Article  25 non-null     object
dtypes: object(1)
memory usage: 328.0+ bytes


In [None]:
_EN_STOP_WORDS = None  # filled lazily by _english_stop_words() on first use


def _english_stop_words():
  """Return the NLTK English stop-word set, building it only once."""
  global _EN_STOP_WORDS
  if _EN_STOP_WORDS is None:
    _EN_STOP_WORDS = frozenset(stopwords.words('english'))
  return _EN_STOP_WORDS


def cln_up_articles(txt_data):
  """Normalise one article into a space-joined string of content tokens.

  The text has every character that is neither a word character nor
  whitespace removed, is lower-cased, tokenized with ``nltk.word_tokenize``
  and finally stripped of English stop words.

  Args:
    txt_data: Raw article text. Missing values (``None`` / NaN, e.g. an empty
      spreadsheet cell) are treated as an empty document; any other
      non-string value is converted with ``str`` first.

  Returns:
    The remaining tokens joined by single spaces ('' for missing input).
  """
  if not isinstance(txt_data, str):
    # re.sub would raise TypeError on NaN/None, so short-circuit missing values.
    if pd.api.types.is_scalar(txt_data) and pd.isna(txt_data):
      return ''
    txt_data = str(txt_data)

  txt = re.sub(r'[^\w\s]', '', txt_data).lower()
  stp_wrds = _english_stop_words()
  return ' '.join(token for token in nltk.word_tokenize(txt) if token not in stp_wrds)

In [None]:
# Store a cleaned copy of every article next to the raw text
txt_data['Cln_Articles'] = txt_data['Article'].apply(cln_up_articles)

In [None]:
# Preview the cleaned text
txt_data['Cln_Articles'].head()

0    retailers makers foods marketed weight loss ty...
1    move ozempic theres new drug town eli lillys z...
2    sept 14 reuters bristol myers squibb bmyn said...
3    austin wolcott 18 years old pretty sure wouldn...
4    cancer often referred emperor maladies unyield...
Name: Cln_Articles, dtype: object

In [None]:
# Dependency: vaderSentiment. If it is missing, run `%pip install vaderSentiment` before the imports cell.



In [None]:
# One VADER analyzer instance, reused by senti_class for every text it scores
sia = SentimentIntensityAnalyzer()

In [None]:
def senti_class(txt_data, pos_threshold=0.5):
  """Label a text 'positive' or 'negative' from its VADER compound score.

  Args:
    txt_data: Text to score with the notebook-level ``sia`` analyzer.
    pos_threshold: Smallest compound score that is labelled 'positive'.
      Defaults to 0.5, the cut-off this notebook has always used. Note that
      this is stricter than the +/-0.05 cut-offs suggested in the VADER
      README, so pass a different value to experiment.

  Returns:
    'positive' when the compound score is >= ``pos_threshold``, otherwise
    'negative'. There is no neutral class: neutral and mildly positive texts
    are labelled 'negative'.
  """
  comp_score = sia.polarity_scores(txt_data)['compound']
  return 'positive' if comp_score >= pos_threshold else 'negative'

In [None]:
# Score the raw article text rather than 'Cln_Articles': VADER relies on
# negations ("not", "no"), intensifiers ("very"), capitalisation and
# punctuation, all of which cln_up_articles strips out before this point.
txt_data['mood_checker'] = txt_data['Article'].apply(senti_class)

In [None]:
# Class balance of the VADER-derived labels
txt_data['mood_checker'].value_counts()

mood_checker
positive    16
negative     9
Name: count, dtype: int64

In [None]:
## Optional: uncomment the loop below to print the VADER compound score of each cleaned article


# for index, row in txt_data.iterrows():
#   text = row['Cln_Articles']
#   scores = sia.polarity_scores(text)
#   comp_scores = scores['compound']
#   print(f'The final score for text at {index} is {comp_scores}')

In [None]:
# Build TF-IDF features from the cleaned articles; X is the feature matrix
# consumed by the KMeans / silhouette cells that follow.
vect = TfidfVectorizer()
X = vect.fit_transform(txt_data['Cln_Articles'])

In [None]:
def _silhouette_for(n_clusters):
  """Fit KMeans (random_state=42) on X and return the silhouette score."""
  assignments = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(X)
  return silhouette_score(X, assignments)


# Score every candidate cluster count from 2 to 10 inclusive.
k_candidates = range(2, 11)
silhouette_scores = [_silhouette_for(candidate) for candidate in k_candidates]

# Pick the candidate sitting at the position of the highest score
# (the first one when several candidates tie).
best_position = silhouette_scores.index(max(silhouette_scores))
optimal_k = k_candidates[best_position]
print(f"Optimal number of clusters (k): {optimal_k}")

Optimal number of clusters (k): 9


In [None]:
# Fit the final model with the k selected by the silhouette search instead of
# a hard-coded 9, and with the same random_state (42) that the search used, so
# the clustering kept here is exactly the one whose silhouette score was best.
KM = KMeans(n_clusters=optimal_k, random_state=42)
cluster = KM.fit_predict(X)

In [None]:
# Attach each article's cluster id to the frame, then print the assignments
txt_data['Cluster'] = cluster

print("Cluster assignments:")
print(txt_data.loc[:, ['Cln_Articles', 'Cluster']])

Cluster assignments:
                                         Cln_Articles  Cluster
0   retailers makers foods marketed weight loss ty...        7
1   move ozempic theres new drug town eli lillys z...        7
2   sept 14 reuters bristol myers squibb bmyn said...        4
3   austin wolcott 18 years old pretty sure wouldn...        4
4   cancer often referred emperor maladies unyield...        4
5   nov 28 reuters us food drug administration fda...        4
6   nov 21 reuters beigene 6160hk said tuesday ent...        3
7   sept 19 reuters drugmaker beigene 6160hk said ...        3
8   brukinsa first btk inhibitor approved follicul...        4
9   whether youre looking quick bite eat sitdown e...        5
10  federal judge new york dismissed lawsuit accus...        8
11  future fast food delivery diners within 12mile...        5
12  yum brands topped wall street estimates thirdq...        1
13  fancy taco bells nacho fries fastfood chain tw...        6
14  taco bell serving new toasted 

In [None]:
# Preview the frame, now including the cleaned text, sentiment label and cluster id
txt_data.head()

Unnamed: 0,Article,Cln_Articles,mood_checker,Cluster
0,"Retailers, the makers of foods marketed for we...",retailers makers foods marketed weight loss ty...,positive,7
1,"Move over, Ozempic — there’s a new drug in tow...",move ozempic theres new drug town eli lillys z...,negative,7
2,Sept 14 (Reuters) - Bristol Myers Squibb (BMY....,sept 14 reuters bristol myers squibb bmyn said...,negative,4
3,Austin Wolcott was 18 years old and pretty sur...,austin wolcott 18 years old pretty sure wouldn...,negative,4
4,"Cancer, often referred to as the “emperor of a...",cancer often referred emperor maladies unyield...,negative,4


In [None]:
# Dependency: transformers. If it is missing, run `%pip install transformers` before the imports cell.



In [None]:
LABEL_IDS = {'positive': 1, 'negative': 0}


def _tokenize_texts(texts, bert_tokenizer):
  """Tokenize a Series of texts and return a plain dict of NumPy arrays."""
  batch = bert_tokenizer(
      texts.tolist(),
      padding=True,
      truncation=True,
      max_length=128,
      return_tensors="tf",
  )
  # Unwrap the BatchEncoding into an ordinary dict of arrays.
  return {key: val.numpy() for key, val in batch.items()}


def _one_hot_labels(labels):
  """Map 'positive'/'negative' to 1/0 and one-hot encode over two classes."""
  return tf.keras.utils.to_categorical(labels.map(LABEL_IDS), num_classes=2)


# Hold out 20% of the articles for validation (shuffled, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    txt_data['Cln_Articles'],
    txt_data['mood_checker'],
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Encode the text of both splits with the same pretrained tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
X_train_encoded = _tokenize_texts(X_train, tokenizer)
X_val_encoded = _tokenize_texts(X_val, tokenizer)

# Encode the labels of both splits.
y_train_encoded = _one_hot_labels(y_train)
y_val_encoded = _one_hot_labels(y_val)

In [None]:
# Sanity check: list the distinct values present in the one-hot validation labels
print(np.unique(y_val_encoded))

[0. 1.]


In [None]:
# Two output classes ('negative' / 'positive'), matching the two-column
# one-hot labels; stated explicitly rather than relying on the default.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Fine-tuning BERT needs a small learning rate. The string 'adam' would use
# Keras' default of 1e-3, far above the 2e-5 to 5e-5 range recommended for
# BERT fine-tuning, which typically wrecks the pretrained weights.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
# Labels are one-hot and the classification head returns raw logits.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
metrics = ['accuracy']

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
model.summary()

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_189 (Dropout)       multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# The training split holds about 20 articles (80% of 25), so a batch size of
# 32 would give a single weight update per epoch; 8 gives several updates.
# Validation metrics are reported after every epoch so over-fitting on this
# tiny dataset is visible during training.
EPOCHS = 3
BATCH_SIZE = 8

fit = model.fit(
    x=X_train_encoded,
    y=y_train_encoded,
    validation_data=(X_val_encoded, y_val_encoded),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

Epoch 2/3
Epoch 3/3


In [None]:
# Report loss and accuracy on the held-out split, then on the training split.
# The training metrics get their own names so that `loss` (the Keras loss
# object handed to model.compile) is not overwritten by a float.
val_loss, val_accuracy = model.evaluate(X_val_encoded, y_val_encoded)
print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy*100}%')

train_loss, train_accuracy = model.evaluate(X_train_encoded, y_train_encoded)
print(f'Train Loss: {train_loss}, Train Accuracy: {train_accuracy*100}%')