In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
data=pd.read_csv('/content/bbc-text (1).csv')

In [5]:
data.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471 entries, 0 to 470
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  471 non-null    object
 1   text      471 non-null    object
dtypes: object(2)
memory usage: 7.5+ KB


In [7]:
data.isnull().sum()

Unnamed: 0,0
category,0
text,0


In [8]:
data.describe()

Unnamed: 0,category,text
count,471,471
unique,5,469
top,business,hague given up his pm ambition former conser...
freq,122,2


In [9]:
data['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
business,122
sport,109
politics,92
entertainment,78
tech,70


In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [11]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [12]:
stop_words=set(stopwords.words('english'))
lemmatizer=WordNetLemmatizer()

In [13]:
def clean_text(text):
  text=text.lower()
  text=re.sub(r'[^a-zA-Z]',' ',text)
  tokens=nltk.word_tokenize(text)
  tokens=[lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
  cleaned_text=" ".join(tokens)
  return cleaned_text

In [14]:
data['cleaned_text']=data['text'].apply(clean_text)

In [15]:
data.head()

Unnamed: 0,category,text,cleaned_text
0,tech,tv future in the hands of viewers with home th...,tv future hand viewer home theatre system plas...
1,business,worldcom boss left books alone former worldc...,worldcom bos left book alone former worldcom b...
2,sport,tigers wary of farrell gamble leicester say ...,tiger wary farrell gamble leicester say rushed...
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle fa cup premiership side...
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean twelve raid box office ocean twelve crim...


In [16]:
from sklearn.model_selection import train_test_split
x=data['cleaned_text']
y=data['category']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)

In [17]:
print(x_train.shape)

(376,)


In [18]:
print(y_test.shape)

(95,)


In [19]:
x_train.head()

Unnamed: 0,cleaned_text
168,blair dismisses quit claim report tony blair d...
242,china top trader japan china overtook u become...
199,card fraudsters targeting web new safeguard cr...
60,telegraph newspaper axe job daily sunday teleg...
185,almagro continues spanish surge unseeded nicol...


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(max_features=5000,ngram_range=(1,2))
x_train_idf=tfidf.fit_transform(x_train)
x_test_idf=tfidf.transform(x_test)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score

model=LogisticRegression(max_iter=2000)
model.fit(x_train_idf,y_train)
y_pred=model.predict(x_test_idf)
accuracy=accuracy_score(y_test,y_pred)
print("Accuracy : ",accuracy)
print(classification_report(y_test,y_pred))

Accuracy :  0.9473684210526315
               precision    recall  f1-score   support

     business       0.89      1.00      0.94        25
entertainment       0.93      0.88      0.90        16
     politics       1.00      0.94      0.97        18
        sport       1.00      1.00      1.00        22
         tech       0.92      0.86      0.89        14

     accuracy                           0.95        95
    macro avg       0.95      0.94      0.94        95
 weighted avg       0.95      0.95      0.95        95



In [22]:
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding,Dropout


In [23]:
tokenizer=Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)

In [24]:
x_train_seq=tokenizer.texts_to_sequences(x_train)
x_test_seq=tokenizer.texts_to_sequences(x_test)

In [25]:
print(x_train_seq)

[[71, 6440, 1724, 173, 76, 456, 71, 2106, 76, 30, 1110, 397, 4, 1724, 49, 180, 67, 87, 150, 40, 288, 79, 30, 45, 7, 4712, 3771, 121, 139, 5, 350, 397, 217, 3, 71, 218, 31, 1372, 26, 42, 3, 397, 1111, 3138, 4713, 866, 3, 71, 1, 173, 6441, 110, 27, 230, 3772, 311, 46, 3, 71, 1, 1584, 110, 27, 230, 1, 87, 150, 40, 9, 915, 3772, 151, 486, 46, 350, 264, 1276, 1188, 688, 3773, 4714, 487, 1, 867, 2352, 3139, 6442, 173, 456, 71, 868, 526, 237, 737, 1041, 620, 275, 816, 976, 55, 33, 2107, 2108, 1473, 288, 79, 289, 4715, 30, 3, 397, 1585, 546, 139, 3, 3773, 288, 79, 1, 144, 23, 2, 546, 1277, 1373, 866, 371, 3774, 3138, 1111, 2353, 817, 4716, 6443, 139, 350, 3, 3773, 30, 45, 94, 3140, 738, 527, 2354, 350, 16, 916, 6444, 2352, 6445, 2352, 3139, 75, 90, 129, 774, 488, 6446, 594, 6447, 621, 75, 869, 77, 397, 4717, 340, 2355, 3775, 665, 151, 1586, 775, 3141, 45, 1189, 4718, 151, 1190, 870, 818, 1278, 977, 1587, 547, 1042, 211, 3, 71, 1, 23, 68, 1112, 23, 351, 6448, 206, 1043, 8, 26, 1043, 173, 6449, 

In [26]:
x_train_pad=pad_sequences(x_train_seq,maxlen=200)
x_test_pad=pad_sequences(x_test_seq,maxlen=200)

In [27]:
print(len(y.unique()))

5


In [58]:
from keras.regularizers import l2
model=Sequential()
model.add(Embedding(input_dim=1000,output_dim=32,input_length=200))
model.add(LSTM(units=32,dropout=0.3,recurrent_dropout=0.3))
model.add(Dense(units=16,activation="relu",kernel_regularizer=l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(units=5,activation="softmax"))

model.build(input_shape=(None, 200))
model.summary()




In [60]:
model.compile(loss="sparse_categorical_crossentropy",optimizer="adam",metrics=["accuracy"])

In [61]:
model.fit(x_train_pad,y_train.factorize()[0],epochs=10,batch_size=16,validation_data=(x_test_pad,y_test.factorize()[0]))

Epoch 1/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 717ms/step - accuracy: 0.2285 - loss: 1.6283 - val_accuracy: 0.2316 - val_loss: 1.6264
Epoch 2/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 654ms/step - accuracy: 0.2542 - loss: 1.6144 - val_accuracy: 0.2316 - val_loss: 1.6252
Epoch 3/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 728ms/step - accuracy: 0.2461 - loss: 1.6076 - val_accuracy: 0.2316 - val_loss: 1.6213
Epoch 4/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 658ms/step - accuracy: 0.3126 - loss: 1.5625 - val_accuracy: 0.2316 - val_loss: 1.6347
Epoch 5/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 1s/step - accuracy: 0.3029 - loss: 1.5202 - val_accuracy: 0.2421 - val_loss: 1.7583
Epoch 6/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 2s/step - accuracy: 0.4526 - loss: 1.3740 - val_accuracy: 0.2632 - val_loss: 1.7475
Epoch 7/10
[1m24/24[0m [3

<keras.src.callbacks.history.History at 0x78b86282fce0>

In [68]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import create_optimizer
from tensorflow.keras.losses import SparseCategoricalCrossentropy

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def encode_texts(texts, tokenizer, max_len=128):
    return tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors="tf", max_length=max_len)

train_encodings = encode_texts(x_train, tokenizer)
test_encodings = encode_texts(x_test, tokenizer)

model_bert = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(y.unique()), from_pt=True)
batch_size = 16
epochs = 3
num_train_steps = (len(x_train) // batch_size) * epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps
)
model_bert.compile(optimizer=optimizer, loss=SparseCategoricalCrossentropy(from_logits=True), metrics=["accuracy"])

history = model_bert.fit(
    x=dict(train_encodings),
    y=y_train.factorize()[0],
    validation_data=(dict(test_encodings), y_test.factorize()[0]),
    epochs=3,
    batch_size=16
)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
