In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

data = pd.read_csv("clean_data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Label,Text
0,0,Politics,budget to set scene for election gordon brown ...
1,1,Politics,army chiefs in regiments decision military chi...
2,3,Politics,observers to monitor uk election ministers wil...
3,4,Politics,kilroy names election seat target exchat show ...
4,5,Politics,donor attacks blairbrown feud the reported feu...


In [2]:
data.replace({0:"Politics", 1:"Sport", 2:"Technology", 3:"Entertainment", 4:"Business"}, inplace=True)
data.Label.value_counts()

Label
Sport            505
Business         503
Politics         403
Entertainment    369
Technology       347
Name: count, dtype: int64

In [3]:
label_encoder = LabelEncoder()
data['Label'] = label_encoder.fit_transform(data['Label'])

In [4]:
X_train, X_val, y_train, y_val = train_test_split(data["Text"], data["Label"], test_size=0.2, random_state=42)

In [5]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(X_val), truncation=True, padding=True, max_length=128)

In [6]:
import torch

class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx])) 
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, y_train.tolist())
val_dataset = SentimentDataset(val_encodings, y_val.tolist())

In [7]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5) 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [9]:
trainer.train()
eval_results = trainer.evaluate()
print(f"Accuracy: {eval_results['eval_accuracy']}")
print(f"Loss: {eval_results['eval_loss']}")

  0%|          | 0/639 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

{'eval_loss': 0.10274588316679001, 'eval_accuracy': 0.9835680751173709, 'eval_runtime': 45.9447, 'eval_samples_per_second': 9.272, 'eval_steps_per_second': 1.175, 'epoch': 1.0}


  0%|          | 0/54 [00:00<?, ?it/s]

{'eval_loss': 0.0964592695236206, 'eval_accuracy': 0.9812206572769953, 'eval_runtime': 41.9338, 'eval_samples_per_second': 10.159, 'eval_steps_per_second': 1.288, 'epoch': 2.0}
{'loss': 0.1652, 'grad_norm': 0.0162372849881649, 'learning_rate': 1.0876369327073553e-05, 'epoch': 2.35}


  0%|          | 0/54 [00:00<?, ?it/s]

{'eval_loss': 0.08361171931028366, 'eval_accuracy': 0.9859154929577465, 'eval_runtime': 43.7277, 'eval_samples_per_second': 9.742, 'eval_steps_per_second': 1.235, 'epoch': 3.0}
{'train_runtime': 2346.1096, 'train_samples_per_second': 2.175, 'train_steps_per_second': 0.272, 'train_loss': 0.13871729168720276, 'epoch': 3.0}


  0%|          | 0/54 [00:00<?, ?it/s]

Accuracy: 0.9859154929577465
Loss: 0.08361171931028366


In [15]:
test_text = ["The government has announced a new policy aimed at reducing carbon emissions by 50% by 2030."]
test_encodings = tokenizer(test_text, truncation=True, padding=True, max_length=128, return_tensors="pt")
outputs = model(**test_encodings)
preds = torch.argmax(outputs.logits, dim=1)
predicted_label = label_encoder.inverse_transform(preds.numpy())
print(f"Predicted label: {predicted_label[0]}")

Predicted label: Business


In [11]:
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')

('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\vocab.txt',
 './saved_model\\added_tokens.json')

In [12]:
import gradio as gr
model = BertForSequenceClassification.from_pretrained('./saved_model')
tokenizer = BertTokenizer.from_pretrained('./saved_model')

In [13]:
def predict(text):
    # Tokeniser le texte d'entrée
    inputs = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors="pt")
    # Obtenir les prédictions
    outputs = model(**inputs)
    preds = torch.argmax(outputs.logits, dim=1)
    # Convertir la prédiction en label
    predicted_label = label_encoder.inverse_transform(preds.numpy())
    return predicted_label[0]

In [14]:
# Create the Gradio interface
iface = gr.Interface(fn=predict, inputs="text", outputs="text", title="Text Classification", description="Enter a text to predict its category.")

# Launch the interface
iface.launch()

* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


