<a href="https://colab.research.google.com/github/Harish26242002/Data-Science-Portfolio/blob/main/Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd

df = pd.read_csv('/content/it_tickets_dataset.csv')
df.head()

Unnamed: 0,Folder_Name,File_Name,File_Type,Description,Content
0,Hardware_Issue,ticket_01.json,JSON,User reported: Hard drive failure. Additional ...,"{""ticket_id"": ""TICK-HAR-0001"", ""ticket_type"": ..."
1,Hardware_Issue,ticket_01.pdf,PDF,User reported: Hard drive failure. Additional ...,IT Support Ticket: TICK-HAR-0001\nTicket Infor...
2,Hardware_Issue,ticket_02.json,JSON,User reported: Keyboard not working. Additiona...,"{""ticket_id"": ""TICK-HAR-0002"", ""ticket_type"": ..."
3,Hardware_Issue,ticket_02.pdf,PDF,User reported: Keyboard not working. Additiona...,IT Support Ticket: TICK-HAR-0002\nTicket Infor...
4,Hardware_Issue,ticket_03.json,JSON,User reported: USB port not working. Additiona...,"{""ticket_id"": ""TICK-HAR-0003"", ""ticket_type"": ..."


In [8]:
import json
import re

def extract_description(row):
    if row['File_Type'] == 'JSON':
        try:
            content_json = json.loads(row['Content'])
            return content_json.get('description')
        except json.JSONDecodeError:
            return None

    elif row['File_Type'] == 'PDF':
        text = row['Content']

        pattern = re.compile(
            r'Description\s*\n(.*?)(?:\n[A-Z][A-Za-z ]{2,}\n|\Z)',
            re.DOTALL
        )

        match = pattern.search(text)
        if match:
            return match.group(1).strip()

    return None


df['Description'] = df.apply(extract_description, axis=1)
display(df.head())


Unnamed: 0,Folder_Name,File_Name,File_Type,Description,Content
0,Hardware_Issue,ticket_01.json,JSON,User reported: Hard drive failure. Additional ...,"{""ticket_id"": ""TICK-HAR-0001"", ""ticket_type"": ..."
1,Hardware_Issue,ticket_01.pdf,PDF,User reported: Hard drive failure. Additional ...,IT Support Ticket: TICK-HAR-0001\nTicket Infor...
2,Hardware_Issue,ticket_02.json,JSON,User reported: Keyboard not working. Additiona...,"{""ticket_id"": ""TICK-HAR-0002"", ""ticket_type"": ..."
3,Hardware_Issue,ticket_02.pdf,PDF,User reported: Keyboard not working. Additiona...,IT Support Ticket: TICK-HAR-0002\nTicket Infor...
4,Hardware_Issue,ticket_03.json,JSON,User reported: USB port not working. Additiona...,"{""ticket_id"": ""TICK-HAR-0003"", ""ticket_type"": ..."


In [9]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')

# Load English stopwords
stop_words = set(stopwords.words('english'))

# Function to remove "User reported"
def remove_user_reported(text):
    if pd.isna(text):
        return text
    return re.sub(r'\buser reported\b', '', text, flags=re.IGNORECASE)

# Function to remove special characters
def remove_special_characters(text):
    if pd.isna(text):
        return text
    return re.sub(r'[^A-Za-z\s]', '', text)

# Function to remove stopwords
def remove_stopwords(text):
    if pd.isna(text):
        return text
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Function to convert text to UPPERCASE
def convert_to_uppercase(text):
    if pd.isna(text):
        return text
    return text.upper()

# Apply all functions on the 'desciriotn' column
df['Description'] = df['Description'].apply(remove_user_reported)
df['Description'] = df['Description'].apply(remove_special_characters)
df['Description'] = df['Description'].apply(remove_stopwords)
df['Description'] = df['Description'].apply(convert_to_uppercase)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
df.head()

Unnamed: 0,Folder_Name,File_Name,File_Type,Description,Content
0,Hardware_Issue,ticket_01.json,JSON,HARD DRIVE FAILURE ADDITIONAL DETAILS PHYSICAL...,"{""ticket_id"": ""TICK-HAR-0001"", ""ticket_type"": ..."
1,Hardware_Issue,ticket_01.pdf,PDF,HARD DRIVE FAILURE ADDITIONAL DETAILS PHYSICAL...,IT Support Ticket: TICK-HAR-0001\nTicket Infor...
2,Hardware_Issue,ticket_02.json,JSON,KEYBOARD WORKING ADDITIONAL DETAILS DEVICE WOR...,"{""ticket_id"": ""TICK-HAR-0002"", ""ticket_type"": ..."
3,Hardware_Issue,ticket_02.pdf,PDF,KEYBOARD WORKING ADDITIONAL DETAILS DEVICE WOR...,IT Support Ticket: TICK-HAR-0002\nTicket Infor...
4,Hardware_Issue,ticket_03.json,JSON,USB PORT WORKING ADDITIONAL DETAILS ISSUE STAR...,"{""ticket_id"": ""TICK-HAR-0003"", ""ticket_type"": ..."


In [11]:
# =========================================================
# BERT Text Classification (CPU ONLY, OLD transformers SAFE)
# X = Description
# Y = Folder_Name
# =========================================================

# pip install transformers datasets torch scikit-learn pandas

import pandas as pd
import numpy as np
import torch

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

# -------------------------
# 1. Load Dataset

X = df["Description"].astype(str)
y = df["Folder_Name"].astype(str)

# -------------------------
# 2. Encode Labels
# -------------------------
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_labels = len(label_encoder.classes_)

print("Classes:", label_encoder.classes_)

# -------------------------
# 3. HuggingFace Dataset
# -------------------------
hf_dataset = Dataset.from_dict({
    "text": X.tolist(),
    "label": y_encoded.tolist()
})

hf_dataset = hf_dataset.train_test_split(test_size=0.2, seed=42)

# -------------------------
# 4. Tokenizer
# -------------------------
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

hf_dataset = hf_dataset.map(tokenize, batched=True)
hf_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"]
)

# -------------------------
# 5. Model
# -------------------------
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels
)

# -------------------------
# 6. Training Arguments (OLD VERSION SAFE)
# -------------------------
training_args = TrainingArguments(
    output_dir="./bert_ticket_classifier",
    eval_strategy="epoch",          # âœ… OLD VERSION PARAM
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    no_cuda=True,                   # CPU ONLY
    dataloader_pin_memory=False,
    logging_steps=10,
    save_strategy="epoch"
)

# -------------------------
# 7. Metrics
# -------------------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted"
    )
    acc = accuracy_score(labels, preds)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# -------------------------
# 8. Trainer
# -------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset["train"],
    eval_dataset=hf_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# -------------------------
# 9. Train
# -------------------------
trainer.train()

# -------------------------
# 10. Evaluate
# -------------------------
results = trainer.evaluate()
print("Evaluation Results:", results)

# -------------------------
# 11. Inference
# -------------------------
def predict_folder(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True
    )
    outputs = model(**inputs)
    pred_id = torch.argmax(outputs.logits, dim=1).item()
    return label_encoder.inverse_transform([pred_id])[0]

# Example Prediction
print(
    predict_folder(
        "Ransomware detected on system and files encrypted"
    )
)


Classes: ['Access_Request' 'Hardware_Issue' 'Network_Problem' 'Password_Reset'
 'Security_Incident' 'Software_Issue' 'System_Error']


Map:   0%|          | 0/112 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8639,1.680018,0.392857,0.234609,0.392857,0.282878
2,1.6384,1.516763,0.464286,0.675,0.464286,0.459006
3,1.2421,1.275017,0.75,0.880357,0.75,0.770166


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluation Results: {'eval_loss': 1.2750169038772583, 'eval_accuracy': 0.75, 'eval_precision': 0.8803571428571428, 'eval_recall': 0.75, 'eval_f1': 0.7701659451659452, 'eval_runtime': 10.2667, 'eval_samples_per_second': 2.727, 'eval_steps_per_second': 0.39, 'epoch': 3.0}
Security_Incident


# BERT vs DistilBERT vs RoBERTa

| Feature | BERT | DistilBERT | RoBERTa |
|------|------|------------|---------|
| Full Name | Bidirectional Encoder Representations from Transformers | Distilled version of BERT | Robustly Optimized BERT Pretraining Approach |
| Developed By | Google | Hugging Face | Facebook AI (Meta) |
| Model Size | Large (110M+ parameters) | Smaller (~66M parameters) | Larger than BERT (125M+ parameters) |
| Speed | Slower | ~60% faster than BERT | Slower than BERT |
| Memory Usage | High | Low | Very High |
| Architecture | Transformer Encoder | Transformer Encoder (Compressed) | Transformer Encoder |
| Training Data | BooksCorpus + Wikipedia | Same as BERT | Much larger & more diverse datasets |
| Training Strategy | MLM + NSP | MLM only (no NSP) | MLM only (no NSP), dynamic masking |
| Masking | Static masking | Static masking | Dynamic masking |
| Next Sentence Prediction (NSP) | Yes | No | No |
| Performance | Strong baseline | Slightly lower than BERT | Better than BERT on most tasks |
| Accuracy | High | Mediumâ€“High | Very High |
| Fine-tuning Time | Longer | Shorter | Longer |
| Best Use Case | High-accuracy NLP tasks | Lightweight & real-time NLP | State-of-the-art NLP performance |
| Deployment | Heavy production systems | Edge / low-latency systems | Research & high-end production |
| Examples | Sentiment Analysis, QA | Chatbots, Text Classification | NLU, Benchmark-heavy tasks |

---

## When to Use Which?

- **BERT** â†’ When you want a **balanced model** with strong accuracy  
- **DistilBERT** â†’ When you need **speed, lower memory, and faster inference**  
- **RoBERTa** â†’ When you want **maximum accuracy** and can afford higher compute cost
