<a href="https://colab.research.google.com/github/HTMLHrishi/ExcelProject/blob/main/GENAIProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 📦 Step 1: Install and Import
import pandas as pd
import numpy as np
from google.colab import files
import io
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# 📤 Step 2: Upload CSV file (Expecting: "feedback", "category")
# files.upload() returns a dict keyed by file name whose values are the uploaded bytes.
uploaded = files.upload()
if not uploaded:
    # The upload dialog was dismissed; fail with a clear message instead of a bare StopIteration.
    raise ValueError("No file was uploaded; please select a CSV with 'feedback' and 'category' columns.")

# Parse the bytes that were just uploaded instead of re-opening a file by name, so the
# data cannot come from a stale same-named file in the working directory.
uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
df = pd.read_csv(io.BytesIO(uploaded_bytes))


Saving college_feedback_200.csv to college_feedback_200.csv


In [None]:
# 🧼 Step 3: Basic cleaning
# Keep rows that have both fields and a feedback text longer than 10 characters,
# then renumber the surviving rows from 0.
df = (
    df.dropna(subset=["feedback", "category"])
    .loc[lambda frame: frame["feedback"].str.len() > 10]
    .reset_index(drop=True)
)

# 🔢 Step 4: Encode labels
# Build the categorical once; the integer codes and the code -> name map both derive from it.
category_values = df["category"].astype("category")
df["label"] = category_values.cat.codes
label_map = dict(enumerate(category_values.cat.categories))

# 🧪 Step 5: Train-test split
# 80/20 split with a fixed random_state so the partition is repeatable.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# 🔤 Step 6: Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(texts):
    """Tokenize feedback text with the module-level ``tokenizer``.

    Args:
        texts: The text (or list of texts) handed straight to the tokenizer.

    Returns:
        The tokenizer's output, with truncation enabled and padding set to
        ``"max_length"`` at a maximum length of 128.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(texts, **encode_options)

# Pull the raw feedback strings out of each split, then tokenize them.
train_texts = train_df["feedback"].tolist()
test_texts = test_df["feedback"].tolist()
train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)

# 📚 Step 7: Dataset class
class FeedbackDataset(Dataset):
    """Map-style dataset pairing tokenizer output with integer class labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from feature name
            to a per-example sequence of values.
        labels: One integer class id per example, in the same order as the
            encodings. A pandas Series is re-indexed positionally (its
            original index is discarded); any other iterable, such as a list
            or NumPy array, is copied into a list.

    Raises:
        ValueError: If any encoding field does not have exactly one entry per label.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # A shuffled train/test split carries a non-contiguous index, so labels
        # must be addressed by position rather than by original index value.
        if hasattr(labels, "reset_index"):
            self.labels = labels.reset_index(drop=True)
        else:
            self.labels = list(labels)

        # Fail at construction time instead of partway through training.
        expected = len(self.labels)
        for key, values in self.encodings.items():
            if len(values) != expected:
                raise ValueError(
                    f"Encoding field {key!r} has {len(values)} entries but there are {expected} labels."
                )

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including a ``"labels"`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # The label tensor is created as torch.long explicitly.
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

# Wrap the tokenized splits so they can be indexed example by example.
train_dataset = FeedbackDataset(train_encodings, train_df["label"])
test_dataset = FeedbackDataset(test_encodings, test_df["label"])

# 🧠 Step 8: Load BERT model
# The classifier head is not in the checkpoint and is newly (randomly) initialised by
# from_pretrained, so seed torch first; otherwise each run starts from different head weights.
torch.manual_seed(42)  # same value as the train/test split's random_state
num_labels = df["label"].nunique()
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    problem_type="single_label_classification",  # Specify problem_type
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# ⚙️ Step 9: Training setup
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",   # evaluate on the held-out split after every epoch
    save_strategy="no",      # do not write checkpoints
    logging_steps=10,
    report_to="none"         # no external experiment-tracking integrations
)

def compute_metrics(pred):
    """Compute accuracy for an evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the arg-max
            over the last axis is taken as the predicted class) and
            ``label_ids`` (the reference class ids).

    Returns:
        ``{"accuracy": fraction of examples whose predicted class equals the label}``.
    """
    predicted_ids = pred.predictions.argmax(-1)
    is_correct = predicted_ids == pred.label_ids
    return {"accuracy": is_correct.mean()}

# Hand the model, hyper-parameters, both data splits and the metric hook to the Trainer.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

# 🚀 Step 10: Train!
# Fine-tune for the number of epochs configured in training_args.
print("🔥 Training with BERT...")
trainer.train()
print("✅ Training complete!")

# 🧪 Step 11: Predict on sample
def classify_feedback(text):
    """Predict the category name for a single piece of feedback.

    Uses the module-level ``tokenizer``, ``model``, ``device`` and ``label_map``.

    Args:
        text: The feedback string to classify.

    Returns:
        The ``label_map`` entry for the highest-scoring class.
    """
    # Explicitly switch to inference mode so dropout is disabled regardless of
    # what state a previous training or evaluation step left the model in.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Arg-max over the class axis of the single example, not over the flattened tensor.
    prediction = logits[0].argmax(dim=-1).item()
    return label_map[prediction]

# ✅ Test it
# Quick smoke test on one hand-written example.
sample = "The library has poor Wi-Fi and broken chairs."
print(f"\nSample Feedback:\n{sample}\nPredicted Category: {classify_feedback(sample)}")


# 📊 Step 12: Predict all
# Classify every feedback row (one call per row), then tally how often each category was predicted.
df["predicted_category"] = df["feedback"].map(classify_feedback)
prediction_counts = df["predicted_category"].value_counts()
print("\n📈 Prediction Distribution:")
print(prediction_counts)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔥 Training with BERT...


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0932,0.930347,0.85
2,0.4293,0.323647,1.0
3,0.2376,0.194142,1.0


✅ Training complete!

Sample Feedback:
The library has poor Wi-Fi and broken chairs.
Predicted Category: Facilities

📈 Prediction Distribution:
predicted_category
Academics          44
Hostel             41
Administration     40
Facilities         38
Extracurricular    37
Name: count, dtype: int64
