<a href="https://colab.research.google.com/github/Karthikt04/NM/blob/main/MyZoom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
# Install dependencies
!pip install --quiet transformers datasets torch scikit-learn pandas openpyxl gradio

In [29]:
#Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
import torch

In [30]:
# Upload and load data
from google.colab import files
print("Upload 'train.xlsx':")
_ = files.upload()
print("Upload 'evaluation.xlsx':")
_ = files.upload()

df_train = pd.read_excel('train.xlsx')
df_eval = pd.read_excel('evaluation.xlsx')

Upload 'train.xlsx':


Saving train.xlsx to train.xlsx
Upload 'evaluation.xlsx':


Saving evaluation.xlsx to evaluation.xlsx


In [31]:
# Inspect & balance
# Drop NaNs
df_train.dropna(subset=['text','reason','label'], inplace=True)
print(df_train['label'].value_counts())

# (Optional) simple augmentation for negative class:
neg_df = df_train[df_train.label == 0]
pos_df = df_train[df_train.label == 1]
# create mismatched pairs = shuffle reasons
shuffled = pos_df.copy()
shuffled['reason'] = np.random.permutation(shuffled['reason'].values)
df_aug = pd.concat([df_train, shuffled]).reset_index(drop=True)
print("After augmentation:", df_aug['label'].value_counts())

label
1    2061
Name: count, dtype: int64
After augmentation: label
1    4122
Name: count, dtype: int64


In [33]:
# Preprocessing helper (if needed)
def preprocess_text(text):
    return text.strip()

df_aug['text'] = df_aug['text'].apply(preprocess_text)

In [34]:
# Tokenization
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_fn(batch):
    return tokenizer(batch['text'], batch['reason'], truncation=True, padding='max_length', max_length=128)

# Convert to Hugging Face Dataset
hf_train = Dataset.from_pandas(df_aug)
hf_eval = Dataset.from_pandas(df_eval)

# Tokenize
hf_train = hf_train.map(tokenize_fn, batched=True)
hf_eval = hf_eval.map(tokenize_fn, batched=True)

# Set format
hf_train.set_format(type='torch', columns=['input_ids','attention_mask','label'])
hf_eval.set_format(type='torch', columns=['input_ids','attention_mask','label'])

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Map:   0%|          | 0/4122 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

In [42]:
# Model & TrainingArguments
import os
os.environ["WANDB_DISABLED"] = "true"
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to="none",
    save_total_limit=1
)


loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/re

In [44]:
# Metrics function
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    cm = confusion_matrix(labels, preds)
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}

In [45]:
# Trainer setup & train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train,
    eval_dataset=hf_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: reason, text. If reason, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4122
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1548


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate & visualize
metrics = trainer.evaluate()
print(metrics)

# Confusion matrix
import seaborn as sns, matplotlib.pyplot as plt
cm = confusion_matrix(hf_eval['label'], np.argmax(trainer.predict(hf_eval).predictions, axis=1))
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Save model
model.push_to_hub('my-zoom-fb-validation-model')  # requires HF login

In [None]:
# Gradio demo (optional)
import gradio as gr

def predict_fn(text, reason):
    inputs = tokenizer(text, reason, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
    outputs = model(**inputs)
    prob = torch.softmax(outputs.logits, dim=1)[0,1].item()
    return {'Aligned (1)': prob, 'Not Aligned (0)': 1-prob}

iface = gr.Interface(fn=predict_fn,
                     inputs=[gr.Textbox(label='Feedback'), gr.Textbox(label='Reason')],
                     outputs=[gr.Label(num_top_classes=2)],
                     title='My Zoom: Feedback Validator')
iface.launch(share=True)