In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import numpy as np

df = pd.read_csv('./train.csv')  
df = df[['cleaned_transcript', 'primary_call_reason']]

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['primary_call_reason'])

X_train, X_test, y_train, y_test = train_test_split(df['cleaned_transcript'], df['label'], test_size=0.2, random_state=42)

train_data = pd.DataFrame({'text': X_train, 'label': y_train})
test_data = pd.DataFrame({'text': X_test, 'label': y_test})

train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': test_dataset
})

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [2]:
def preprocess_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)

tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/65986 [00:00<?, ? examples/s]

Map:   0%|          | 0/667 [00:00<?, ? examples/s]

In [3]:
# import os
# os.environ["WANDB_DISABLED"] = "true"


In [None]:

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=100,
    per_device_eval_batch_size=100,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

# Start training
trainer.train()

In [9]:
model.save_pretrained('./bert_2epochs_cleaned_transcript_tuned')
tokenizer.save_pretrained('./bert_2epochs_cleaned_transcript_tuned')

('./bert_2epochs_cleaned_transcript_tuned\\tokenizer_config.json',
 './bert_2epochs_cleaned_transcript_tuned\\special_tokens_map.json',
 './bert_2epochs_cleaned_transcript_tuned\\vocab.txt',
 './bert_2epochs_cleaned_transcript_tuned\\added_tokens.json')

In [26]:
final_test = pd.read_csv('./test.csv')
# df_all = pd.read_csv('./callsf0d4f5a.csv')
# final_test = final_test.merge(df_all[['call_id', 'call_transcript']], on='call_id', how='left')
test_texts = final_test['cleaned_transcript'].tolist()

def predict_primary_reason(texts, batch_size=302):
    predictions = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Predicting primary call reasons"):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            batch_predictions = torch.argmax(logits, dim=-1).cpu().numpy()
        predictions.extend(batch_predictions)
    return predictions

predicted_labels = predict_primary_reason(test_texts)
predicted_reasons = label_encoder.inverse_transform(predicted_labels)

# Add predictions to the final_test DataFrame
final_test['primary_call_reason'] = predicted_reasons



Predicting primary call reasons: 100%|██████████| 18/18 [00:53<00:00,  2.97s/it]


In [29]:
final_test.drop(['cleaned_transcript'], axis=1)

In [14]:
final_test.to_csv('./test_harsh_&_rohan_cleaned_transcript.csv', index=False)

In [32]:
hh = pd.read_csv('./test_harsh_&_rohan_cleaned_transcript.csv')

In [34]:
hh['primary_call_reason']

0       IRROPS
1       IRROPS
2       IRROPS
3       IRROPS
4       IRROPS
         ...  
5152    IRROPS
5153    IRROPS
5154    IRROPS
5155    IRROPS
5156    IRROPS
Name: primary_call_reason, Length: 5157, dtype: object