In [None]:
import pandas as pd 
import numpy as np

from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd /content/drive/MyDrive/churn_gpt/

In [None]:
!pip install transformers

In [25]:
# Import required libraries
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
import pandas as pd 
import numpy as np

In [26]:
data = pd.read_csv('SA_model_data', low_memory=False, index_col=0)
data.reset_index(drop=True,inplace=True)
data, final_data = train_test_split(data, test_size = 0.8, random_state = 42, stratify=data['emotion'])
data

Unnamed: 0,emotion,text
20334,rq,fun time friend beer pic
5420,fear,simpli said sorri got car got hous feel restless
1992,anger,ok eye doctor guy take forev sit cold room mys...
24762,worry,listen ryan adam sick
8251,happiness,relax thank hope get put foot tomorrow enjoy day
...,...,...
14994,sadness,know everi babi differ feel like alreadi exhau...
20866,rq,watch star trek yet thought watch last night w...
13049,sadness,sick
14939,sadness,feel broke fix


In [27]:
# Load the data
#data = data[data['type'].isin(['churn', 'non_churn'])]  # Filter out 'cool-off' instances
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [28]:
train_data.emotion.value_counts()

worry        651
anger        648
rq           641
sadness      641
happiness    640
love         632
fear         627
Name: emotion, dtype: int64

In [29]:
# Create a custom dataset class
class ProspectDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_mapping = {  'anger': 0,
                                'fear': 1,
                                'happiness': 2,
                                'love': 3,
                                'rq': 4,
                                'sadness': 5,
                                'worry': 6}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        text = row["text"]
        label = row["emotion"]

        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True,
            return_token_type_ids=False,
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "label": torch.tensor(self.label_mapping[label], dtype=torch.long),
        }

In [30]:
# Set up hyperparameters, tokenizer, and datasets
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LEN = 100
BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 2e-5
NUM_CLASSES = 7

In [None]:
!pip install sentencepiece

In [31]:
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")
train_dataset = ProspectDataset(train_data, tokenizer, MAX_LEN)
test_dataset = ProspectDataset(test_data, tokenizer, MAX_LEN)

In [32]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [33]:
# Load the pre-trained model and fine-tune
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-large", num_labels=NUM_CLASSES)#.to(device)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * EPOCHS)

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.out

In [34]:
# Training loop
model.train()
print("Training:")
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_loss = 0.0
    for batch in tqdm(train_loader, desc="Batch", leave=False):
        input_ids = batch["input_ids"]#.to(device)
        attention_mask = batch["attention_mask"]#.to(device)
        labels = batch["label"]#.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Average training loss: {avg_train_loss:.4f}")

Training:
Epoch 1/10


                                                      

ValueError: Input nan is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

In [None]:
# Model evaluation
model.eval()
print("\nTesting:")
predictions, true_labels = [], []
test_loss = 0.0
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Batch", leave=False):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        test_loss += loss.item()
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = labels.to("cpu").numpy()

        predictions.extend(np.argmax(logits, axis=1))
        true_labels.extend(label_ids)

avg_test_loss = test_loss / len(test_loader)
print(f"Average testing loss: {avg_test_loss:.4f}")

# Results

In [None]:
!pip install seaborn

In [None]:
# Import required functions and libraries
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Print classification report and confusion matrix
print("Classification Report:")
print(classification_report(true_labels, predictions, target_names=["churn", "non-churn"]))

conf_matrix = confusion_matrix(true_labels, predictions)
print("Confusion Matrix:")
print(conf_matrix)

# Calculate false positive and false negative scores
false_positive = conf_matrix[0][1]
false_negative = conf_matrix[1][0]
print(f"False Positive Score: {false_positive} (predict )")
print(f"False Negative Score: {false_negative}")

# Visualize confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["churn", "non-churn"], yticklabels=["churn", "non-churn"])
plt.title("Confusion Matrix")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.show()

# Save the fine-tuned model

In [None]:
# model.save_pretrained("fine_tuned_multilingual_bert")
# tokenizer.save_pretrained("fine_tuned_multilingual_bert")

# Load the fine-tuned model for future use

In [None]:
# loaded_model = BertForSequenceClassification.from_pretrained("fine_tuned_multilingual_bert").to(device)
# loaded_tokenizer = BertTokenizer.from_pretrained("fine_tuned_multilingual_bert")