In [None]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m297.0/302.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.

In [None]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-an

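## BERT Training and Testing

Fine-tune `bert-base-uncased` to predict customer churn from text descriptions of Telco customer records, then evaluate the result on a held-out test split.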

In [None]:
# Mount Google Drive at /content/drive; the model output and log directories
# used later in this cell live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os

def compute_metrics(pred):
    """Compute binary-classification metrics for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores of shape (n_samples, n_classes),
            or a tuple whose first element holds those scores).

    Returns:
        dict with keys 'accuracy', 'f1', 'precision' and 'recall'. The last
        three use ``average='binary'``, i.e. they describe the positive class.
    """
    labels = pred.label_ids
    scores = pred.predictions
    # Models that return extra outputs hand back a tuple; the class scores
    # come first. Plain arrays pass through unchanged.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = np.argmax(scores, axis=1)
    # zero_division=0 keeps the same numeric result as the default (0.0) but
    # without an UndefinedMetricWarning when an evaluation pass contains no
    # predicted (or no true) positives.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Load the Telco customer churn spreadsheet.
df = pd.read_excel('/content/Telco_customer_churn.xlsx')
# Entries in 'Total Charges' that cannot be parsed as numbers become NaN.
df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')
# Fill those NaNs with the column median. The result is assigned back instead
# of calling fillna(inplace=True) on the column selection: that chained
# in-place form is deprecated in pandas 2.x and leaves `df` unchanged under
# Copy-on-Write.
df['Total Charges'] = df['Total Charges'].fillna(df['Total Charges'].median())

def concatenate_text(x):
    """Render one customer record as a text description plus its churn label.

    Args:
        x: Row-like mapping indexed by column name (e.g. a DataFrame row).

    Returns:
        dict with "text" (the space-joined description) and "label"
        (the row's 'Churn Value' converted to int).
    """
    def phrase(column, if_yes, otherwise):
        # Only the exact string 'Yes' counts as affirmative; every other
        # value selects the negative phrasing.
        if x[column] == 'Yes':
            return if_yes
        return otherwise

    parts = []
    parts.append(f"Gender: {x['Gender']}, Senior: {x['Senior Citizen']}")
    parts.append(f"Partner: {x['Partner']}, Dependent: {x['Dependents']}")
    parts.append(phrase('Multiple Lines', "has multiple lines", "does not have multiple lines"))

    # Internet service is named explicitly unless the customer has none.
    if x['Internet Service'] != 'No':
        parts.append(f"uses {x['Internet Service']} internet service")
    else:
        parts.append("does not use internet service")

    parts.append(f"is on a {x['Contract']} contract")
    parts.append(phrase('Streaming TV', "subscribes to streaming TV", "does not subscribe to streaming TV"))
    parts.append(phrase('Streaming Movies', "subscribes to streaming movies", "does not subscribe to streaming movies"))
    parts.append(phrase('Paperless Billing', "uses paperless billing", "does not use paperless billing"))
    parts.append(phrase('Tech Support', "has tech support", "no tech support"))
    parts.append(
        f"Tenure: {x['Tenure Months']} months, Monthly charges: {x['Monthly Charges']} dollars, Total charges: {x['Total Charges']} dollars."
    )

    description = ' '.join(parts)
    churned = int(x['Churn Value'])
    return {"text": description, "label": churned}

# Replace the raw frame with the two columns returned by concatenate_text
# ("text" and "label"), computed row by row.
df = df.apply(concatenate_text, axis=1, result_type='expand')
# 70% of the rows go to training; the remaining 30% is halved into validation
# and test sets. The fixed random_state makes both splits repeatable.
# NOTE(review): neither split passes `stratify`, so the label ratio may differ
# slightly between the three sets — confirm this is intended.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Checkpoints and logs for this run are written beneath one folder on the
# mounted Drive.
base_path = '/content/drive/My Drive/Colab Model Outputs'
output_dir = f'{base_path}/bert/results'
logging_dir = f'{base_path}/bert/logs'

# Create both folders up front; exist_ok makes this a no-op when they are
# already present.
for required_dir in (output_dir, logging_dir):
    os.makedirs(required_dir, exist_ok=True)

# Load the tokenizer and pretrained weights of the 'bert-base-uncased'
# checkpoint. num_labels=2 requests a two-class sequence-classification model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

def tokenize_function(examples, max_length=512):
    """Tokenize the 'text' entries of a batch with the module-level tokenizer.

    Args:
        examples: Mapping with a 'text' entry holding the string(s) to encode.
        max_length: Fixed sequence length. Longer inputs are truncated and
            shorter ones are padded up to this length. Defaults to 512, the
            value previously hard-coded here.

    Returns:
        The tokenizer's encoding for the batch.
    """
    # padding="max_length" pads every example to `max_length`, not just to
    # the longest example in the batch.
    return tokenizer(
        examples['text'],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

# Convert each pandas split into a Dataset and tokenize it in batches, one
# split at a time.
train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize_function, batched=True)

# Training configuration: evaluate and checkpoint once per epoch, log every
# 10 steps, and restore the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    logging_dir=logging_dir,
    logging_steps=10,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
    # output_dir is on Google Drive. Without a limit, every per-epoch
    # checkpoint is kept for all 10 epochs. With load_best_model_at_end=True
    # the best checkpoint is always retained alongside the most recent one,
    # so capping the count does not affect which model is restored.
    save_total_limit=2,
    load_best_model_at_end=True
)

# Wire the model, training configuration and data together. The validation
# split is used for the per-epoch evaluation, and compute_metrics adds
# accuracy/F1/precision/recall to each evaluation report.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune the model, then write the model and tokenizer files into
# output_dir (the same directory the Trainer uses for its checkpoints).
# NOTE(review): training_args sets load_best_model_at_end=True, so `model` is
# presumably the best checkpoint rather than the last epoch — confirm.
trainer.train()
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")

# Final check on the held-out test split, which was not used during training
# or per-epoch evaluation.
test_results = trainer.evaluate(test_dataset)
print(f"Test Results: {test_results}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4930 [00:00<?, ? examples/s]

Map:   0%|          | 0/1056 [00:00<?, ? examples/s]

Map:   0%|          | 0/1057 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4272,0.488895,0.769886,0.362205,0.873418,0.228477
2,0.4827,0.432644,0.800189,0.582178,0.724138,0.486755
3,0.3751,0.433305,0.798295,0.640809,0.652921,0.629139
4,0.4905,0.446542,0.775568,0.623211,0.599388,0.649007
5,0.2828,0.461509,0.783144,0.548323,0.678049,0.460265
6,0.3259,0.547212,0.785038,0.577281,0.659574,0.513245
7,0.305,0.538699,0.785038,0.552268,0.682927,0.463576
8,0.3365,0.524608,0.782197,0.569288,0.655172,0.503311
9,0.2837,0.536514,0.779356,0.598967,0.623656,0.576159
10,0.312,0.591586,0.777462,0.576577,0.632411,0.529801


Model saved to /content/drive/My Drive/Colab Model Outputs/bert/results


Test Results: {'eval_loss': 0.4115445017814636, 'eval_accuracy': 0.7947019867549668, 'eval_f1': 0.5469728601252608, 'eval_precision': 0.6787564766839378, 'eval_recall': 0.458041958041958, 'eval_runtime': 38.3643, 'eval_samples_per_second': 27.552, 'eval_steps_per_second': 0.886, 'epoch': 10.0}
