In [None]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [None]:
!pip install transformers datasets


Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.23.0-p

**DistilBERT Training and Testing**

In [None]:
# Colab-only setup: mount Google Drive at /content/drive. The output and logging
# directories configured later in this cell live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os

def compute_metrics(pred):
    """Compute binary-classification metrics for one evaluation pass.

    Args:
        pred: Prediction object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores); the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    # zero_division=0 makes the "no positive predictions / no positive labels"
    # case explicit: the affected score is reported as 0.0 (the same value the
    # default produces) without emitting an UndefinedMetricWarning every epoch.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Load and preprocess your dataset
df = pd.read_excel('/content/Telco_customer_churn.xlsx')
# Entries that cannot be parsed as numbers become NaN (errors='coerce') ...
df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')
# ... and are then imputed with the column median. The result is assigned back
# to the column instead of calling fillna(inplace=True) on the selection
# df['Total Charges']: that chained in-place form raises a FutureWarning in
# pandas 2.x and does not modify df once Copy-on-Write is enabled.
df['Total Charges'] = df['Total Charges'].fillna(df['Total Charges'].median())

def concatenate_text(x):
    """Render one customer record as a descriptive text plus its churn label.

    Args:
        x: Row-like object indexed by column name (for example a DataFrame
            row passed in by ``DataFrame.apply(..., axis=1)``).

    Returns:
        dict with "text" (the space-joined description fragments) and
        "label" (the ``Churn Value`` entry converted to ``int``).
    """
    def yes_no(column, if_yes, otherwise):
        # Only the literal value 'Yes' selects the affirmative wording.
        return if_yes if x[column] == 'Yes' else otherwise

    parts = []
    parts.append(f"Gender: {x['Gender']}, Senior: {x['Senior Citizen']}")
    parts.append(f"Partner: {x['Partner']}, Dependent: {x['Dependents']}")
    parts.append(yes_no('Multiple Lines', "has multiple lines", "does not have multiple lines"))

    # Internet service is described by its type unless the value is exactly 'No'.
    if x['Internet Service'] != 'No':
        parts.append(f"uses {x['Internet Service']} internet service")
    else:
        parts.append("does not use internet service")

    parts.append(f"is on a {x['Contract']} contract")
    parts.append(yes_no('Streaming TV', "subscribes to streaming TV", "does not subscribe to streaming TV"))
    parts.append(yes_no('Streaming Movies', "subscribes to streaming movies", "does not subscribe to streaming movies"))
    parts.append(yes_no('Paperless Billing', "uses paperless billing", "does not use paperless billing"))
    parts.append(yes_no('Tech Support', "has tech support", "no tech support"))
    parts.append(
        f"Tenure: {x['Tenure Months']} months, Monthly charges: {x['Monthly Charges']} dollars, Total charges: {x['Total Charges']} dollars."
    )

    description = ' '.join(parts)
    return {"text": description, "label": int(x['Churn Value'])}

# Replace the raw columns with the two columns produced by concatenate_text
# ("text" and "label"), one row per customer.
df = df.apply(concatenate_text, axis=1, result_type='expand')
# 70% train / 15% validation / 15% test. Both splits are stratified on the
# label so every subset keeps the same churn ratio; the metrics use
# average='binary', which is sensitive to how many positives a split contains.
train_df, temp_df = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df['label']
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['label']
)

# Everything this run writes goes beneath a folder on the mounted Google Drive.
base_path = '/content/drive/My Drive/Model_Outputs'
output_dir = f'{base_path}/distilbert/results'
logging_dir = f'{base_path}/distilbert/logs'

# Create both folders up front; already-existing folders are left as they are.
for directory in (output_dir, logging_dir):
    os.makedirs(directory, exist_ok=True)

# Pretrained tokenizer and a sequence classifier configured with two labels,
# both loaded from the same checkpoint name.
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 512 tokens.

    Args:
        examples: Batch mapping that contains a 'text' entry.

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Convert each split to a Dataset and tokenize it in batches, one split at a time.
train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize_function, batched=True)

# Fine-tuning configuration: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    logging_dir=logging_dir,
    logging_steps=10,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
    # Without a limit, every one of the 10 per-epoch checkpoints stays in
    # output_dir on Google Drive. Older checkpoints are pruned instead; per the
    # Transformers docs the best checkpoint is always retained when
    # load_best_model_at_end is enabled.
    save_total_limit=2,
    load_best_model_at_end=True
)

# Train on the training split and evaluate on the validation split each epoch;
# compute_metrics adds accuracy, F1, precision and recall to the reported loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Run fine-tuning, then write the model followed by the tokenizer to output_dir.
trainer.train()
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")

# Score the held-out test split with the same metrics used during training.
test_results = trainer.evaluate(test_dataset)
print(f"Test Results: {test_results}")


Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6338 [00:00<?, ? examples/s]

Map:   0%|          | 0/669 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5317,0.428807,0.789238,0.560748,0.656934,0.48913
2,0.4302,0.426533,0.793722,0.589286,0.651316,0.538043
3,0.3688,0.446698,0.793722,0.579268,0.659722,0.516304
4,0.398,0.43909,0.778774,0.478873,0.68,0.369565
5,0.4505,0.434958,0.799701,0.617143,0.650602,0.586957
6,0.3947,0.443868,0.798206,0.610951,0.650307,0.576087
7,0.36,0.460171,0.792227,0.601719,0.636364,0.570652
8,0.4265,0.471328,0.783259,0.564565,0.630872,0.51087
9,0.3848,0.486669,0.7713,0.53211,0.608392,0.472826
10,0.366,0.498093,0.775785,0.553571,0.611842,0.505435


Model and tokenizer saved to /content/drive/My Drive/Model_Outputs/distilbert/results


Test Results: {'eval_loss': 0.3818110227584839, 'eval_accuracy': 0.8888888888888888, 'eval_f1': 0.6666666666666666, 'eval_precision': 1.0, 'eval_recall': 0.5, 'eval_runtime': 0.6891, 'eval_samples_per_second': 52.239, 'eval_steps_per_second': 2.902, 'epoch': 10.0}


**ALBERT MODEL TRAINING AND TESTING**

In [None]:
# Colab-only setup: mount Google Drive at /content/drive. The output and logging
# directories configured later in this cell live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

from transformers import AlbertTokenizer, AlbertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os

def compute_metrics(pred):
    """Compute binary-classification metrics for one evaluation pass.

    Args:
        pred: Prediction object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores); the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    # zero_division=0 makes the "no positive predictions / no positive labels"
    # case explicit: the affected score is reported as 0.0 (the same value the
    # default produces) without emitting an UndefinedMetricWarning every epoch.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Load and preprocess your dataset
df = pd.read_excel('/content/Telco_customer_churn.xlsx')
# Entries that cannot be parsed as numbers become NaN (errors='coerce') ...
df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')
# ... and are then imputed with the column median. The result is assigned back
# to the column instead of calling fillna(inplace=True) on the selection
# df['Total Charges']: that chained in-place form raises a FutureWarning in
# pandas 2.x and does not modify df once Copy-on-Write is enabled.
df['Total Charges'] = df['Total Charges'].fillna(df['Total Charges'].median())

def concatenate_text(x):
    """Render one customer record as a descriptive text plus its churn label.

    Args:
        x: Row-like object indexed by column name (for example a DataFrame
            row passed in by ``DataFrame.apply(..., axis=1)``).

    Returns:
        dict with "text" (the space-joined description fragments) and
        "label" (the ``Churn Value`` entry converted to ``int``).
    """
    def yes_no(column, if_yes, otherwise):
        # Only the literal value 'Yes' selects the affirmative wording.
        return if_yes if x[column] == 'Yes' else otherwise

    parts = []
    parts.append(f"Gender: {x['Gender']}, Senior: {x['Senior Citizen']}")
    parts.append(f"Partner: {x['Partner']}, Dependent: {x['Dependents']}")
    parts.append(yes_no('Multiple Lines', "has multiple lines", "does not have multiple lines"))

    # Internet service is described by its type unless the value is exactly 'No'.
    if x['Internet Service'] != 'No':
        parts.append(f"uses {x['Internet Service']} internet service")
    else:
        parts.append("does not use internet service")

    parts.append(f"is on a {x['Contract']} contract")
    parts.append(yes_no('Streaming TV', "subscribes to streaming TV", "does not subscribe to streaming TV"))
    parts.append(yes_no('Streaming Movies', "subscribes to streaming movies", "does not subscribe to streaming movies"))
    parts.append(yes_no('Paperless Billing', "uses paperless billing", "does not use paperless billing"))
    parts.append(yes_no('Tech Support', "has tech support", "no tech support"))
    parts.append(
        f"Tenure: {x['Tenure Months']} months, Monthly charges: {x['Monthly Charges']} dollars, Total charges: {x['Total Charges']} dollars."
    )

    description = ' '.join(parts)
    return {"text": description, "label": int(x['Churn Value'])}

# Replace the raw columns with the two columns produced by concatenate_text
# ("text" and "label"), one row per customer.
df = df.apply(concatenate_text, axis=1, result_type='expand')
# 70% train / 15% validation / 15% test. Both splits are stratified on the
# label so every subset keeps the same churn ratio; the metrics use
# average='binary', which is sensitive to how many positives a split contains.
train_df, temp_df = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df['label']
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['label']
)

# Everything this run writes goes beneath a folder on the mounted Google Drive.
base_path = '/content/drive/My Drive/Colab Model Outputs'
output_dir = f'{base_path}/albert/results'
logging_dir = f'{base_path}/albert/logs'

# Create both folders up front; already-existing folders are left as they are.
for directory in (output_dir, logging_dir):
    os.makedirs(directory, exist_ok=True)

# Pretrained tokenizer and a sequence classifier configured with two labels,
# both loaded from the same checkpoint name.
checkpoint = 'albert-base-v2'
tokenizer = AlbertTokenizer.from_pretrained(checkpoint)
model = AlbertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 512 tokens.

    Args:
        examples: Batch mapping that contains a 'text' entry.

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Convert each split to a Dataset and tokenize it in batches, one split at a time.
train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize_function, batched=True)

# Fine-tuning configuration: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    logging_dir=logging_dir,
    logging_steps=10,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
    # Without a limit, every one of the 10 per-epoch checkpoints stays in
    # output_dir on Google Drive. Older checkpoints are pruned instead; per the
    # Transformers docs the best checkpoint is always retained when
    # load_best_model_at_end is enabled.
    save_total_limit=2,
    load_best_model_at_end=True
)

# Train on the training split and evaluate on the validation split each epoch;
# compute_metrics adds accuracy, F1, precision and recall to the reported loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Run fine-tuning, then write the model followed by the tokenizer to output_dir.
trainer.train()
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")

# Score the held-out test split with the same metrics used during training.
test_results = trainer.evaluate(test_dataset)
print(f"Test Results: {test_results}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4930 [00:00<?, ? examples/s]

Map:   0%|          | 0/1056 [00:00<?, ? examples/s]

Map:   0%|          | 0/1057 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5066,0.449659,0.786932,0.642289,0.617737,0.668874
2,0.5189,0.4441,0.773674,0.623622,0.594595,0.655629
3,0.3813,0.441807,0.797348,0.577075,0.715686,0.483444
4,0.4734,0.466982,0.767992,0.624809,0.581197,0.675497
5,0.2886,0.451144,0.805871,0.594059,0.738916,0.496689
6,0.405,0.423808,0.797348,0.625874,0.662963,0.592715
7,0.3821,0.425784,0.804924,0.633452,0.684615,0.589404


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5066,0.449659,0.786932,0.642289,0.617737,0.668874
2,0.5189,0.4441,0.773674,0.623622,0.594595,0.655629
3,0.3813,0.441807,0.797348,0.577075,0.715686,0.483444
4,0.4734,0.466982,0.767992,0.624809,0.581197,0.675497
5,0.2886,0.451144,0.805871,0.594059,0.738916,0.496689
6,0.405,0.423808,0.797348,0.625874,0.662963,0.592715
7,0.3821,0.425784,0.804924,0.633452,0.684615,0.589404
8,0.3922,0.416549,0.80303,0.647458,0.663194,0.63245
9,0.3372,0.426673,0.785985,0.628289,0.624183,0.63245
10,0.3602,0.438371,0.784091,0.592857,0.643411,0.549669


Model saved to /content/drive/My Drive/Colab Model Outputs/albert/results


Test Results: {'eval_loss': 0.40960749983787537, 'eval_accuracy': 0.793755912961211, 'eval_f1': 0.6148409893992934, 'eval_precision': 0.6214285714285714, 'eval_recall': 0.6083916083916084, 'eval_runtime': 41.3586, 'eval_samples_per_second': 25.557, 'eval_steps_per_second': 0.822, 'epoch': 10.0}
