<a href="https://colab.research.google.com/github/MarioOrtega78/All-functionality-in-place/blob/main/moat_transfomers_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task
Build a transformer binary classification system to determine if companies have a competitive advantage (moat) from the business excerpts of their 10-K filings using the data from the "moat_data.xlsx" file.

## Load and preprocess data

### Subtask:
Load the data from your Excel document, handle potential missing values, and prepare the text data for model training.


**Reasoning**:
Load the data from the Excel file into a pandas DataFrame and display the first few rows and information about the DataFrame, including missing values.



In [None]:
import pandas as pd

# Location of the labelled excerpt workbook inside the Colab runtime
file_path = "/content/1000_moat_dataset.csv.xlsx"

try:
    df = pd.read_excel(file_path)
    print("DataFrame loaded successfully.")
    # Preview the first rows, then the column dtypes and non-null counts
    display(df.head())
    df.info()
except Exception as e:
    # A single handler reports every failure; a missing file gets its own message
    if isinstance(e, FileNotFoundError):
        print(f"Error: File not found at {file_path}")
    else:
        print(f"An error occurred: {e}")

DataFrame loaded successfully.


Unnamed: 0,text,moat
0,"Coca?Cola Consolidated, Inc., a Delaware corpo...",1
1,The automotive finance industry is highly comp...,0
2,The Company’s ability to compete successfully ...,1
3,The nonalcoholic beverage industry is highly c...,0
4,We are subject to changes in general economic ...,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    999 non-null    object
 1   moat    999 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 15.7+ KB


In [None]:
# Drop rows whose excerpt is missing. Casting with `astype(str)` alone would
# turn a missing value into the literal text "nan", which would then be
# tokenized and trained on as if it were a real document.
df = df.dropna(subset=['text']).reset_index(drop=True)

# Handle missing values in the 'moat' column (fill with 0) and force an integer
# dtype, because a column that held NaN is stored as float until it is cast.
df['moat'] = df['moat'].fillna(0).astype('int64')

# Convert the 'text' column to string type
df['text'] = df['text'].astype(str)

print("\nDataFrame after handling missing values and converting 'text' column:")
df.info()


DataFrame after handling missing values and converting 'text' column:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    999 non-null    object
 1   moat    999 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 15.7+ KB


In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the raw excerpts; the target is the 'moat' column
X, y = df['text'], df['moat']

# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Data splitting complete.")
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

NameError: name 'df' is not defined

In [None]:
!pip install transformers torch

Collecting transformers
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.53.2-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m81.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
Successfully installed transformers-4.53.2


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint to fine-tune. Other checkpoints such as 'roberta-base' or
# 'distilbert-base-uncased' can be substituted here.
model_name = "bert-base-uncased"

# Tokenizer and sequence-classification model are loaded from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # two output classes for the binary moat / no-moat task
)

print(f"Tokenizer and model '{model_name}' loaded successfully.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizer and model 'bert-base-uncased' loaded successfully.


## Fine-tune the transformer model

### Subtask:
Fine-tune the chosen transformer model on your prepared dataset for the binary classification task.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from transformers import get_linear_schedule_with_warmup

# `AdamW` can no longer be imported from `transformers.optimization` with the
# installed transformers release (it raises ImportError), so PyTorch's own
# implementation is used first; the legacy location is kept only as a fallback.
try:
    from torch.optim import AdamW
except ImportError:
    from transformers.optimization import AdamW

# Define a custom dataset
class MoatDataset(Dataset):
    """Dataset that tokenizes one text excerpt per lookup and pairs it with its label.

    Args:
        texts: Indexable collection of excerpts; each item is coerced with ``str``.
        labels: Indexable collection of labels aligned with ``texts``; each item
            is coerced with ``int``.
        tokenizer: Object exposing ``encode_plus``. In this notebook it is the
            tokenizer returned by ``AutoTokenizer.from_pretrained``.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of excerpts in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Encode the excerpt at position ``item``.

        Returns:
            dict with keys ``'text'`` (the excerpt as ``str``), ``'input_ids'``
            and ``'attention_mask'`` (flattened tensors from the tokenizer) and
            ``'labels'`` (scalar ``torch.long`` tensor).
        """
        text = str(self.texts[item])
        label = int(self.labels[item])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # `pad_to_max_length=True` is deprecated; `padding='max_length'` is
            # the supported way to pad every sequence to `max_length`.
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            # The tokenizer returns a leading batch axis of size 1 when asked for
            # 'pt' tensors; flatten so the DataLoader can stack the samples.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Hyperparameters for the fine-tuning run
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5

# Wrap each split in a MoatDataset and batch it; only the training loader shuffles
train_dataset = MoatDataset(X_train.tolist(), y_train.tolist(), tokenizer, MAX_LEN)
test_dataset = MoatDataset(X_test.tolist(), y_test.tolist(), tokenizer, MAX_LEN)

train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# AdamW with a linear schedule (zero warmup steps) that is stepped once per batch,
# hence total steps = batches per epoch * epochs
total_steps = EPOCHS * len(train_data_loader)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Run on the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Fine-tuning loop
print("Starting fine-tuning...")
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    model.train()
    running_loss = 0.0
    correct_predictions = 0

    for batch in tqdm(train_data_loader):
        # Move the three tensors the model consumes onto the training device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        # Passing `labels` makes the model return the loss alongside the logits
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs.loss, outputs.logits

        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        preds = torch.max(logits, dim=1).indices
        correct_predictions += (preds == labels).sum()

    # Loss is averaged over batches, accuracy over individual training samples
    epoch_loss = running_loss / len(train_data_loader)
    epoch_acc = correct_predictions.double() / len(train_dataset)

    print(f'Train Loss: {epoch_loss:.4f} Accuracy: {epoch_acc:.4f}')

print("\nFine-tuning complete.")

ImportError: cannot import name 'AdamW' from 'transformers.optimization' (/usr/local/lib/python3.11/dist-packages/transformers/optimization.py)

In [None]:
!pip uninstall transformers -y

Found existing installation: transformers 4.53.1
Uninstalling transformers-4.53.1:
  Successfully uninstalled transformers-4.53.1


In [None]:
import pandas as pd

# Location of the labelled excerpt workbook inside the Colab runtime
file_path = "/content/1000_moat_dataset.csv.xlsx"

try:
    df = pd.read_excel(file_path)
    print("DataFrame loaded successfully.")
    # Preview the first rows, then the column dtypes and non-null counts
    display(df.head())
    df.info()
except Exception as e:
    # A single handler reports every failure; a missing file gets its own message
    if isinstance(e, FileNotFoundError):
        print(f"Error: File not found at {file_path}")
    else:
        print(f"An error occurred: {e}")

DataFrame loaded successfully.


Unnamed: 0,text,moat
0,"Coca?Cola Consolidated, Inc., a Delaware corpo...",1
1,The automotive finance industry is highly comp...,0
2,The Company’s ability to compete successfully ...,1
3,The nonalcoholic beverage industry is highly c...,0
4,We are subject to changes in general economic ...,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    999 non-null    object
 1   moat    999 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 15.7+ KB


In [None]:
# Drop rows whose excerpt is missing. Casting with `astype(str)` alone would
# turn a missing value into the literal text "nan", which would then be
# tokenized and trained on as if it were a real document.
df = df.dropna(subset=['text']).reset_index(drop=True)

# Handle missing values in the 'moat' column (fill with 0) and force an integer
# dtype, because a column that held NaN is stored as float until it is cast.
df['moat'] = df['moat'].fillna(0).astype('int64')

# Convert the 'text' column to string type
df['text'] = df['text'].astype(str)

print("\nDataFrame after handling missing values and converting 'text' column:")
df.info()


DataFrame after handling missing values and converting 'text' column:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    999 non-null    object
 1   moat    999 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 15.7+ KB


In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the raw excerpts; the target is the 'moat' column
X, y = df['text'], df['moat']

# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Data splitting complete.")
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

NameError: name 'df' is not defined

In [None]:
!pip install transformers torch



In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint to fine-tune. Other checkpoints such as 'roberta-base' or
# 'distilbert-base-uncased' can be substituted here.
model_name = "bert-base-uncased"

# Tokenizer and sequence-classification model are loaded from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # two output classes for the binary moat / no-moat task
)

print(f"Tokenizer and model '{model_name}' loaded successfully.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizer and model 'bert-base-uncased' loaded successfully.


## Fine-tune the transformer model

### Subtask:
Fine-tune the chosen transformer model on your prepared dataset for the binary classification task.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from tqdm.notebook import tqdm

# Define a custom dataset
class MoatDataset(Dataset):
    """Torch dataset that pairs each text excerpt with its integer label.

    Every lookup tokenizes one excerpt with ``tokenizer.encode_plus``, padding
    or truncating it to ``max_len`` tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of excerpts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the raw text, flattened token ids, attention mask and label tensor."""
        text = str(self.texts[item])
        label = int(self.labels[item])

        # Options forwarded unchanged to the tokenizer
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

        sample = {'text': text}
        # Flatten both tokenizer outputs to one dimension per sample
        for key in ('input_ids', 'attention_mask'):
            sample[key] = encoded[key].flatten()
        sample['labels'] = torch.tensor(label, dtype=torch.long)
        return sample

# Hyperparameters for the fine-tuning run
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5

# Wrap each split in a MoatDataset and batch it; only the training loader shuffles
train_dataset = MoatDataset(X_train.tolist(), y_train.tolist(), tokenizer, MAX_LEN)
test_dataset = MoatDataset(X_test.tolist(), y_test.tolist(), tokenizer, MAX_LEN)

train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# AdamW with a linear schedule (zero warmup steps) that is stepped once per batch,
# hence total steps = batches per epoch * epochs
total_steps = EPOCHS * len(train_data_loader)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Run on the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Fine-tuning loop
print("Starting fine-tuning...")
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    model.train()
    running_loss = 0.0
    correct_predictions = 0

    for batch in tqdm(train_data_loader):
        # Move the three tensors the model consumes onto the training device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        # Passing `labels` makes the model return the loss alongside the logits
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs.loss, outputs.logits

        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        preds = torch.max(logits, dim=1).indices
        correct_predictions += (preds == labels).sum()

    # Loss is averaged over batches, accuracy over individual training samples
    epoch_loss = running_loss / len(train_data_loader)
    epoch_acc = correct_predictions.double() / len(train_dataset)

    print(f'Train Loss: {epoch_loss:.4f} Accuracy: {epoch_acc:.4f}')

print("\nFine-tuning complete.")

NameError: name 'X_train' is not defined

## Evaluate the model

### Subtask:
Evaluate the fine-tuned model's performance on the testing set using appropriate metrics.

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix
import numpy as np

# Set the model to evaluation mode
model.eval()

predictions = []      # hard class predictions (argmax over the logits)
true_labels = []      # ground-truth labels, in loader order
positive_probs = []   # softmax probability of class 1, used as the ROC-AUC score

# Predict on the test set without tracking gradients
with torch.no_grad():
    for d in test_data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        _, preds = torch.max(logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
        positive_probs.extend(probs[:, 1].cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')
# ROC-AUC ranks samples by a continuous score. Feeding it the thresholded 0/1
# predictions collapses the curve to a single operating point, so the
# positive-class probability is passed instead.
auc_roc = roc_auc_score(true_labels, positive_probs)
conf_matrix = confusion_matrix(true_labels, predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC-ROC: {auc_roc:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)

NameError: name 'torch' is not defined