In [10]:
# Install necessary libraries
#!pip install transformers torch pandas scikit-learn openpyxl

import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import TrainingArguments, Trainer
import numpy as np

# Load data
transactions_df = pd.read_excel('/content/Book1.xlsx')  # Ensure file path is correct
keyword_mapping_df = pd.read_csv('/content/categories_and_keywords.csv')  # Ensure file path is correct

# Prepare category mapping
category_mapping = {category: i for i, category in enumerate(keyword_mapping_df['Category'].unique())}
keyword_to_category = keyword_mapping_df.set_index('Keywords')['Category'].to_dict()

def categorize_description(description, keyword_to_category):
    description = description.lower()
    for keyword, category in keyword_to_category.items():
        if keyword in description:
            return category
    return 'UNKNOWN'

# Add a column for initial categories based on keywords
transactions_df['initial_category'] = transactions_df['Description'].apply(lambda x: categorize_description(x, keyword_to_category))

# Include 'UNKNOWN' in the category mapping if not present
if 'UNKNOWN' not in category_mapping:
    new_category_id = len(category_mapping)
    category_mapping['UNKNOWN'] = new_category_id

# Update the keyword mapping with the new 'UNKNOWN' category
keyword_mapping_df = pd.concat([
    keyword_mapping_df,
    pd.DataFrame({'Keyword': ['unknown'], 'Category': ['UNKNOWN']})
], ignore_index=True)

# Save updated keyword mapping
keyword_mapping_df.to_csv('/content/updated_categories_and_keywords.csv', index=False)

# Convert initial categories to numeric labels
transactions_df['labels'] = transactions_df['initial_category'].map(category_mapping)

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(category_mapping))

# Tokenize function
def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

# Prepare text data
texts = transactions_df['Description'].tolist()
labels = transactions_df['labels'].tolist()

# Split data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels,
    test_size=0.2,
    random_state=42
)

# Tokenize data
train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)

# Create dataset class
class TransactionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TransactionDataset(train_encodings, train_labels)
test_dataset = TransactionDataset(test_encodings, test_labels)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=4,  # Increased epochs for better training
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir='./logs',  # Added logging
    logging_steps=10,      # Adjust logging steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

# Make predictions
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=-1)

# Calculate metrics
accuracy = accuracy_score(test_labels, predicted_labels)
print(f"Accuracy: {accuracy}")
print(classification_report(test_labels, predicted_labels))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,3.8458,1.4807
2,1.2065,0.388853
3,0.4229,0.196248
4,0.2144,0.163847


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.1638467162847519, 'eval_runtime': 30.3491, 'eval_samples_per_second': 2.405, 'eval_steps_per_second': 0.066, 'epoch': 4.0}
Accuracy: 0.9863013698630136
              precision    recall  f1-score   support

           2       1.00      1.00      1.00        57
          14       0.00      0.00      0.00         1
         169       0.94      1.00      0.97        15

    accuracy                           0.99        73
   macro avg       0.65      0.67      0.66        73
weighted avg       0.97      0.99      0.98        73



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
import torch
import os

# Define paths
transactions_path = '/content/Book1.xlsx'
keyword_mapping_path = '/content/categories_and_keywords.csv'
model_path = '/content/finetuned_model'

# Load data
transactions_df = pd.read_excel(transactions_path)
keyword_mapping_df = pd.read_csv(keyword_mapping_path) if os.path.exists(keyword_mapping_path) else pd.DataFrame(columns=['Keywords', 'Category'])

# Prepare category mapping
category_mapping = {category: i for i, category in enumerate(keyword_mapping_df['Category'].unique())}
keyword_to_category = keyword_mapping_df.set_index('Keywords')['Category'].to_dict()

def categorize_description(description, keyword_to_category):
    description = description.lower()
    for keyword, category in keyword_to_category.items():
        if keyword in description:
            return category
    return 'UNKNOWN'

# Add a column for initial categories based on keywords
transactions_df['initial_category'] = transactions_df['Description'].apply(lambda x: categorize_description(x, keyword_to_category))

# Include 'UNKNOWN' in the category mapping if not present
if 'UNKNOWN' not in category_mapping:
    new_category_id = len(category_mapping)
    category_mapping['UNKNOWN'] = new_category_id

# Update the keyword mapping with the new 'UNKNOWN' category
if 'UNKNOWN' not in keyword_mapping_df['Category'].values:
    keyword_mapping_df = pd.concat([
        keyword_mapping_df,
        pd.DataFrame({'Keywords': ['unknown'], 'Category': ['UNKNOWN']})
    ], ignore_index=True)

keyword_mapping_df.to_csv(keyword_mapping_path, index=False)

# Convert initial categories to numeric labels
transactions_df['labels'] = transactions_df['initial_category'].map(category_mapping)

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
num_labels = len(category_mapping)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Tokenize function
def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

# Prepare text data
texts = transactions_df['Description'].tolist()
labels = transactions_df['labels'].tolist()

# Split data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels,
    test_size=0.2,
    random_state=42
)

# Tokenize data
train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)

# Create dataset class
class TransactionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TransactionDataset(train_encodings, train_labels)
test_dataset = TransactionDataset(test_encodings, test_labels)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=0.3,  # Adjust epochs as needed
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

def predict_and_update(description, max_attempts=8):
    global model, category_mapping, keyword_to_category, keyword_mapping_df
    for attempt in range(max_attempts):
        # Check if description matches known keywords
        for keyword, category in keyword_to_category.items():
            if keyword in description.lower():
                return category

        # Perform prediction with BERT
        inputs = tokenizer(description, return_tensors="pt")
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1).item()

        # Convert prediction id to category name
        predicted_category = [k for k, v in category_mapping.items() if v == prediction][0]

        # If prediction is UNKNOWN
        if predicted_category == 'UNKNOWN':
            if attempt < max_attempts - 1:
                print(f"Attempt {attempt + 1}/{max_attempts}: Unknown category for description: '{description}'.")
                new_category = input("Please enter the category: ").strip()

                if new_category not in category_mapping:
                    # Add new category to mappings
                    new_category_id = len(category_mapping)
                    category_mapping[new_category] = new_category_id
                    keyword_to_category[description] = new_category
                    keyword_mapping_df.loc[len(keyword_mapping_df)] = [description, new_category]
                    keyword_mapping_df.to_csv(keyword_mapping_path, index=False)

                    # Update model with new category
                    num_labels = len(category_mapping)
                    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

                    # Prepare updated dataset
                    new_labels = transactions_df['labels'].tolist() + [new_category_id]
                    new_texts = transactions_df['Description'].tolist() + [description]

                    train_encodings = tokenize_function(new_texts)
                    train_dataset = TransactionDataset(train_encodings, new_labels)

                    trainer.train_dataset = train_dataset
                    trainer.train()

                return new_category
            else:
                return 'UNKNOWN'
        return predicted_category

# Apply predictions
transactions_df['Category'] = transactions_df['Description'].apply(predict_and_update)

# Save updated Excel file with only Category column
transactions_df[['Category']].to_excel('/content/updated_transactions.xlsx', index=False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
0,No log,3.867151


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 3.8671514987945557, 'eval_runtime': 31.5896, 'eval_samples_per_second': 2.311, 'eval_steps_per_second': 0.063, 'epoch': 0.3157894736842105}
