In [None]:
# Environment setup: upgrade spaCy and pandas, then download the en_core_web_sm spaCy model.
# NOTE(review): the recorded output of this cell shows the pandas upgrade conflicting with
# the versions required by google-colab and cudf-cu12 — consider pinning instead of `-U`.
# NOTE(review): no later cell visible here imports spaCy — confirm it is still needed.
!pip install -U spacy pandas
!python -m spacy download en_core_web_sm



Collecting pandas
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.0.3
    Uninstalling pandas-2.0.3:
      Successfully uninstalled pandas-2.0.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pandas<2.2.2dev0,>=2.0, but you have pandas 2.2.2 which is incompatible.
google-colab 1.0.0 requires pandas==2.0.3, but you have pandas 2.2.2 which is incompatible.[0m[31m
[0mSuccessfully installed pandas-2.2.2
Collecting en-core-web-sm==3.7.1
  Downloading htt

In [None]:
import os

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments

# Load the raw transactions and the (optional) keyword -> category rule table.
transactions_df = pd.read_excel('/content/Book1.xlsx')  # Ensure file path is correct
keyword_mapping_path = '/content/categories_and_keywords.csv'
if os.path.exists(keyword_mapping_path):
    keyword_mapping_df = pd.read_csv(keyword_mapping_path)
else:
    # No rule file yet: start from an empty table with the expected columns.
    keyword_mapping_df = pd.DataFrame(columns=['Keywords', 'Category'])

# Category name -> integer label id, numbered in order of first appearance.
# Blank CSV cells load as NaN; drop them so NaN never becomes a class of its own.
category_mapping = {
    category: label_id
    for label_id, category in enumerate(keyword_mapping_df['Category'].dropna().unique())
}

# Keyword -> category lookup, built only from rows where both cells are filled in
# (a NaN keyword cannot be used in a substring test, a NaN category is not a label).
keyword_to_category = (
    keyword_mapping_df
    .dropna(subset=['Keywords', 'Category'])
    .set_index('Keywords')['Category']
    .to_dict()
)

def categorize_description(description, keyword_to_category):
    """Return the category of the first keyword found in ``description``.

    Matching is a case-insensitive substring test. Keywords are tried in the
    mapping's iteration order and the first hit wins.

    Args:
        description: Free-text transaction description. Non-string values
            (for example NaN from an empty spreadsheet cell) never match.
        keyword_to_category: Mapping of keyword -> category name.

    Returns:
        The matched category, or ``'UNKNOWN'`` when nothing matches.
    """
    if not isinstance(description, str):
        return 'UNKNOWN'

    haystack = description.lower()
    for keyword, category in keyword_to_category.items():
        # Skip blank / non-text keywords: '' is a substring of every string and
        # a float NaN would raise TypeError in the `in` test.
        if not isinstance(keyword, str) or not keyword.strip():
            continue
        # Lower-case the keyword too, otherwise a rule such as 'Uber' can never
        # match the lower-cased description.
        if keyword.lower() in haystack:
            return category
    return 'UNKNOWN'

# Rule-based first pass: every transaction gets the category of the first
# matching keyword, or 'UNKNOWN' when no rule applies.
transactions_df['initial_category'] = transactions_df['Description'].apply(
    categorize_description, args=(keyword_to_category,)
)

# The fallback class needs a label id of its own, appended after the known ones.
if 'UNKNOWN' not in category_mapping:
    new_category_id = len(category_mapping)
    category_mapping['UNKNOWN'] = new_category_id

# Make sure the persisted rule table also lists the 'UNKNOWN' category,
# then write the table back to disk.
if 'UNKNOWN' not in keyword_mapping_df['Category'].values:
    unknown_row = pd.DataFrame({'Keywords': ['unknown'], 'Category': ['UNKNOWN']})
    keyword_mapping_df = pd.concat([keyword_mapping_df, unknown_row], ignore_index=True)

keyword_mapping_df.to_csv(keyword_mapping_path, index=False)

# Translate category names into the integer ids the classifier trains on.
transactions_df['labels'] = transactions_df['initial_category'].map(category_mapping)

# One output unit per known category (including 'UNKNOWN').
num_labels = len(category_mapping)
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=num_labels)

# Tokenize function
def tokenize_function(texts):
    """Tokenize text for XLNet, padded/truncated to 128 tokens, as PyTorch tensors.

    Args:
        texts: A single string or a sequence of strings. Within a sequence,
            missing values (``None`` / float NaN, e.g. empty spreadsheet
            cells) become ``''`` and other non-text scalars are converted
            with ``str()`` so one bad cell does not abort tokenization.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the cleaned input.
    """
    if isinstance(texts, str):
        cleaned = texts
    else:
        cleaned = []
        for text in texts:
            if isinstance(text, (str, list, tuple)):
                # Plain text, or an already structured entry: pass through untouched.
                cleaned.append(text)
            elif text is None or (isinstance(text, float) and text != text):
                # NaN is the only float that is not equal to itself.
                cleaned.append('')
            else:
                cleaned.append(str(text))
    return tokenizer(cleaned, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

# Pull the model inputs (raw descriptions) and targets (integer label ids)
# out of the frame as plain Python lists, kept in the same row order.
texts = transactions_df['Description'].tolist()
labels = transactions_df['labels'].tolist()

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels,
    test_size=0.2,
    random_state=42
)

# Tokenize each split once up front via tokenize_function.
train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)

# Create dataset class
class TransactionDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping of field name to a per-example sequence — either
            tensors (as produced with ``return_tensors="pt"``) or nested lists.
        labels: One label per example, in the same order as ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``'labels'`` entry."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``."""
        # torch.tensor(<tensor>) copies but emits a UserWarning on every call;
        # clone().detach() performs the same copy without the warning.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

# Wrap the tokenized splits so the Trainer can index them example by example.
train_dataset = TransactionDataset(train_encodings, train_labels)
test_dataset = TransactionDataset(test_encodings, test_labels)

# Training configuration: checkpoints go to ./results, logs to ./logs.
# Evaluation and checkpointing are both set to run once per epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
# NOTE(review): `load_best_model_at_end` is documented to need matching save and
# evaluation strategies — keep the two in sync if either is changed.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=1,  # Adjust epochs as needed
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir='./logs',
    logging_steps=10,
)

# The held-out split is used as the evaluation set during and after training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the classifier on the keyword-derived labels.
trainer.train()

# Report metrics on the held-out split.
eval_results = trainer.evaluate()
print(eval_results)

def predict_and_update(description, max_attempts=8):
    """Categorise one transaction description, asking the user as a last resort.

    Resolution order:
      1. keyword rules in ``keyword_to_category`` (case-insensitive substring match);
      2. the fine-tuned XLNet classifier (``model``);
      3. if the classifier answers ``'UNKNOWN'``, an interactive prompt.

    A category typed by the user is remembered as a keyword rule (the
    lower-cased description) and written to ``keyword_mapping_path``. When the
    category did not exist before, ``model`` and ``trainer`` are rebuilt with
    one extra label and retrained on all transactions plus this example.

    Args:
        description: Transaction text. Non-string or blank values yield
            ``'UNKNOWN'`` without consulting the model.
        max_attempts: The user is prompted at most ``max_attempts - 1`` times
            (a blank answer triggers the next prompt); after that the function
            gives up and returns ``'UNKNOWN'``.

    Returns:
        The category name, or ``'UNKNOWN'``.
    """
    global model, trainer  # both are replaced when a new category is added

    # NaN / numeric cells cannot be lower-cased or tokenized.
    if not isinstance(description, str) or not description.strip():
        return 'UNKNOWN'

    normalized = description.lower()

    # 1. Keyword rules take precedence over the model.
    for keyword, category in keyword_to_category.items():
        # Blank keywords would match everything; non-text keywords would raise.
        if isinstance(keyword, str) and keyword.strip() and keyword.lower() in normalized:
            return category

    # 2. Model prediction. Put the model in eval mode (it is left in train mode
    # after a retrain), skip gradient tracking, and move the inputs to the
    # model's device (the Trainer may have placed the model on a GPU).
    # Truncation mirrors the 128-token limit used for the training data.
    model.eval()
    inputs = tokenizer(description, truncation=True, max_length=128, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()

    # Convert the predicted id back to its category name.
    id_to_category = {label_id: name for name, label_id in category_mapping.items()}
    predicted_category = id_to_category.get(prediction, 'UNKNOWN')
    if predicted_category != 'UNKNOWN':
        return predicted_category

    # 3. Ask the user; the final attempt is reserved for giving up.
    for attempt in range(max_attempts - 1):
        print(f"Attempt {attempt + 1}/{max_attempts}: Unknown category for description: '{description}'.")
        new_category = input("Please enter the category: ").strip()
        if not new_category:
            # A blank answer must not become a category named ''.
            continue

        is_new_category = new_category not in category_mapping

        # Remember the answer as a rule. The key is lower-cased because rules
        # are matched against the lower-cased description.
        # NOTE(review): the positional row assignment assumes the table's
        # columns are exactly ['Keywords', 'Category'] in that order.
        keyword_to_category[normalized] = new_category
        keyword_mapping_df.loc[len(keyword_mapping_df)] = [normalized, new_category]
        keyword_mapping_df.to_csv(keyword_mapping_path, index=False)

        if is_new_category:
            new_category_id = len(category_mapping)
            category_mapping[new_category] = new_category_id

            # A fresh classification head sized for the enlarged label set.
            model = XLNetForSequenceClassification.from_pretrained(
                'xlnet-base-cased', num_labels=len(category_mapping)
            )

            new_texts = transactions_df['Description'].tolist() + [description]
            new_labels = transactions_df['labels'].tolist() + [new_category_id]
            retrain_dataset = TransactionDataset(tokenize_function(new_texts), new_labels)

            # The existing Trainer still wraps the previous model, whose output
            # layer has no unit for the new label id. Build a new Trainer around
            # the new model, reusing the same arguments and evaluation set.
            trainer = Trainer(
                model=model,
                args=trainer.args,
                train_dataset=retrain_dataset,
                eval_dataset=trainer.eval_dataset,
            )
            trainer.train()

        return new_category

    return 'UNKNOWN'

# Categorise every transaction (may prompt the user for descriptions the model
# cannot place).
transactions_df['Category'] = transactions_df['Description'].apply(predict_and_update)

# Save updated Excel file with all original columns plus 'Category'.
# 'initial_category' and 'labels' are intermediate training columns added
# earlier in this notebook, so they are left out of the export;
# errors='ignore' keeps this working if they were never created.
output_excel_path = '/content/updated_transactions.xlsx'
transactions_df.drop(columns=['initial_category', 'labels'], errors='ignore').to_excel(
    output_excel_path, index=False
)
print(f"Updated Excel file saved at: {output_excel_path}")


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,3.0121,0.66884


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.6688399910926819, 'eval_runtime': 38.8047, 'eval_samples_per_second': 1.881, 'eval_steps_per_second': 0.052, 'epoch': 1.0}
Updated Excel file saved at: /content/updated_transactions.xlsx
