In [1]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Load the pretrained "bert-base-uncased" checkpoint into a BERT sequence-classification model.
# NOTE(review): no `num_labels` is passed here, so the size of the classification
# head comes from the checkpoint config / library default — confirm it matches
# the number of classes in the CSV data.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

In [None]:
import os
import random
from sklearn.model_selection import train_test_split
import csv

# Seed Python's RNG for reproducibility. The file split below is seeded
# separately through `random_state`.
random.seed(42)

# Input CSVs are looked up in the current working directory
# (expected to be the directory the notebook lives in).
csv_dir = os.getcwd()

# File names that must never be treated as input:
#   - "product_info.csv" was already excluded by design;
#   - "train_dataset.csv" / "eval_dataset.csv" are written by this notebook into
#     this same directory, so without excluding them a re-run would read its
#     own output back in as training/evaluation data.
excluded_csv_names = {"product_info.csv", "train_dataset.csv", "eval_dataset.csv"}

# os.listdir() returns entries in arbitrary order; sort so that the seeded
# split below picks the same files on every run and on every machine.
csv_files = sorted(
    os.path.join(csv_dir, f)
    for f in os.listdir(csv_dir)
    if f.endswith(".csv") and f not in excluded_csv_names
)

# All remaining CSV files take part in the split.
all_csv_files = csv_files

# A train/eval split needs at least one file on each side; fail early with a
# message that says what is actually wrong.
if len(all_csv_files) < 2:
    raise ValueError(
        f"Need at least 2 input CSV files in {csv_dir} to build a train/eval split, "
        f"found {len(all_csv_files)}"
    )

# Shuffle files and split. NOTE: the split is made over *files*, not rows, so
# `test_size=0.05` is a fraction of the file count; with only a handful of
# files the share of rows that ends up in eval can be far from 5%.
train_files, eval_files = train_test_split(all_csv_files, test_size=0.05, random_state=42)

print(f"Train files found: {len(train_files)}")
print(f"Eval files found: {len(eval_files)}")

def read_csv_files(file_list):
    """Read every row of each CSV file in ``file_list`` into one flat list.

    Each file is parsed with ``csv.DictReader``, so the first line of a file is
    used as its header and every row becomes a dict mapping column name to the
    (string) cell value.

    Reading is best-effort per file: if a file cannot be opened, decoded as
    UTF-8 or parsed, an error line is printed and that file contributes no rows
    at all (rows are only added once the whole file has been read).

    Args:
        file_list: Iterable of paths to CSV files.

    Returns:
        A list of row dicts, in the order of ``file_list`` and then file order.
    """
    data = []
    for file_path in file_list:
        try:
            # newline='' is required by the csv module: without it, universal
            # newline translation rewrites "\r\n" inside quoted fields, so
            # multi-line cell values are not read back faithfully.
            with open(file_path, 'r', encoding='utf-8', newline='') as f:
                # Materialize the whole file first so a failure halfway
                # through does not leave a partial file in `data`.
                rows = list(csv.DictReader(f))
        except (OSError, UnicodeError, csv.Error) as e:
            # Only I/O, decoding and CSV-format problems are treated as
            # "skip this file"; anything else is a programming error and
            # should surface instead of silently shrinking the dataset.
            print(f"Error reading {file_path}: {e}")
        else:
            data.extend(rows)
    return data

# Parse the files of each split into a flat list of row dicts.
train_data = read_csv_files(file_list=train_files)
eval_data = read_csv_files(file_list=eval_files)

# Report how many rows each split contains.
print(f"Train data: {len(train_data)}")
print(f"Evaluation data: {len(eval_data)}")

# Minimal map-style dataset over the parsed CSV rows.
class CSVDataset:
    """Index-addressable wrapper around a sequence of rows.

    By default an item is returned exactly as stored. Rows produced by
    ``csv.DictReader`` are dicts of strings, so an optional ``transform`` can be
    supplied to turn a row into whatever the consumer needs (for example
    tokenized model inputs) at access time.

    NOTE(review): a model trainer will generally need numeric/tensor features
    rather than raw string dicts — confirm whether a ``transform`` is required
    before training with this dataset.

    Args:
        data: Sequence of rows supporting ``len()`` and integer indexing.
        transform: Optional callable applied to a row in ``__getitem__``.
            ``None`` (the default) returns rows unchanged.
    """

    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        """Return the number of rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the row at ``idx``, passed through ``transform`` if one was given."""
        row = self.data[idx]
        if self.transform is None:
            return row
        return self.transform(row)

# Wrap the parsed rows of each split in a dataset object.
train_dataset = CSVDataset(data=train_data)
eval_dataset = CSVDataset(data=eval_data)

# Optional: keep a CSV copy of each split on disk.
import pandas as pd

# Training split -> 'train_dataset.csv' (row index column omitted).
df_train = pd.DataFrame(data=train_data)
df_train.to_csv(path_or_buf='train_dataset.csv', index=False)
print("Train dataset saved to 'train_dataset.csv'")

# Evaluation split -> 'eval_dataset.csv' (row index column omitted).
df_eval = pd.DataFrame(data=eval_data)
df_eval.to_csv(path_or_buf='eval_dataset.csv', index=False)
print("Evaluation dataset saved to 'eval_dataset.csv'")


Train files: 4
Eval files: 1
Train data: 887686
Evaluation data: 206725
Train dataset saved to 'train_dataset.csv'
Evaluation dataset saved to 'eval_dataset.csv'


In [None]:
# Hyperparameters and output locations for the fine-tuning run.
training_args = TrainingArguments(
    # Output directory.
    output_dir='./results',
    # Total number of training epochs.
    num_train_epochs=3,
    # Batch size per device during training.
    per_device_train_batch_size=16,
    # Batch size per device for evaluation.
    per_device_eval_batch_size=64,
    # Number of warmup steps for the learning rate scheduler.
    warmup_steps=500,
    # Strength of weight decay.
    weight_decay=0.01,
    # Directory for storing logs.
    logging_dir='./logs',
)

# Tie together the model, the training arguments and both datasets.
trainer = Trainer(
    # The instantiated Hugging Face Transformers model to be trained.
    model=model,
    # Training arguments, defined above.
    args=training_args,
    # Training dataset.
    train_dataset=train_dataset,
    # Evaluation dataset.
    eval_dataset=eval_dataset,
)

In [None]:
# Training and evaluation are currently disabled: both calls below are commented
# out, so running this cell does nothing. Uncomment them to start the run.
#trainer.train()
#trainer.evaluate()