In [1]:
import torch

# Smoke test: build a small tensor on the GPU when CUDA is usable
if not torch.cuda.is_available():
    print("CUDA not available.")
else:
    device = torch.device("cuda")
    x = torch.tensor([1.0, 2.0, 3.0], device=device)
    print("Tensor on GPU:", x)

CUDA not available.


In [2]:
import pandas as pd

# Parameters
file_path = "Content-Summary_CleanData.csv"  # Replace with your file path
chunksize = 10_000  # Number of rows per chunk
# Upper bound on the total number of rows kept for training
rows_to_read = 100_000

# Initialize variables
data_chunks = []  # list of DataFrames, one per chunk that was kept
rows_loaded = 0   # number of rows actually kept so far

# Read the file chunk by chunk. The reader is used as a context manager so the
# underlying file handle is closed even though the loop usually exits early.
if rows_to_read > 0:
    with pd.read_csv(file_path, chunksize=chunksize, encoding="utf-8") as reader:
        for chunk in reader:
            remaining_rows = rows_to_read - rows_loaded

            # Trim only the chunk that would overshoot the row budget
            if len(chunk) > remaining_rows:
                kept = chunk.iloc[:remaining_rows]
            else:
                kept = chunk
            data_chunks.append(kept)

            # Count what was kept, not what was read, so the counter stays exact
            rows_loaded += len(kept)

            # Stop right away once the budget is filled instead of parsing one
            # more chunk only to discard it
            if rows_loaded >= rows_to_read:
                break

# Combine chunks into a DataFrame (pd.concat raises on an empty list, so fall
# back to an empty frame when nothing was loaded)
df = pd.concat(data_chunks, ignore_index=True) if data_chunks else pd.DataFrame()

print(f"Loaded {len(df)} rows into memory.")

Loaded 100000 rows into memory.


In [3]:
from transformers import BartTokenizer

# Build the BART tokenizer from the pretrained checkpoint name
bart_checkpoint = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(bart_checkpoint)

# Preprocess function for tokenization


def preprocess_chunk(chunk, max_input_length=1024, max_target_length=128):
    """Tokenize one DataFrame chunk into model inputs and training labels.

    Args:
        chunk: DataFrame with a "Content" column (source text) and a
            "Summary" column (target text).
        max_input_length: Truncation length for the tokenized source text.
        max_target_length: Truncation length and fixed padded width for the
            tokenized target text.

    Returns:
        The tokenizer output for the source texts, with an added "labels"
        entry holding the target token ids. Padding positions in the labels
        are set to -100.
    """
    inputs = chunk["Content"].tolist()
    targets = chunk["Summary"].tolist()

    # Tokenize the source texts, padded to the longest sequence in this chunk
    model_inputs = tokenizer(inputs, max_length=max_input_length,
                             truncation=True, padding=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context.
    # Labels are padded to a fixed width so rows coming from different chunks
    # always have the same label length and can share a batch.
    labels = tokenizer(text_target=targets, max_length=max_target_length,
                       truncation=True, padding="max_length")

    # -100 is the index the loss ignores; without this the model would also be
    # trained to predict padding tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in row]
        for row in labels["input_ids"]
    ]
    return model_inputs


# Run the tokenizer over every loaded chunk, keeping the results in order
tokenized_data = []
for chunk_number, frame in enumerate(data_chunks, start=1):
    print(f"Processing chunk {chunk_number}...")
    tokenized_data.append(preprocess_chunk(frame))

print("Tokenization complete.")

  from .autonotebook import tqdm as notebook_tqdm


Processing chunk 1...




Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...
Processing chunk 10...
Tokenization complete.


In [4]:
from datasets import Dataset

# Persist every tokenized chunk as its own on-disk Dataset
for chunk_number, encoded in enumerate(tokenized_data, start=1):
    print(f"Converting chunk {chunk_number} to Dataset format...")
    save_dir = f"tokenized_chunk_{chunk_number}"

    # Build the Dataset from the tokenizer output, then write it to its directory
    Dataset.from_dict(encoded).save_to_disk(save_dir)
    print(f"Saved {save_dir} to disk.")

Converting chunk 1 to Dataset format...


Saving the dataset (1/1 shards): 100%|██████████| 10000/10000 [00:00<00:00, 215058.32 examples/s]


Saved tokenized_chunk_1 to disk.
Converting chunk 2 to Dataset format...


Saving the dataset (1/1 shards): 100%|██████████| 10000/10000 [00:00<00:00, 260207.08 examples/s]


Saved tokenized_chunk_2 to disk.
Converting chunk 3 to Dataset format...


Saving the dataset (1/1 shards): 100%|██████████| 10000/10000 [00:00<00:00, 252340.57 examples/s]


Saved tokenized_chunk_3 to disk.
Converting chunk 4 to Dataset format...


Saving the dataset (1/1 shards): 100%|██████████| 10000/10000 [00:00<00:00, 279814.27 examples/s]


Saved tokenized_chunk_4 to disk.
Converting chunk 5 to Dataset format...


Saving the dataset (1/1 shards): 100%|██████████| 10000/10000 [00:00<00:00, 277733.53 examples/s]


Saved tokenized_chunk_5 to disk.
Converting chunk 6 to Dataset format...


Saving the dataset (1/1 shards): 100%|██████████| 10000/10000 [00:00<00:00, 273004.46 examples/s]


Saved tokenized_chunk_6 to disk.
Converting chunk 7 to Dataset format...


Saving the dataset (1/1 shards): 100%|██████████| 10000/10000 [00:00<00:00, 129870.29 examples/s]


Saved tokenized_chunk_7 to disk.
Converting chunk 8 to Dataset format...


Saving the dataset (1/1 shards): 100%|██████████| 10000/10000 [00:00<00:00, 217321.45 examples/s]


Saved tokenized_chunk_8 to disk.
Converting chunk 9 to Dataset format...


Saving the dataset (1/1 shards): 100%|██████████| 10000/10000 [00:00<00:00, 244990.10 examples/s]


Saved tokenized_chunk_9 to disk.
Converting chunk 10 to Dataset format...


Saving the dataset (1/1 shards): 100%|██████████| 10000/10000 [00:00<00:00, 258875.69 examples/s]

Saved tokenized_chunk_10 to disk.





In [5]:
from datasets import load_from_disk, concatenate_datasets

# Read the saved chunks back from disk, one directory at a time
train_chunks = []
chunk_count = len(tokenized_data)  # Adjust to the number of saved chunks if needed
for chunk_number in range(1, chunk_count + 1):
    chunk_dir = f"tokenized_chunk_{chunk_number}"
    print(f"Loading {chunk_dir}...")
    train_chunks.append(load_from_disk(chunk_dir))

# Merge the per-chunk datasets into one training dataset
train_dataset = concatenate_datasets(train_chunks)

print("All tokenized chunks loaded and combined into a single training dataset.")

Loading tokenized_chunk_1...
Loading tokenized_chunk_2...
Loading tokenized_chunk_3...
Loading tokenized_chunk_4...
Loading tokenized_chunk_5...
Loading tokenized_chunk_6...
Loading tokenized_chunk_7...
Loading tokenized_chunk_8...
Loading tokenized_chunk_9...
Loading tokenized_chunk_10...
All tokenized chunks loaded and combined into a single training dataset.


In [6]:
from transformers import BartForConditionalGeneration, TrainingArguments, Trainer

# Load the BART model
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

# Training configuration.
# No evaluation strategy is passed: "no" is the default, so behavior is the
# same as before, and omitting the keyword avoids the deprecated
# `evaluation_strategy` name (newer releases use `eval_strategy`).
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,       # keep at most two checkpoints on disk
    logging_dir="./logs",
    save_strategy="epoch",    # write a checkpoint at the end of every epoch
)

# Generate predictions manually if needed


def compute_metrics(eval_pred):
    """Decode predictions and labels and report their exact-match rate.

    Args:
        eval_pred: A `(predictions, labels)` pair. `predictions` may be token
            ids, raw logits, or a tuple whose first element is one of those.
            `labels` may contain -100 at ignored positions.

    Returns:
        dict with "exact_match": the fraction of examples whose decoded
        prediction equals the decoded label after stripping whitespace
        (0.0 when there are no examples).
    """
    import numpy as np  # local import so the cell does not rely on other cells

    predictions, labels = eval_pred

    # Model outputs can arrive as a tuple; the first element is used here.
    # NOTE(review): assumes element 0 holds the logits/token ids — confirm for
    # the model in use.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    # A 3-D array is treated as per-token scores over the vocabulary and
    # reduced to token ids; 2-D input is assumed to already be token ids.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)
    labels = np.asarray(labels)

    # Negative values (e.g. -100 for ignored positions) are not valid token
    # ids, so swap them for the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions < 0, pad_id, predictions)
    labels = np.where(labels < 0, pad_id, labels)

    decoded_preds = tokenizer.batch_decode(
        predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Simple built-in metric; ROUGE, BLEU, etc. can be added to this dict.
    pairs = list(zip(decoded_preds, decoded_labels))
    matches = sum(pred.strip() == ref.strip() for pred, ref in pairs)
    return {"exact_match": matches / len(pairs) if pairs else 0.0}


# Imported here so this cell carries its own dependency
from transformers import DataCollatorForSeq2Seq

# Seq2seq-aware collator: pads the inputs per batch and pads labels with -100,
# so rows whose label lengths differ can still be batched together.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # `processing_class` is the replacement for the deprecated `tokenizer`
    # keyword (on transformers older than 4.46, pass `tokenizer=tokenizer`).
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


# Train the model
trainer.train()




  trainer = Trainer(


Step,Training Loss


KeyboardInterrupt: 

In [None]:
# Evaluate the model.
# trainer.evaluate() needs an evaluation dataset; when the Trainer was built
# without one, skip evaluation instead of raising before the model is saved.
if trainer.eval_dataset is not None:
    metrics = trainer.evaluate()
    print("Evaluation Metrics:", metrics)
else:
    print("Skipping evaluation: the Trainer has no eval_dataset.")

# Save the fine-tuned model and its tokenizer to the same directory
model.save_pretrained("./fine_tuned_bart")
tokenizer.save_pretrained("./fine_tuned_bart")