In [None]:


# Import pandas for data manipulation
import pandas as pd

# --- Load the dataset ---
# **IMPORTANT**: Replace 'financial_sentiment.csv' with the actual filename
#              if you uploaded a file with a different name.
# Read the CSV with pandas' default comma separator, passing ISO-8859-1 as the
# encoding. `df` starts out as None and is only replaced when the read
# succeeds, so later cells can test `df is not None`.
df = None
try:
    df = pd.read_csv('/content/financial_sentiment.csv', encoding='ISO-8859-1')
except FileNotFoundError:
    # If the file lives on Google Drive, mount the drive or point the path
    # above at the right location.
    print("Error: CSV file not found. Make sure the file name is correct and it's uploaded.")
except Exception as e:
    # Any other failure: consider a different separator or encoding, e.g.
    # pd.read_csv('your_file.csv', sep=';', encoding='utf-8')
    print(f"An error occurred: {e}")
else:
    print("CSV loaded successfully.")


# --- Basic Exploration (only if df was loaded successfully) ---
if df is not None:
    # Column names the rest of the notebook relies on. They are defined first
    # so every check below uses these names instead of repeating literals.
    # If your file uses other names, rename them here, e.g.
    # df.rename(columns={'text': 'Sentence', 'label': 'Sentiment'}, inplace=True)
    text_column = 'Sentence'
    label_column = 'Sentiment'

    print("\n--- First 5 Rows ---")
    print(df.head())

    print("\n--- Dataset Info ---")
    df.info()

    # Report a missing column explicitly rather than failing with a bare
    # KeyError on the value-counts lookup below.
    missing_columns = [col for col in (text_column, label_column) if col not in df.columns]
    if missing_columns:
        print(
            f"\nWarning: expected column(s) {missing_columns} not found. "
            f"Available columns: {list(df.columns)}"
        )
    else:
        print("\n--- Sentiment Value Counts ---")
        # Unique values of the label column and how often each occurs.
        print(df[label_column].value_counts())

    print("\n--- Check for Missing Values ---")
    print(df.isnull().sum())

    # If the file carries an extra row-number column (e.g. 'Index'), it can be
    # dropped with: df = df[[text_column, label_column]]

    print(f"\nUsing '{text_column}' as the text column.")
    print(f"Using '{label_column}' as the label column.")




In [None]:
# Display the first rows of the DataFrame as the cell output.
# NOTE: `df` is None when the load cell failed, in which case this line raises.
df.head()

In [None]:
# Display how many rows carry each distinct value of the 'Sentiment' column.
# NOTE: `df` is None when the load cell failed, in which case this line raises.
df['Sentiment'].value_counts()

In [None]:
# Install necessary libraries from Hugging Face
!pip install datasets transformers[torch] accelerate -U

# --- Now the rest of your imports and code ---
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset, DatasetDict # Hugging Face datasets library
import numpy as np
import pandas as pd # Make sure pandas is imported if not already

# --- Make sure df is loaded and has the correct columns ---
# --- Encode labels, split the data and convert it to Hugging Face datasets ---
# Produces: label2id / id2label (label mappings), train_df / test_df (pandas
# splits) and raw_datasets (DatasetDict with 'train' and 'test').
if 'df' not in locals() or df is None:
    print("Error: DataFrame 'df' not found or is None. Please load the data first.")
else:
    text_column = 'Sentence'   # Make sure this matches your actual column name
    label_column = 'Sentiment' # Make sure this matches your actual column name

    # --- 0. Drop incomplete rows ---
    # A missing label would otherwise become its own class (or break the
    # encoder), and a missing sentence cannot be tokenized later on.
    rows_before = len(df)
    df = df.dropna(subset=[text_column, label_column]).reset_index(drop=True)
    rows_dropped = rows_before - len(df)
    if rows_dropped:
        print(f"Dropped {rows_dropped} row(s) with missing text or label.")

    # --- 1. Label Encoding ---
    label_encoder = LabelEncoder()

    # Fit on the sentiment labels and store the integer codes in a new column.
    df['label'] = label_encoder.fit_transform(df[label_column])

    # Mappings between label strings and integer ids (passed to the model later).
    label2id = {label: idx for idx, label in enumerate(label_encoder.classes_)}
    id2label = {idx: label for label, idx in label2id.items()}

    print("--- Label Encoding ---")
    print("Original Labels:", label_encoder.classes_)
    print("Encoded Labels (sample):", df['label'].head().tolist())
    print("Label to ID mapping:", label2id)
    print("ID to Label mapping:", id2label)

    # Keep only the text and the numeric label, under the names 'text' and 'label'.
    df_processed = df[[text_column, 'label']].rename(columns={text_column: 'text'})

    # --- 2. Split Data into Training and Testing Sets ---
    # Stratifying keeps the label proportions roughly equal in both splits.
    train_df, test_df = train_test_split(
        df_processed,
        test_size=0.2,                  # 20% of the data goes to the test split
        random_state=42,                # Fixed seed for reproducible splits
        stratify=df_processed['label'], # Important for imbalanced datasets
    )

    print("\n--- Data Splitting ---")
    print("Total examples:", len(df_processed))
    print("Training examples:", len(train_df))
    print("Testing examples:", len(test_df))
    print("Training label distribution:\n", train_df['label'].value_counts(normalize=True).sort_index())
    print("Testing label distribution:\n", test_df['label'].value_counts(normalize=True).sort_index())

    # --- 3. Convert pandas DataFrames to Hugging Face Datasets ---
    # The index is reset and dropped so it does not end up as an extra column.
    train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
    test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

    # Bundle both splits into a single DatasetDict.
    raw_datasets = DatasetDict({
        'train': train_dataset,
        'test': test_dataset,
    })

    print("\n--- Hugging Face Dataset Conversion ---")
    print(raw_datasets)

    # Show one training example in the new format.
    print("\nSample from training dataset:")
    print(raw_datasets['train'][0])

In [None]:
from transformers import AutoTokenizer

# --- Choose the model checkpoint ---
# DistilBERT is used here. Other options: 'bert-base-uncased',
# 'ProsusAI/finbert' (if you want a finance-specific model).
model_checkpoint = "distilbert-base-uncased"

# --- Load the tokenizer ---
# AutoTokenizer selects the tokenizer class from the checkpoint name.
# `tokenizer` is set to None on failure so the rest of the cell is skipped.
try:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    print(f"Tokenizer for '{model_checkpoint}' loaded successfully.")
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    tokenizer = None

if tokenizer is not None:
    # --- Define the tokenization function ---
    def tokenize_function(examples):
        """Tokenize the 'text' field of a batch of examples.

        Inputs longer than the model's maximum length are truncated. No padding
        is applied here: padding inside a batched ``map`` would pad every row
        to the longest row of its map batch. Padding is instead left to the
        data collator, which pads each training batch to its own longest
        sequence (the Trainer uses a padding collator when it is given the
        tokenizer).
        """
        return tokenizer(examples["text"], truncation=True)

    # --- Apply the tokenization function to the datasets ---
    # batched=True hands tokenize_function whole batches of examples at once.
    print("\nTokenizing datasets...")
    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
    print("Tokenization complete.")

    # --- Remove the original text column ---
    # After tokenization the model consumes input_ids, attention_mask and labels.
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])

    # --- Rename the label column to 'labels', the name the model expects ---
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

    # --- Return PyTorch tensors when rows are accessed ---
    tokenized_datasets.set_format("torch")

    print("\n--- Post-Tokenization Processing ---")
    print("Columns removed: ['text']")
    print("Column 'label' renamed to 'labels'")
    print("Dataset format set to 'torch'")

    print("\n--- Final Tokenized Dataset Structure ---")
    print(tokenized_datasets)

    # Show one tokenized training example ('input_ids', 'attention_mask', 'labels').
    print("\nSample from tokenized training dataset:")
    print(tokenized_datasets['train'][0])

In [None]:
from transformers import AutoModelForSequenceClassification

# --- Get the number of labels from our label mapping ---
# The classification head is sized from the label mapping built during encoding.
num_labels = len(label2id)
print(f"Number of labels: {num_labels}")

# --- Load the model ---
# AutoModelForSequenceClassification puts a classification head with
# `num_labels` outputs on top of the checkpoint; the two mappings give the
# outputs readable names. `model` stays None when loading fails.
model = None
try:
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )
except Exception as e:
    print(f"Error loading model: {e}")
else:
    print(f"Model '{model_checkpoint}' loaded successfully for sequence classification.")
    # To inspect the architecture, run: print(model)

# --- Move the model to the GPU when one is available ---
import torch

if model is not None:
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model.to(device)
    print(f"Model moved to device: {device}")

    # Echo the parts of the configuration that were set above.
    print("\n--- Model Configuration ---")
    print(f"Expected number of labels: {model.config.num_labels}")
    print(f"Label mapping (id2label): {model.config.id2label}")

In [None]:
# Make sure necessary libraries are installed (evaluate might need separate install)
!pip install evaluate scikit-learn -U # Add scikit-learn just in case

import evaluate # Hugging Face's library for evaluation metrics
import numpy as np

# --- Load the metrics ---
# Accuracy and F1 are the two metric modules used by compute_metrics.
accuracy_metric = evaluate.load(path="accuracy")
f1_metric = evaluate.load(path="f1")
# Precision and recall can be loaded the same way if they are needed:
# precision_metric = evaluate.load("precision")
# recall_metric = evaluate.load("recall")


# --- Define the compute_metrics function ---
def compute_metrics(eval_pred):
    """Return accuracy, weighted F1 and macro F1 for a (logits, labels) pair.

    The predicted class of each row is the index of its largest logit. The
    result is a dict with the keys "accuracy", "f1_weighted" and "f1_macro".
    """
    raw_scores, gold_labels = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=-1)

    accuracy_result = accuracy_metric.compute(
        predictions=predicted_ids, references=gold_labels
    )
    results = {"accuracy": accuracy_result["accuracy"]}

    # 'weighted' weights each class by its support; 'macro' treats all classes
    # equally.
    for averaging in ("weighted", "macro"):
        f1_result = f1_metric.compute(
            predictions=predicted_ids, references=gold_labels, average=averaging
        )
        results[f"f1_{averaging}"] = f1_result["f1"]

    # Precision/recall entries can be added here if those metrics are loaded.
    return results

# --- Smoke-test compute_metrics on a tiny hand-made batch (optional) ---
# Each row's largest logit sits at the index listed in dummy_labels, so the
# expected output is accuracy=1.0, f1_weighted=1.0 and f1_macro=1.0.
print("Testing compute_metrics function with dummy data:")
dummy_logits = np.array([
    [0.1, 0.9, 0.0],
    [-0.1, 0.1, 1.0],
    [0.8, 0.1, 0.1],
])
dummy_labels = np.array([1, 2, 0])  # True class of each row above
dummy_eval_pred = (dummy_logits, dummy_labels)
test_metrics = compute_metrics(dummy_eval_pred)
print(test_metrics)

In [None]:
from transformers import TrainingArguments, Trainer

# --- Define Training Arguments ---

import inspect

# Name of the local output folder (and of the Hub repository if pushing).
repo_name = "finetuned-financial-sentiment"  # You can change this name

# The output directory is the same inside and outside Colab; only the message
# printed differs.
output_dir = f"./{repo_name}"
try:
    import google.colab  # noqa: F401 -- imported only to detect the Colab runtime
    print(f"Running in Colab. Output directory set to: {output_dir}")
except ImportError:
    print(f"Not running in Colab. Output directory set to: {output_dir}")

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The notebook installs the latest release, so pick whichever keyword the
# installed TrainingArguments accepts instead of hard-coding one of them.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=output_dir,                  # Directory for checkpoints and logs
    num_train_epochs=3,                     # Total number of training epochs
    per_device_train_batch_size=16,         # Training batch size per device
    per_device_eval_batch_size=32,          # Evaluation batch size per device
    learning_rate=2e-5,                     # Learning rate
    weight_decay=0.01,                      # Strength of weight decay regularization
    save_strategy="epoch",                  # Save a checkpoint at the end of each epoch
    logging_strategy="epoch",               # Log metrics at the end of each epoch
    load_best_model_at_end=True,            # Reload the best checkpoint when training finishes
    metric_for_best_model="f1_weighted",    # Metric (from compute_metrics) that defines "best"
    greater_is_better=True,                 # Higher F1 is better
    push_to_hub=False,                      # Set to True to upload to the Hugging Face Hub (requires login)
    report_to="tensorboard",                # Log to TensorBoard
    # If CUDA memory is tight, gradient accumulation can help:
    # gradient_accumulation_steps=2,
    **{_eval_strategy_kwarg: "epoch"},      # Evaluate at the end of each epoch
)

# --- Initialize the Trainer ---

# Only build the Trainer when every component from the earlier cells exists.
if 'model' in locals() and model is not None and \
   'tokenized_datasets' in locals() and \
   'tokenizer' in locals() and tokenizer is not None and \
   'compute_metrics' in locals():

    # Newer transformers releases take the tokenizer as `processing_class`;
    # older ones take it as `tokenizer`. Use whichever the installed Trainer
    # accepts.
    _trainer_arg_names = inspect.signature(Trainer.__init__).parameters
    _tokenizer_kwarg = (
        "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
    )

    # NOTE(review): the 'test' split is used both to pick the best checkpoint
    # (load_best_model_at_end) and as the evaluation set, so scores reported on
    # it are optimistic. Consider a separate validation split.
    trainer = Trainer(
        model=model,                               # The model to be trained
        args=training_args,                        # Training arguments defined above
        train_dataset=tokenized_datasets["train"], # Training dataset
        eval_dataset=tokenized_datasets["test"],   # Evaluation dataset
        compute_metrics=compute_metrics,           # Metrics computed at each evaluation
        **{_tokenizer_kwarg: tokenizer},           # Tokenizer (used for padding batches)
    )
    print("Trainer initialized successfully.")
else:
    print("Error: One or more components (model, datasets, tokenizer, compute_metrics) not found.")
    trainer = None  # Later cells check for None before using the trainer

# (Optional) If you want to push to Hugging Face Hub (requires push_to_hub=True above)
# You'll need to log in. Uncomment and run the following lines in a separate cell:
# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
# --- Run the training loop ---
# Requires the Trainer from the previous cell. Any exception raised while
# training or saving is caught and printed rather than propagated.
if trainer is None:
    print("Trainer was not initialized. Cannot start training.")
else:
    print("Starting training...")
    try:
        train_result = trainer.train()
        print("Training finished.")

        # --- Optional: persist the training metrics and the trainer state ---
        metrics = train_result.metrics
        for persist in (trainer.log_metrics, trainer.save_metrics):
            persist("train", metrics)
        trainer.save_state()
        print("Training metrics saved.")

        # To also write the final model to its own folder:
        # trainer.save_model(f"{output_dir}/final_model")
        # print(f"Final model saved to {output_dir}/final_model")

    except Exception as e:
        print(f"An error occurred during training: {e}")

In [None]:
# --- Score the model currently held by the Trainer ---
# trainer.evaluate() is called without arguments, so it runs on the
# eval_dataset that was passed when the Trainer was created.
if trainer is None:
    print("Trainer not available for evaluation.")
else:
    print("\nEvaluating the final model on the test set...")
    eval_results = trainer.evaluate()

    print("\n--- Final Evaluation Results ---")
    print(eval_results)

    # Log the metrics and write them to disk under the "eval" split name.
    for persist in (trainer.log_metrics, trainer.save_metrics):
        persist("eval", eval_results)

In [None]:
from transformers import pipeline
import torch # Ensure torch is imported

# --- Build a text-classification pipeline and try it on new sentences ---

# The pipeline wraps `trainer.model` together with the tokenizer. A saved
# model could be loaded from disk instead:
# model_path = f"{output_dir}/best_model_or_last_checkpoint"  # Adjust path as needed
# classifier = pipeline("text-classification", model=model_path, tokenizer=tokenizer, device=device)

if trainer is None or tokenizer is None:
    print("Cannot create pipeline: Trainer or Tokenizer not available.")
else:
    # Put the model on the GPU when there is one, otherwise on the CPU.
    use_gpu = torch.cuda.is_available()
    device = torch.device("cuda" if use_gpu else "cpu")
    trainer.model.to(device)

    # The pipeline takes a device index: 0 for the first GPU, -1 for the CPU.
    pipeline_device = 0 if use_gpu else -1
    classifier = pipeline(
        "text-classification",
        model=trainer.model,
        tokenizer=tokenizer,
        device=pipeline_device,
    )
    print(f"Classifier pipeline created on device: {device}")

    # --- Example sentences ---
    new_sentences = [
        "Revenue increased significantly compared to the last quarter.",
        "The company announced unexpected losses and job cuts.",
        "Stock prices remained relatively stable despite market fluctuations.",
        "Analysts are neutral about the future prospects of the tech sector.",
        "Earnings per share beat expectations, causing a surge in stock value.",
        "There are concerns about the upcoming regulatory changes."
    ]

    # --- Predict and report ---
    print("\n--- Predicting Sentiment for New Sentences ---")
    predictions = classifier(new_sentences)

    for sentence, prediction in zip(new_sentences, predictions):
        predicted_label = prediction['label']
        # label2id turns the predicted label string back into its integer id.
        predicted_id = label2id[predicted_label]
        print(f"Sentence: {sentence}")
        print(f"Predicted Label: {predicted_label} (ID: {predicted_id})")
        print(f"Confidence Score: {prediction['score']:.4f}")
        print("-" * 30)