In [6]:
#!pip install optuna transformers scikit-learn gradio pandas
!pip install pandas torch optuna transformers datasets matplotlib seaborn gradio wordcloud nltk scikit-learn

Collecting torch
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/d3/1d/a257913c89572de61316461db91867f87519146e58132cdeace3d9ffbe1f/torch-2.3.1-cp311-cp311-win_amd64.whl.metadata
  Using cached torch-2.3.1-cp311-cp311-win_amd64.whl.metadata (26 kB)
Collecting optuna
  Obtaining dependency information for optuna from https://files.pythonhosted.org/packages/15/da/68883911855d8b4d521f9a370e4e6aab8232b91c1d8d5a8348c4680c6642/optuna-3.6.1-py3-none-any.whl.metadata
  Using cached optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting datasets
  Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/3f/59/46818ebeb708234a60e42ccf409d20709e482519d2aa450b501ddbba4594/datasets-2.19.2-py3-none-any.whl.metadata
  Using cached datasets-2.19.2-py3-none-any.whl.metadata (19 kB)
Collecting gradio
  Obtaining dependency information for gradio from https://files.pythonhosted.org/packages/78/99/5d6f18958ee8f82b9bf858232cd48f

ERROR: Cannot uninstall 'TBB'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.


RoBERTa:

> Why: RoBERTa has shown excellent performance across various NLP tasks, often outperforming BERT due to its robust optimization techniques and larger training corpus. It handles nuances in text very well, which is crucial for sentiment analysis.
    Strengths: Strong performance on a wide range of tasks, robust pre-training.

DistilBERT:

> Why: DistilBERT is a distilled version of BERT that retains much of BERT's performance while being faster and more resource-efficient. It provides a good trade-off between performance and computational efficiency.
    Strengths: Faster and lighter than BERT, making it suitable for applications where speed and resource usage are considerations.

Electra:

> Why: Electra uses a different pre-training approach where the model learns to distinguish real input tokens from fake ones generated by another model. This approach is computationally efficient and leads to strong performance.
    Strengths: Computationally efficient pre-training and strong performance on downstream tasks once fine-tuned.

Train and Evaluate All Models:

> Train each model with the default hyperparameters (learning rate 5e-5, batch size 16, 3 epochs) and evaluate its performance on the validation set.

Select the Best Model:

> Select the model with the highest weighted F1 score (`eval_f1`) on the validation set.

Fine-Tune the Best Model Using Optuna:

> Use Optuna to optimize the hyperparameters for the best-performing model.

Store All Models and Tokenizers:

> Store the trained models and their tokenizers in the models dictionary for use during prediction.

Predict Function:

> The predict function takes a model, tokenizer, and text input, and returns the predicted label.

Predict Sample Function:

> The predict_sample function takes a sample index, retrieves the sample text, and gets predictions from all models, including the fine-tuned model.
    The function returns the text, ground truth label, and predictions from each model.

Gradio Interface:

> Use the Gradio interface to select a sample and display predictions from all models, including the fine-tuned model.

In [7]:
import os
import pandas as pd
import torch
import optuna
from transformers import (
    AutoTokenizer,
    RobertaForSequenceClassification,
    DistilBertForSequenceClassification,
    ElectraForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
import logging
import numpy as np
import json
import yaml
from collections import Counter
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Logging: INFO level on the root logger, plus a named logger for this notebook
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('Sentiment-Analysis')

# Compute device: CUDA when torch reports it as available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    logger.info(f"Using device: {torch.cuda.get_device_name(0)}")

# Registry of candidate models: display name -> (checkpoint id, model class)
model_names = dict([
    ('RoBERTa', ('roberta-base', RobertaForSequenceClassification)),
    ('DistilBERT', ('distilbert-base-uncased', DistilBertForSequenceClassification)),
    ('Electra', ('google/electra-base-discriminator', ElectraForSequenceClassification)),
])

# Load the abbreviation -> full form mapping from the YAML file.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open('./data/abbreviations.yaml', 'r', encoding='utf-8') as file:
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so that later `.items()` calls do not fail.
    abbreviations = yaml.safe_load(file) or {}

def replace_abbreviations(text):
    """Expand abbreviations in ``text`` using the module-level ``abbreviations`` mapping.

    Only stand-alone occurrences are expanded: an abbreviation is replaced when
    it is not directly preceded or followed by a word character, so an entry
    such as ``u -> you`` no longer rewrites the ``u`` inside ``but``.
    All replacements happen in a single pass, which means the text produced by
    one expansion is never rewritten by another entry.

    Args:
        text: The input string.

    Returns:
        The string with every stand-alone abbreviation replaced by its full form.
        ``text`` is returned unchanged when the mapping is empty.
    """
    # Normalise keys/values to strings and skip empty keys (an empty pattern
    # would match at every position).
    mapping = {str(abbr): str(full_form)
               for abbr, full_form in (abbreviations or {}).items()
               if str(abbr)}
    if not mapping:
        return text

    # Longest keys first so that a longer abbreviation wins over a shorter one
    # that is a prefix of it.
    ordered_keys = sorted(mapping, key=len, reverse=True)
    pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(re.escape(key) for key in ordered_keys) + r')(?!\w)'
    )
    return pattern.sub(lambda match: mapping[match.group(0)], text)

class SentimentDataset(Dataset):
    """
    Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-sample sequence and
    ``labels`` is a sequence with one entry per sample.
    """
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one sample: every encoding field at position idx, as tensors.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

def validate_labels(labels):
    """Log the distinct labels and assert that each one is in 0..4."""
    unique_labels = set(labels)
    logger.info(f"Unique labels: {unique_labels}")
    allowed = [0, 1, 2, 3, 4]
    for label in unique_labels:
        assert label in allowed, "Labels are out of the expected range."

def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for a prediction object.

    ``p`` must expose ``predictions`` (scores, arg-maxed over axis 1) and
    ``label_ids`` (the reference labels). The returned keys carry the
    ``eval_`` prefix.
    """
    true_labels = p.label_ids
    predicted_labels = np.argmax(p.predictions, axis=1)

    metrics = {'eval_accuracy': accuracy_score(true_labels, predicted_labels)}
    weighted = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')
    metrics['eval_precision'] = weighted[0]
    metrics['eval_recall'] = weighted[1]
    metrics['eval_f1'] = weighted[2]
    return metrics

def plot_confusion_matrix(labels, preds, model_name):
    """Draw the confusion matrix of ``labels`` vs ``preds`` as an annotated heatmap."""
    matrix = confusion_matrix(labels, preds)
    plt.figure(figsize=(10, 7))
    # heatmap draws on the current axes and returns them
    axes = sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    axes.set_title(f'Confusion Matrix for {model_name}')
    axes.set_xlabel('Predicted')
    axes.set_ylabel('True')
    plt.tight_layout()  # keep title and axis labels inside the figure
    plt.show()

def train_model(model_name, train_df, learning_rate=5e-5, batch_size=16, num_epochs=3, use_early_stopping=False):
    """Fine-tune one registered checkpoint on ``train_df`` and evaluate it.

    The data is split 80/20 (fixed seed 42) into training and validation
    parts. After training, the model is evaluated on the validation part and
    the model, tokenizer and evaluation results are written under ``./models``.

    Args:
        model_name: Key into ``model_names`` selecting checkpoint and model class.
        train_df: DataFrame with a ``text`` and a ``label`` column. It is not
            modified by this function.
        learning_rate: Learning rate passed to ``TrainingArguments``.
        batch_size: Per-device batch size for training and evaluation.
        num_epochs: Number of training epochs.
        use_early_stopping: When True, an ``EarlyStoppingCallback`` (patience 3)
            is attached and the best checkpoint (by ``eval_f1``) is restored
            at the end of training.

    Returns:
        Tuple ``(model, tokenizer, eval_results)``. ``eval_results`` holds the
        metrics from ``trainer.evaluate()`` plus ``'predictions'`` (validation
        logits as nested lists) and ``'labels'`` (validation labels).

    Side effects:
        Writes ``./models/{model_name}`` and
        ``./models/{model_name}_eval_results.json``, uses ``./results`` and
        ``./logs`` as trainer directories, and shows a confusion-matrix plot.
    """
    logger.info(f"Training model: {model_name}")
    checkpoint, model_class = model_names[model_name]
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = model_class.from_pretrained(checkpoint, num_labels=5)

    # Expand abbreviations on a derived series instead of writing back into the
    # caller's DataFrame: the same frame is passed in for every model, and
    # re-expanding already expanded text would give each model different input.
    texts = train_df['text'].apply(replace_abbreviations).tolist()
    labels = train_df['label'].tolist()
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42)

    validate_labels(train_labels)
    validate_labels(val_labels)

    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

    train_dataset = SentimentDataset(train_encodings, train_labels)
    val_dataset = SentimentDataset(val_encodings, val_labels)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        # load_best_model_at_end requires the save strategy to match the
        # evaluation strategy; without this the early-stopping run is rejected
        # when the arguments are constructed.
        save_strategy="epoch" if use_early_stopping else "steps",
        learning_rate=learning_rate,
        load_best_model_at_end=use_early_stopping,
        metric_for_best_model="eval_f1",
        greater_is_better=True,
    )

    data_collator = DataCollatorWithPadding(tokenizer)

    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)] if use_early_stopping else []

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=callbacks
    )

    trainer.train()
    eval_results = trainer.evaluate()
    val_output = trainer.predict(val_dataset)

    # Save the model and tokenizer
    model.save_pretrained(f"./models/{model_name}")
    tokenizer.save_pretrained(f"./models/{model_name}")

    # Save evaluation results together with the raw validation logits and the
    # true labels, so the confusion matrix can be rebuilt when loading later.
    eval_results['predictions'] = val_output.predictions.tolist()  # lists are JSON-serializable
    eval_results['labels'] = val_labels
    with open(f"./models/{model_name}_eval_results.json", "w") as f:
        json.dump(eval_results, f)

    # Log only the scalar metrics; the per-sample data is too large for a log line
    eval_results_to_log = {k: v for k, v in eval_results.items() if k not in ['predictions', 'labels']}
    logger.info(f"Evaluation results for {model_name}: {eval_results_to_log}")

    # Plot confusion matrix
    predicted_labels = np.argmax(np.array(eval_results['predictions']), axis=1)
    logger.info(f"Confusion matrix data - Labels: {eval_results['labels'][:10]} Predictions: {predicted_labels[:10]}")
    plot_confusion_matrix(eval_results['labels'], predicted_labels, model_name)

    return model, tokenizer, eval_results

def load_model(model_name):
    """Load a saved model, tokenizer and evaluation results from ``./models``.

    ``model_name`` is either a key of ``model_names`` (e.g. ``"RoBERTa"``) or
    such a key followed by an underscore-separated suffix
    (e.g. ``"RoBERTa_fine_tuned"``). The suffix form is needed for the
    fine-tuned model, which is stored under its own directory but uses the
    model class of its base entry.

    Args:
        model_name: Name of the saved model directory below ``./models``.

    Returns:
        Tuple ``(model, tokenizer, eval_results)`` where ``eval_results`` is
        the content of ``./models/{model_name}_eval_results.json``.

    Raises:
        KeyError: If ``model_name`` cannot be mapped to an entry of ``model_names``.

    Side effects:
        Logs the stored metrics and shows the confusion-matrix plot.
    """
    # Resolve which registry entry provides the model class.
    if model_name in model_names:
        registry_key = model_name
    else:
        candidates = [key for key in model_names if model_name.startswith(f"{key}_")]
        if not candidates:
            raise KeyError(model_name)
        registry_key = max(candidates, key=len)  # most specific match

    tokenizer = AutoTokenizer.from_pretrained(f"./models/{model_name}")
    model_class = model_names[registry_key][1]
    model = model_class.from_pretrained(f"./models/{model_name}")

    # Load evaluation results
    with open(f"./models/{model_name}_eval_results.json", "r") as f:
        eval_results = json.load(f)

    # Log only the scalar metrics; the per-sample data is too large for a log line
    eval_results_to_log = {k: v for k, v in eval_results.items() if k not in ['predictions', 'labels']}
    logger.info(f"Loaded evaluation results for {model_name}: {eval_results_to_log}")

    # Plot confusion matrix from the stored logits and labels
    predicted_labels = np.argmax(np.array(eval_results['predictions']), axis=1)
    logger.info(f"Confusion matrix data - Labels: {eval_results['labels'][:10]} Predictions: {predicted_labels[:10]}")
    plot_confusion_matrix(eval_results['labels'], predicted_labels, model_name)

    return model, tokenizer, eval_results

def objective(trial):
    """Optuna objective: train the selected best model once and return its ``eval_f1``.

    Samples a learning rate (log scale, 1e-5..5e-5) and a batch size (16..32)
    from ``trial``, reloads the training CSV from disk, and trains the model
    named by the module-level ``best_model_name``.
    """
    model_name = best_model_name  # winner of the initial comparison
    sampled_lr = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    sampled_batch_size = trial.suggest_int("batch_size", 16, 32)
    trial_df = pd.read_csv('./data/Sentiment_Training.csv', sep=';')
    outcome = train_model(model_name, trial_df, sampled_lr, sampled_batch_size)
    # train_model returns (model, tokenizer, eval_results); only the metrics matter here
    return outcome[2]['eval_f1']

# Load the data
train_df = pd.read_csv('./data/Sentiment_Training.csv', sep=';')
test_df = pd.read_csv('./data/Sentiment_Test.csv', sep=';')

# Exploratory data analysis (EDA)
# Display the first few rows and information of the training dataset
print("First few rows of the training dataset:")
print(train_df.head())

print("\nInformation about the training dataset:")
# info() prints its report itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
train_df.info()

# Calculate text length (in characters); the column is reused by the
# per-label analysis further down.
train_df['text_length'] = train_df['text'].apply(len)
print("\nStatistics of text lengths in the training dataset:")
print(train_df['text_length'].describe())

# Histogram of text lengths
plt.figure(figsize=(10, 6))
plt.hist(train_df['text_length'], bins=50, edgecolor='black')
plt.title('Distribution of Text Lengths')
plt.xlabel('Text Length')
plt.ylabel('Frequency')
plt.grid(axis='y')
plt.show()

# Distribution of labels (computed once, used for both the table and the plot)
label_distribution = train_df['label'].value_counts().sort_index()
print("\nDistribution of labels in the training dataset:")
print(label_distribution)

# Plot of label distribution
plt.figure(figsize=(10, 6))
label_distribution.plot(kind='bar')
plt.title('Distribution of Labels')
plt.xlabel('Label')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.grid(axis='y')
plt.show()

# Function to clean and tokenize text
def tokenize(text):
    """Lower-case ``text``, drop everything except a-z and whitespace, split on
    whitespace and remove the words contained in the module-level ``stop_words``."""
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return [candidate for candidate in letters_only.split() if candidate not in stop_words]

# Extend (rather than replace) the stop-word set loaded from NLTK with a few
# extra filler words. Re-assigning `stop_words` to this list alone would throw
# away the NLTK set that was downloaded for exactly this purpose.
stop_words = stop_words | {
    "a", "an", "the", "and", "or", "but", "if", "in", "on", "with", "as", "of", "at", "by", "for",
    "from", "about", "against", "between", "into", "through", "during", "before", "after", "above",
    "below", "to", "up", "down", "out", "off", "over", "under", "again", "further", "then", "once",
    "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
    "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "s", "t", "can", "will", "just", "don", "should", "now",
}

# Tokenize all texts and flatten into one list of words. A comprehension is
# linear in the number of tokens, whereas summing a Series of lists
# concatenates repeatedly and returns 0 (not a list) for an empty Series.
all_words = [word for tokens in train_df['text'].apply(tokenize) for word in tokens]

# Find most common words
word_counts = Counter(all_words)
common_words = word_counts.most_common(20)

# Generate WordCloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_counts)

# Plot WordCloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Most Common Words in Texts')
plt.show()

# Text lengths grouped by label and described
text_length_by_label = train_df.groupby('label')['text_length'].describe()

# Boxplot of text lengths by label. The axes are created explicitly and passed
# in: with `by=` and no `ax`, pandas opens its own figure and a figure created
# beforehand with plt.figure() would be shown empty.
fig, ax = plt.subplots(figsize=(10, 6))
train_df.boxplot(column='text_length', by='label', grid=False, ax=ax)
ax.set_title('Text Lengths by Label')
fig.suptitle('')  # remove the automatic "Boxplot grouped by ..." super-title
ax.set_xlabel('Label')
ax.set_ylabel('Text Length')
plt.show()

# Output text lengths by label
print("\nText lengths by label:")
print(text_length_by_label)

# Replace abbreviations in test data
test_df['text'] = test_df['text'].apply(replace_abbreviations)

# Initial training and evaluation: reuse a saved model when one exists and its
# stored results contain 'eval_f1'; otherwise train from scratch.
models = {}
results = {}
for model_name in model_names.keys():
    model_dir = f"./models/{model_name}"
    loaded = None
    if os.path.exists(model_dir):
        logger.info(f"Loading existing model: {model_name}")
        loaded = load_model(model_name)
        if 'eval_f1' not in loaded[2]:
            logger.info(f"Re-evaluating model: {model_name} to include 'eval_f1' metric.")
            loaded = None
    if loaded is None:
        # Clear CUDA cache before training each model
        torch.cuda.empty_cache()
        loaded = train_model(model_name, train_df)
    model, tokenizer, eval_results = loaded
    models[model_name] = (model, tokenizer)
    results[model_name] = eval_results

if not results:
    # Raise instead of calling exit(): a named error is easier to diagnose in
    # a notebook than a terminated session.
    logger.info("No models were trained. Exiting.")
    raise RuntimeError("No models were trained.")

# Verify that eval_f1 score is in results
for model_name, eval_results in results.items():
    if 'eval_f1' not in eval_results:
        raise KeyError(f"Model {model_name} evaluation results do not contain 'eval_f1' metric.")

# Select the best model based on eval_f1 score
best_model_name = max(results, key=lambda k: results[k]['eval_f1'])
logger.info(f"Best model: {best_model_name} with F1 score: {results[best_model_name]['eval_f1']}")

# Clear CUDA cache before fine-tuning
torch.cuda.empty_cache()

# Check if the fine-tuned model exists
fine_tuned_model_dir = f"./models/{best_model_name}_fine_tuned"
if os.path.exists(fine_tuned_model_dir):
    logger.info(f"Loading existing fine-tuned model: {best_model_name}")
    best_model, best_tokenizer, best_eval_results = load_model(f"{best_model_name}_fine_tuned")
else:
    # Fine-tune the best model using Optuna
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=20)

    best_trial = study.best_trial
    logger.info(f"Best trial: {best_trial.params}")

    # Train the best model with the best hyperparameters
    best_learning_rate = best_trial.params['learning_rate']
    best_batch_size = best_trial.params['batch_size']
    best_model, best_tokenizer, best_eval_results = train_model(best_model_name, train_df, best_learning_rate, best_batch_size, use_early_stopping=True)

    # Save the fine-tuned model and evaluation results
    best_model.save_pretrained(fine_tuned_model_dir)
    best_tokenizer.save_pretrained(fine_tuned_model_dir)
    with open(f"./models/{best_model_name}_fine_tuned_eval_results.json", "w") as f:
        json.dump(best_eval_results, f)

    # Every train_model call above (each Optuna trial and the final run) wrote
    # to ./models/{best_model_name}. Restore the baseline artifacts from memory
    # so a later run does not load tuned weights as the "baseline" model.
    baseline_model, baseline_tokenizer = models[best_model_name]
    baseline_model.save_pretrained(f"./models/{best_model_name}")
    baseline_tokenizer.save_pretrained(f"./models/{best_model_name}")
    with open(f"./models/{best_model_name}_eval_results.json", "w") as f:
        json.dump(results[best_model_name], f)

def predict(model, tokenizer, text):
    """Return the predicted class index for a single ``text``.

    Abbreviations are expanded first; the text is then tokenized (truncated to
    512 tokens), moved to the module-level ``device`` together with the model,
    and run through the model in evaluation mode without gradient tracking.
    """
    expanded_text = replace_abbreviations(text)
    encoded = tokenizer(expanded_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    encoded = encoded.to(device)

    model.to(device)
    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits

    # arg-max over the class dimension; .item() converts to a Python int
    return torch.argmax(logits, dim=-1).item()

# Gradio interface
def predict_sample(sample_index):
    """Collect the predictions of all models for one row of ``test_df``.

    Args:
        sample_index: Positional index of the row in ``test_df``. It is
            coerced with ``int()`` so that an index delivered as a string
            by the UI still works.

    Returns:
        A dict with the sample ``"text"``, its ``"ground_truth"`` label as a
        built-in ``int``, and ``"predictions"``: a dict mapping each model
        name in ``models`` (plus ``'Fine-Tuned'``) to the predicted label.
    """
    sample = test_df.iloc[int(sample_index)]
    text = sample['text']
    # Convert to a built-in int so the value in the JSON payload does not
    # depend on the serializer supporting NumPy scalar types.
    ground_truth = int(sample['label'])

    predictions = {
        model_name: predict(model, tokenizer, text)
        for model_name, (model, tokenizer) in models.items()
    }
    predictions['Fine-Tuned'] = predict(best_model, best_tokenizer, text)

    return {
        "text": text,
        "ground_truth": ground_truth,
        "predictions": predictions
    }

# Dropdown listing every row index of the test set. `gr.Dropdown` is used
# because the legacy `gr.inputs` namespace is not available in current Gradio
# releases.
sample_dropdown = gr.Dropdown(choices=list(range(len(test_df))), label="Select a Sample")

# Wire the dropdown to predict_sample and render its result as JSON
iface = gr.Interface(
    fn=predict_sample,
    inputs=sample_dropdown,
    outputs="json",
    description="Sentiment Analysis Prediction"
)

iface.launch()


ModuleNotFoundError: No module named 'torch'