<a href="https://colab.research.google.com/github/Natural-Language-Processing-SS24/task2/blob/main/Rezepte_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets scikit-learn
!pip install transformers
!pip install optuna
!pip install optuna-integration
!pip install torch
!pip install python-docx
!pip install accelerate -U
!pip install transformers[torch]
!pip install gradio

## Bibliotheken importieren

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import ast
from google.colab import files
from sklearn.model_selection import train_test_split
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    DistilBertTokenizer, DistilBertForSequenceClassification,
    Trainer, TrainingArguments
)
from datasets import Dataset, DatasetDict
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import seaborn as sns
import gradio as gr
import torch

## Dateien hochladen und laden

In [None]:
# Open the Colab upload dialog; the CSV files are read by relative file name below.
uploaded = files.upload()

training_data_path, test_data_path = 'Recipes_Training.csv', 'Recipes_Test.csv'

# Both files are semicolon-separated.
training_df, test_df = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (training_data_path, test_data_path)
)

## Explorative Datenanalyse

### Daten anzeigen

In [None]:
# Show schema information and the first rows of both DataFrames.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
training_df.info()
print(training_df.head())
test_df.info()
print(test_df.head())

### Verteilung der Küchen

In [None]:
# Number of recipes per cuisine in the training set, drawn as a bar chart.
cuisine_counts = training_df['cuisine'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
cuisine_counts.plot.bar(ax=ax)
ax.set_title('Verteilung der Küchen im Trainingsdatensatz')
ax.set_xlabel('Küche')
ax.set_ylabel('Anzahl der Rezepte')
plt.show()

### Häufigste Zutaten analysieren

In [None]:
# Flatten the ingredient lists of all recipes; each cell is parsed with
# ast.literal_eval before its items are collected.
all_ingredients = [
    ingredient
    for raw_list in training_df['ingredients']
    for ingredient in ast.literal_eval(raw_list)
]

ingredient_counts = Counter(all_ingredients)

# The 20 most frequent ingredients, split into parallel name/count tuples for plotting.
top_ingredients = ingredient_counts.most_common(20)
ingredients, counts = zip(*top_ingredients)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(ingredients, counts)
ax.set_title('Top 20 häufigste Zutaten')
ax.set_xlabel('Zutat')
ax.set_ylabel('Häufigkeit')
ax.tick_params(axis='x', rotation=90)
plt.show()

### Top-Zutaten für jede Küche

In [None]:
def get_top_ingredients_by_cuisine(cuisine, df=None, top_n=10):
    """Return the most frequent ingredients of one cuisine.

    Args:
        cuisine: Value of the 'cuisine' column to filter on.
        df: DataFrame with a 'cuisine' column and an 'ingredients' column whose
            cells are parsed with ``ast.literal_eval``. Defaults to the
            module-level ``training_df``.
        top_n: Number of (ingredient, count) pairs to return.

    Returns:
        A list of ``(ingredient, count)`` tuples ordered by descending count;
        empty when no row matches ``cuisine``.
    """
    if df is None:
        df = training_df
    cuisine_rows = df.loc[df['cuisine'] == cuisine, 'ingredients']
    # Count in a single pass instead of materialising one big intermediate list.
    ingredient_counts = Counter(
        ingredient
        for raw_list in cuisine_rows
        for ingredient in ast.literal_eval(raw_list)
    )
    return ingredient_counts.most_common(top_n)

# Collect the top ingredients of every cuisine found in the training data.
cuisines = training_df['cuisine'].unique()
top_ingredients_by_cuisine = {cuisine: get_top_ingredients_by_cuisine(cuisine) for cuisine in cuisines}

# Size the grid from the number of cuisines instead of hard-coding 4 x 2 panels, so
# that no cuisine is silently dropped by zip() and no empty panel is drawn.
# With eight cuisines this yields the same 4 x 2 grid and 16 x 20 figure as before.
n_cols = 2
n_rows = max(1, (len(top_ingredients_by_cuisine) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(16, 5 * n_rows), squeeze=False)
panels = axes.flatten()

for ax, (cuisine, top_ingredients) in zip(panels, top_ingredients_by_cuisine.items()):
    ingredients, counts = zip(*top_ingredients)
    ax.bar(ingredients, counts)
    ax.set_title(f'Top 10 Zutaten in {cuisine.capitalize()} Küche')
    ax.set_xlabel('Zutat')
    ax.set_ylabel('Häufigkeit')
    ax.tick_params(axis='x', rotation=90)

# Hide the trailing panel that has no cuisine when the number of cuisines is odd.
for unused_ax in panels[len(top_ingredients_by_cuisine):]:
    unused_ax.set_visible(False)

# Lay out after drawing so the rotated tick labels are taken into account.
fig.tight_layout(pad=6.0)

plt.show()

## Datenvorbereitung für Modell-Training

### Trainings- und Validierungsdaten splitten

In [None]:
# Hold out 20% of the training data for validation, stratified by cuisine.
train_df, val_df = train_test_split(
    training_df,
    test_size=0.2,
    random_state=42,
    stratify=training_df['cuisine'],
)

# Wrap both splits as Hugging Face datasets.
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
datasets = DatasetDict({"train": train_dataset, "val": val_dataset})

# Maximum token length used for padding and truncation.
max_length = 128

# Label <-> integer id maps, numbered in order of first appearance in the training data.
cuisine_labels = training_df['cuisine'].unique()
label2id = dict(zip(cuisine_labels, range(len(cuisine_labels))))
id2label = dict(enumerate(cuisine_labels))

### Text und Labels vorbereiten

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of recipes and attach integer class labels.

    Reads the module-level ``tokenizer``, ``max_length`` and ``label2id``.

    Args:
        examples: Batch providing an 'ingredients' and a 'cuisine' entry.

    Returns:
        The tokenizer output with an added 'labels' entry holding the label
        id of each cuisine in the batch.
    """
    encoded = tokenizer(
        examples['ingredients'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    encoded['labels'] = list(map(label2id.__getitem__, examples['cuisine']))
    return encoded

## Funktion zum Modelltraining

In [None]:
def train_model(model_name, model_class, tokenizer_class, num_labels, num_epochs=3):
    """Fine-tune a sequence-classification checkpoint and report validation metrics.

    Tokenizes the module-level ``datasets`` with ``preprocess_function``, trains
    with the Hugging Face ``Trainer``, prints the evaluation results and a
    classification report, and plots a confusion matrix for the validation split.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        model_class: Model class providing ``from_pretrained``.
        tokenizer_class: Tokenizer class providing ``from_pretrained``.
        num_labels: Number of target classes for the classification head.
        num_epochs: Number of training epochs.

    Returns:
        The fine-tuned model.

    Side effects:
        Rebinds the module-level ``tokenizer`` (read by ``preprocess_function``)
        and writes checkpoints below ``./results/<model_name>``.
    """
    # preprocess_function reads the tokenizer from module scope, so it is published there.
    global tokenizer
    tokenizer = tokenizer_class.from_pretrained(model_name)

    tokenized_datasets = datasets.map(preprocess_function, batched=True)

    model = model_class.from_pretrained(model_name, num_labels=num_labels)

    # Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
    # pick whichever keyword the installed TrainingArguments dataclass declares.
    eval_strategy_key = (
        'eval_strategy'
        if 'eval_strategy' in TrainingArguments.__dataclass_fields__
        else 'evaluation_strategy'
    )

    training_args = TrainingArguments(
        # One directory per checkpoint so consecutive runs do not share checkpoints.
        output_dir=f'./results/{model_name}',
        logging_strategy="steps",
        logging_steps=10,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        **{eval_strategy_key: "epoch"},
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["val"],
    )

    trainer.train()

    # Evaluation on the validation split.
    eval_results = trainer.evaluate()
    print(f"Evaluation results for {model_name}:")
    print(eval_results)

    # Predicted class = index of the largest logit.
    val_logits = trainer.predict(tokenized_datasets["val"]).predictions
    val_predictions = np.argmax(val_logits, axis=1)
    val_labels = tokenized_datasets["val"]['labels']

    # Pin the label order explicitly so that names and ids stay aligned even if a
    # class does not occur in the validation labels or predictions.
    label_names = list(label2id.keys())
    label_ids = list(label2id.values())

    # Classification report.
    print(f"Classification Report for {model_name}:")
    print(classification_report(val_labels, val_predictions, labels=label_ids, target_names=label_names))

    # Confusion matrix.
    cm = confusion_matrix(val_labels, val_predictions, labels=label_ids)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=label_names, yticklabels=label_names, cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix for {model_name}')
    plt.show()

    return model

## Modelle trainieren

In [None]:
# Number of target classes = number of distinct cuisines.
num_labels = len(training_df['cuisine'].unique())

# Fine-tune the three checkpoints one after another, three epochs each.
distilbert_model = train_model(
    model_name='distilbert-base-uncased',
    model_class=DistilBertForSequenceClassification,
    tokenizer_class=DistilBertTokenizer,
    num_labels=num_labels,
    num_epochs=3,
)
bert_base_model = train_model(
    model_name='bert-base-uncased',
    model_class=BertForSequenceClassification,
    tokenizer_class=BertTokenizer,
    num_labels=num_labels,
    num_epochs=3,
)
bert_large_model = train_model(
    model_name='bert-large-uncased',
    model_class=BertForSequenceClassification,
    tokenizer_class=BertTokenizer,
    num_labels=num_labels,
    num_epochs=3,
)

## Gradio-Oberfläche erstellen

In [None]:
# Number of leading characters of an ingredient string shown in the dropdown.
PREVIEW_CHARS = 50

# Pair every fine-tuned model with the tokenizer of its own checkpoint. The shared
# global `tokenizer` only holds the tokenizer of the model trained last, and a BERT
# tokenizer emits `token_type_ids`, which DistilBERT's forward() does not accept.
MODELS_WITH_TOKENIZERS = {
    'DistilBERT': (DistilBertTokenizer.from_pretrained('distilbert-base-uncased'), distilbert_model),
    'BERT Base': (BertTokenizer.from_pretrained('bert-base-uncased'), bert_base_model),
    'BERT Large': (BertTokenizer.from_pretrained('bert-large-uncased'), bert_large_model),
}


def _recipe_label(ingredients):
    """Return the shortened dropdown label for a full ingredient string."""
    return ingredients[:PREVIEW_CHARS] + '...'


def _resolve_recipe(recipe):
    """Map a dropdown label or ingredient text to ``(text_to_classify, actual_cuisine)``.

    A full ingredient string or a dropdown label resolves to the first matching
    training row, and the row's full ingredient string is classified. Any other
    text is classified as given; its cuisine is taken from the first training
    row containing it, or 'unknown' when there is none.
    """
    column = training_df['ingredients']
    matching = training_df[(column == recipe) | (column.map(_recipe_label) == recipe)]
    if not matching.empty:
        row = matching.iloc[0]
        return row['ingredients'], row['cuisine']
    containing = training_df[column.str.contains(recipe, regex=False, na=False)]
    if not containing.empty:
        return recipe, containing.iloc[0]['cuisine']
    return recipe, 'unknown'


def _predict_cuisine(model_tokenizer, model, text):
    """Return the cuisine name that ``model`` predicts for ``text``."""
    encoded = model_tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors="pt")
    # The Trainer may have moved the model to an accelerator; inputs must live on the same device.
    encoded = encoded.to(model.device)
    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits
    return id2label[torch.argmax(logits, dim=-1).item()]


def predict_recipe(recipe):
    """Classify a recipe with all three fine-tuned models.

    Args:
        recipe: A dropdown label produced by ``show_recipe_options``, a full
            ingredient string, or free ingredient text.

    Returns:
        A tuple ``(text, predictions, actual_cuisine)``: the text that was
        classified, a dict mapping model name to predicted cuisine, and the
        cuisine recorded in ``training_df`` ('unknown' if no row matches).
    """
    # With live=True the callback can fire before anything is selected.
    if not recipe:
        return "", {}, ""

    text, actual_cuisine = _resolve_recipe(recipe)
    predictions = {
        name: _predict_cuisine(model_tokenizer, model, text)
        for name, (model_tokenizer, model) in MODELS_WITH_TOKENIZERS.items()
    }
    return text, predictions, actual_cuisine


def show_recipe_options():
    """Return the shortened ingredient strings offered in the dropdown.

    Recipes sharing the same first ``PREVIEW_CHARS`` characters produce identical
    labels; ``predict_recipe`` resolves such a label to the first matching row.
    """
    return training_df['ingredients'].apply(_recipe_label).tolist()


# Gradio interface
gr.Interface(
    fn=predict_recipe,
    inputs=gr.Dropdown(show_recipe_options(), label="Select a Recipe"),
    outputs=[
        gr.Textbox(label="Full Recipe"),
        gr.Textbox(label="Predictions"),
        gr.Textbox(label="Actual Cuisine"),
    ],
    live=True
).launch()