<a href="https://colab.research.google.com/github/LidiiaMelnyk95/FSU_Jena_scripts/blob/main/spelling_correction_gpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
# Load the semicolon-separated spelling dataset into the notebook-level frame.
df = pd.read_csv('/content/replacement_spelling_deduplicated-2.csv', sep=';')

In [None]:
# Show the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'SPELLING', 'Comment'], dtype='object')

In [None]:
# Keep only the rows that have a value in the SPELLING column. The label is
# passed as a list because a bare string for `subset` is rejected by older
# pandas releases.
df = df.dropna(subset=['SPELLING'])

In [None]:
# Preview the first ten rows of the frame.
df.head(10)

Unnamed: 0.1,Unnamed: 0,SPELLING,Comment
0,1543,:) Bei Partner:innenakrobatikfestivals stapeln...,:) Bei Partner:innenakrobatikfestivals stapel...
1,2558,:D okay ... deswegen geht einem die Regenboge...,:D okay ... deswegen geht einem die Regenboge...
2,679,"""Das Naturgesetz"" lmaoIch hoffe du trägst kei...","""Das Naturgesetz"" lmaoIch hoffe du trägst kei..."
3,2290,"""ja schön. Diese Person kann sich ja auch dem...","""ja schön. Diese Person kann sich ja auch dem..."
4,822,"""Meine Seele"", ""mein Körper"" ""ich"", ""ich"", .....","""Meine Seele"", ""mein Körper"" ""ich"", ""ich"", ....."
5,820,"""Na los mach's Maul auf"". Bei so einer Formul...","""Na los mach das Maul auf"". Bei so einer Form..."
6,2006,"""Sollten wir in diesem Fall Gefühle über die R...","""Sollten wir in diesem Fall Gefühle über die ..."
7,2805,Aber ich sehe in keinster Weise wie meine Sich...,Aber ich sehe in keinster Weise wie meine Sic...
8,1117,TheWastedAccount12 Achso und lese gerade dass ...,"Ach so und lese gerade, dass du der Beleidigu..."
9,2773,"Ach, super. Dann danke für die sachliche Klar...","Ach, super. Dann danke für die sachliche Klar..."


In [None]:
import pandas as pd
from transformers import pipeline, AutoTokenizer

import pandas as pd
from transformers import AutoTokenizer, pipeline
from nltk.translate import bleu_score

def load_data(file_path, sep=';'):
    """
    Read a delimited text file into a pandas DataFrame.

    Args:
    - file_path (str): Path of the file handed to ``pd.read_csv``
    - sep (str): Field delimiter (default: ';', the delimiter of the spelling dataset)

    Returns:
    - pandas.DataFrame: The parsed table
    """
    return pd.read_csv(file_path, sep=sep)

def generate_text(model_name, wrong_version, corrected_version, tokenizer, generator):
    """
    Run the text-generation callable on one text and return what it produced.

    Args:
    - model_name (str): Not used here; kept so existing callers keep working
    - wrong_version (str): Not used here; kept so existing callers keep working
    - corrected_version (str): The text that is passed to ``generator`` as its input
    - tokenizer: Not used here; kept so existing callers keep working
    - generator (callable): Called as ``generator(text, max_length=512, num_return_sequences=1)``;
      its first result must expose a "generated_text" entry

    Returns:
    - str: The generated text, or ``corrected_version`` unchanged when generation raised
    """
    # Only `corrected_version` is sent to the model. No prompt is assembled from
    # `wrong_version` and nothing is tokenized up front, so a tokenizer problem
    # can no longer escape the error handling below.
    # NOTE(review): the model input is the argument named `corrected_version`;
    # confirm that this is the text the model is meant to continue.
    try:
        results = generator(corrected_version, max_length=512, num_return_sequences=1)
        generated_text = results[0]["generated_text"]
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining rows.
        print(f"An error occurred during generation: {e}")
        generated_text = corrected_version  # Use the original text if an error occurs

    return generated_text

def calculate_bleu_scores(reference_texts, generated_texts):
    """
    Average the sentence-level BLEU score over paired reference/generated texts.

    Both texts of a pair are split on whitespace; each generated text is scored
    against its single reference with ``bleu_score.sentence_bleu``.

    Args:
    - reference_texts (iterable of str): The reference texts
    - generated_texts (iterable of str): The generated texts, paired with the references by position

    Returns:
    - float: The mean sentence BLEU, or 0.0 when there are no pairs to score
    """
    sentence_scores = [
        bleu_score.sentence_bleu([ref.split()], gen.split())
        for ref, gen in zip(reference_texts, generated_texts)
    ]
    # Nothing to average: avoid dividing by zero on empty input.
    if not sentence_scores:
        return 0.0
    return sum(sentence_scores) / len(sentence_scores)

def main():
    """
    Generate a text for every row of the spelling dataset and report the average BLEU.

    Loads the CSV, drops rows without a SPELLING or Comment value, runs the
    "dbmdz/german-gpt2" text-generation pipeline on each SPELLING entry, stores
    the outputs in a "Generated" column and prints the mean sentence BLEU of
    the outputs against the Comment column.

    Returns:
    - pandas.DataFrame: The filtered data with the added "Generated" column
    """
    # Load data
    file_path = "/content/replacement_spelling_deduplicated-2.csv"
    df = load_data(file_path)

    # A missing SPELLING or Comment value is not a string: generation would fall
    # back to returning it as-is and the BLEU step would then fail on .split().
    df = df.dropna(subset=["SPELLING", "Comment"]).copy()

    # Initialize tokenizer and generator
    model_name = "dbmdz/german-gpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    generator = pipeline("text-generation", model=model_name, tokenizer=tokenizer)

    # Generate corrected text for each comment in the dataframe
    # NOTE(review): Comment is bound to `wrong_version` and SPELLING to
    # `corrected_version`; the sample rows suggest SPELLING is the uncorrected
    # text — confirm the naming.
    generated_texts = [generate_text(model_name, wrong_version, corrected_version, tokenizer, generator)
                       for wrong_version, corrected_version in zip(df['Comment'], df['SPELLING'])]

    # Write generated text to the "Generated" column
    df["Generated"] = generated_texts

    # Calculate BLEU scores (kept under its own name so the imported
    # `bleu_score` module is not shadowed inside this function)
    average_bleu = calculate_bleu_scores(df["Comment"], generated_texts)
    print(f"Average BLEU score: {average_bleu:.4f}")

    return df

if __name__ == "__main__":
    # Keep the result at notebook level: the later cells read df["Generated"].
    df = main()


In [None]:
import pandas as pd
from rouge import Rouge

def calculate_rouge_scores(df, reference_column, generated_column):
    """
    Add per-row ROUGE-1, ROUGE-2 and ROUGE-3 F-scores to a DataFrame.

    Args:
    - df (pandas.DataFrame): The data holding both text columns
    - reference_column (str): Name of the column with the reference texts
    - generated_column (str): Name of the column with the generated texts

    Returns:
    - pandas.DataFrame: A new frame with the columns of ``df`` plus
      'ROUGE-1', 'ROUGE-2' and 'ROUGE-3'
    """
    # Output column -> metric name understood by the scorer.
    metric_by_column = {
        'ROUGE-1': 'rouge-1',
        'ROUGE-2': 'rouge-2',
        'ROUGE-3': 'rouge-3',
    }
    # Request the metrics explicitly: without this the scorer's result has no
    # 'rouge-3' entry and the ROUGE-3 column ends up as a constant 0.0.
    rouge = Rouge(metrics=list(metric_by_column.values()))

    def calculate_rouge(row):
        reference_text = row[reference_column]
        generated_text = row[generated_column]

        scores = rouge.get_scores(generated_text, reference_text)[0]

        # Index directly so that a metric missing from the result fails loudly
        # instead of being reported as a score of 0.0.
        return {column: scores[metric]['f'] for column, metric in metric_by_column.items()}

    rouge_scores_df = df.apply(calculate_rouge, axis=1, result_type='expand')

    # Drop score columns left over from an earlier call, so that calling this
    # again on its own output does not create duplicate column labels.
    base_df = df.drop(columns=list(metric_by_column), errors='ignore')
    return pd.concat([base_df, rouge_scores_df], axis=1)

# Calculate ROUGE scores
df = calculate_rouge_scores(df, 'Comment', 'Generated')
# Show a preview rather than printing the entire frame.
df.head()


In [None]:
# Mean F-score per column, in the order ROUGE-1, ROUGE-2, ROUGE-3.
tuple(df[column].mean() for column in ("ROUGE-1", "ROUGE-2", "ROUGE-3"))

(0.9096590359184268, 0.8663294078717427, 0.0)

In [None]:
from nltk.translate import bleu_score

# Corpus-level BLEU of the generated texts against the "Comment" column;
# every text is split on whitespace and each hypothesis has one reference.
bleu_refs = [[text.split()] for text in df["Comment"].tolist()]
# NOTE(review): reads 'Generated', the generated-text column the other metric
# cells use; no visible cell creates a 'Generated_2' column — confirm.
bleu_sys = [text.split() for text in df["Generated"].tolist()]
# Keep the result under its own name: rebinding `bleu_score` to a float would
# break any later call that still needs the imported module.
corpus_bleu_value = bleu_score.corpus_bleu(bleu_refs, bleu_sys)

print(f"BLEU score: {corpus_bleu_value}")

BLEU score: 0.9029008232280601


In [None]:
from nltk.util import ngrams
from nltk import word_tokenize
from nltk.corpus import wordnet as wn

def precision(candidate, reference, n):
    """
    Calculate the precision of n-grams in a text.

    The score is the share of the candidate's distinct n-grams that also
    occur among the reference's distinct n-grams (repetitions are not counted).

    Args:
    - candidate (str): The generated text
    - reference (str): The reference text
    - n (int): The n-gram order

    Returns:
    - float: The precision score; 0.0 when the candidate has no n-grams of order n
    """
    candidate_ngrams_set = set(ngrams(word_tokenize(candidate), n))
    # A candidate shorter than n tokens has no n-grams; report 0.0 instead of
    # dividing by zero.
    if not candidate_ngrams_set:
        return 0.0
    reference_ngrams_set = set(ngrams(word_tokenize(reference), n))
    common_ngrams = candidate_ngrams_set & reference_ngrams_set
    return len(common_ngrams) / len(candidate_ngrams_set)

In [None]:
def brevity_penalty(candidate, reference):
    """
    Calculate the brevity penalty for the precision score.

    Lengths are measured in tokens produced by ``word_tokenize``.

    Args:
    - candidate (str): The generated text
    - reference (str): The reference text

    Returns:
    - float: 1 when the candidate is longer than the reference,
      ``exp(1 - reference_length / candidate_length)`` otherwise, and 0.0
      for a candidate without any tokens
    """
    candidate_length = len(word_tokenize(candidate))
    reference_length = len(word_tokenize(reference))
    # An empty candidate would divide by zero below; give it the maximum penalty.
    if candidate_length == 0:
        return 0.0
    if candidate_length > reference_length:
        return 1
    return np.exp(1 - reference_length / candidate_length)

In [None]:
def gleu(candidate, reference, max_order=4):
    """
    Score a generated text against a reference text.

    The result is the brevity penalty multiplied by the geometric mean of the
    n-gram precisions for the orders 1..max_order.
    NOTE(review): confirm that this formula is the intended GLEU definition.

    Args:
    - candidate (str): The generated text
    - reference (str): The reference text
    - max_order (int): The maximum n-gram order to consider (default: 4)

    Returns:
    - float: The GLEU score
    """
    per_order_precision = [
        precision(candidate, reference, order)
        for order in range(1, max_order + 1)
    ]
    penalty = brevity_penalty(candidate, reference)
    # Geometric mean, computed as exp of the mean of the logs.
    geometric_mean = np.exp(np.mean(np.log(per_order_precision)))
    return penalty * geometric_mean

In [None]:
# Apply GLEU calculation to each row. The generated text is the candidate and
# "Comment" the reference, matching how the other metric cells use the columns.
df['gleu'] = df.apply(
    lambda row: gleu(candidate=row['Generated'], reference=row['Comment']),
    axis=1,
)

# Calculate the mean GLEU score
gleu_score = df['gleu'].mean()

# Print or use the GLEU score as needed
print(f"GLEU score: {gleu_score}")

0.6385209191579728

In [None]:
!pip install Levenshtein
from Levenshtein import distance

In [None]:
# Levenshtein distance between the "Comment" and "Generated" text of each row.
df['distance'] = df.apply(lambda row: distance(row['Comment'], row['Generated']), axis=1)

# Mean distance over all rows.
df['distance'].mean()

263.79333333333335