# Feature Derivation for the Different Datasets

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils.helper_functions as helper_functions
from collections import Counter
import re
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize

### Load Dataset Files

In [2]:
# Load the two pickled feature frames: human-written texts and AI-generated texts.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
en_wiki_features_df =  pd.read_pickle("human_generated.pkl")
gpt_features_df =  pd.read_pickle("ai_generated.pkl")

# Derive TF-IDF Feature
Process: TF-IDF is derived for the human-generated texts and a subset of the AI-generated texts (e.g., basic AI-rephrased texts).

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_top_indices_and_vectorizer(texts, top_n=500):
    """Fit a uni-/bigram TF-IDF vectorizer and select its highest-scoring terms.

    Parameters
    ----------
    texts : iterable of str
        Corpus used both to fit the vectorizer and to rank the terms.
    top_n : int, default 500
        Number of terms to keep. If the vocabulary holds fewer terms, all of
        them are returned.

    Returns
    -------
    tfidf : TfidfVectorizer
        The fitted vectorizer.
    top_indices : list of int
        Column indices of the selected terms, ordered by ascending TF-IDF
        score summed over all documents.
    top_features : list of str
        The uni-/bigrams belonging to ``top_indices``, in the same order.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        # A slice of [-0:] would silently select the whole vocabulary.
        raise ValueError("top_n must be a positive integer")

    # Uni- and bigrams; fitting and transforming in one pass yields the
    # document-term matrix (DTM) of the corpus.
    tfidf = TfidfVectorizer(ngram_range=(1, 2))
    dtm = tfidf.fit_transform(texts)

    # get_feature_names() was removed in scikit-learn 1.2; keep a fallback for
    # releases that predate get_feature_names_out().
    if hasattr(tfidf, "get_feature_names_out"):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()

    # Sum the TF-IDF scores per term across all documents. The sparse sum is
    # flattened to 1-D so no dense copy of the full matrix is ever needed.
    sum_tfidf = np.asarray(dtm.sum(axis=0)).ravel()

    # argsort is ascending, so the last `top_n` positions are the best terms.
    top_indices = np.argsort(sum_tfidf)[-top_n:].tolist()
    top_features = [str(feature_names[i]) for i in top_indices]

    return tfidf, top_indices, top_features

def calc_tfidf(text, vectorizer, top_indices):
    """Return the TF-IDF scores of one text for the selected term columns.

    The text is transformed with ``vectorizer`` as a one-document corpus; the
    entries of the resulting single row are then read at every position
    listed in ``top_indices`` and returned as a list in that order.
    """
    # One-row document-term matrix for the given text
    row_matrix = vectorizer.transform([text])

    scores = []
    for column in top_indices:
        scores.append(row_matrix[0, column])

    return scores

#### Define Datasets

In [41]:
# Work on copies so the frames loaded from disk stay untouched.
# The loading cell binds the data to `en_wiki_features_df` / `gpt_features_df`;
# the `de_*` names used here before are not defined anywhere in this notebook
# and raised a NameError on a fresh kernel.
main_df = en_wiki_features_df.copy()
gpt_df = gpt_features_df.copy()
gpt_df

#### Define Type of AI-written text

In [65]:
# Value of the "source" column whose AI-written texts are used for TF-IDF
gpt_source = "generated_base"
# Keep only the AI rows that belong to this source
gpt_filtered = gpt_df.loc[gpt_df["source"].eq(gpt_source)]

#### Combine human-generated and AI-written texts
The combined list should contain 100 human-written and 100 AI-generated texts.

In [66]:
# Human-written texts first, followed by the selected AI-written texts
text_list = [*main_df["text"].tolist(), *gpt_filtered["text"].tolist()]
len(text_list)

200

#### Get 500 top uni- and bi-grams

In [67]:
# Fit the vectorizer on the combined corpus; keeps the fitted vectorizer plus
# the column indices and names of the selected top terms for the cells below.
tfidf, top_indices, top_features = get_top_indices_and_vectorizer(text_list)

#### Calculate TF-IDF per text for human-generated texts

In [68]:
# One TF-IDF vector per human-written text, stored in a column named after the
# AI source the vectorizer was fitted with.
main_df[f"tfidf_{gpt_source}"] = main_df["text"].apply(
    lambda document: calc_tfidf(document, tfidf, top_indices)
)
main_df

In [87]:
#main_df.to_pickle("Data/de_wiki_features_df.pkl")

#### Calculate TF-IDF per text for AI-generated texts

In [71]:
# Add the column "tfidf" if it does not exist yet. The per-row list assignment
# below needs an existing object column; without it the cell only worked when
# the loaded pickle already carried "tfidf".
if "tfidf" not in gpt_df.columns:
    gpt_df["tfidf"] = None

# Only rows of the currently selected source are scored; rows of other
# sources keep whatever value they already hold.
selected_texts = gpt_df.loc[gpt_df["source"] == gpt_source, "text"]
for index, text in selected_texts.items():
    # .at stores the whole list in a single cell
    gpt_df.at[index, "tfidf"] = calc_tfidf(text, tfidf, top_indices)

In [73]:
# Inspect the AI frame after the "tfidf" column has been filled in.
gpt_df

Unnamed: 0,title_en,title_language,language,date,category,text,character_count,words_count,source,author,...,ppl_max,ppl_mean,sentence_bert,sentence_bert_dist,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,tfidf
8,Vienna,Wien,de,2023-04-10,geography,"Die Republik Österreich hat neun Bundesländer,...",1020,144,rephrase_base,ChatGPT,...,41.739979,17.835852,"[-0.0060472703, -0.0015850338, 0.012966703, 0....",0.718966,test,train,train,train,train,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,Vienna,Wien,de,2023-04-10,geography,Wien ist Österreichs Bundeshauptstadt und eine...,1167,155,rephrase_expert,ChatGPT,...,25.480711,16.853344,"[-0.021692196, 0.0011809485, 0.02170519, 0.004...",0.703729,test,train,train,train,train,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
10,Vienna,Wien,de,2023-04-10,geography,Wien ist die Hauptstadt von Österreich und lie...,1840,264,generated_base,ChatGPT,...,61.625816,19.912042,"[-0.013687565, -0.0053922576, -0.0005174721, 0...",0.741820,test,train,train,train,train,"[0.0, 0.0, 0.0, 0.030498856144209132, 0.0, 0.0..."
11,Vienna,Wien,de,2023-04-10,geography,"Wien, die österreichische Hauptstadt, ist eine...",1297,189,generated_expert,ChatGPT,...,24.912964,14.705893,"[0.00030970015, -0.023622207, 0.031076685, 0.0...",0.715953,test,train,train,train,train,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.03894820279197, 0...."
16,Himalayas,Himalaya,de,2023-04-10,geography,"Asien beheimatet das höchste Gebirge der Erde,...",888,118,rephrase_base,ChatGPT,...,72.549171,24.495172,"[-0.0005591136, 0.035370592, -0.0149539765, -0...",0.663978,train,train,train,train,train,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1583,Constitution,Verfassung,de,2023-04-11,politics,Die Verfassung! Was für ein wichtiges Thema! D...,1771,247,generated_expert,ChatGPT,...,4847.800781,260.403628,"[0.022768332, 0.0022064543, -0.0016067001, 0.0...",0.727045,train,train,train,train,train,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1596,Electronegativity,Elektronegativität,de,2023-04-11,chemistry,"Die Fähigkeit von Atomen, bindende Elektronenp...",1517,196,rephrase_base,ChatGPT,...,63.900864,26.798258,"[-0.014504041, -0.01202015, -0.009371709, -0.0...",0.632077,train,train,train,train,train,"[0.0, 0.0, 0.0, 0.0, 0.03402081860285245, 0.0,..."
1597,Electronegativity,Elektronegativität,de,2023-04-11,chemistry,"Die Elektronegativität gibt an, wie gut ein At...",949,129,rephrase_expert,ChatGPT,...,68.056511,38.604194,"[-0.014926519, -0.010599102, 0.00010431206, -0...",0.620899,train,train,train,train,train,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1598,Electronegativity,Elektronegativität,de,2023-04-11,chemistry,Die Elektronegativität beschreibt die Fähigkei...,1522,195,generated_base,ChatGPT,...,31.118761,17.95147,"[0.0018844944, -0.016691329, -0.03601046, -0.0...",0.624994,train,train,train,train,train,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [86]:
#gpt_df.to_pickle("Data/de_gpt_features_df.pkl")

## Create Sub-Datasets

In [43]:
# Filter the human-written and the AI-written frame for one language only.
# `wiki_df` is not defined anywhere in this notebook (NameError on a fresh
# kernel); the human-written texts are held in `main_df`.
# NOTE(review): assumes `main_df` has a "language" column like `gpt_df` - confirm.
wiki_de = main_df[main_df['language'] == "de"]

gpt_de = gpt_df[gpt_df['language'] == "de"]
# The string below is inert, disabled code (a per-source split of an undefined
# `news_feature_df`); as the last expression it is only echoed as cell output.
"""
gpt_en_rephr_b = news_feature_df[news_feature_df["source"] == "rephrase_base"]
gpt_en_rephr_e = news_feature_df[news_feature_df["source"] == "rephrase_expert"]
gpt_en_gen_b = news_feature_df[news_feature_df["source"] == "generated_base"]
gpt_en_gen_e = news_feature_df[news_feature_df["source"] == "generated_expert"]"""

'\ngpt_en_rephr_b = news_feature_df[news_feature_df["source"] == "rephrase_base"]\ngpt_en_rephr_e = news_feature_df[news_feature_df["source"] == "rephrase_expert"]\ngpt_en_gen_b = news_feature_df[news_feature_df["source"] == "generated_base"]\ngpt_en_gen_e = news_feature_df[news_feature_df["source"] == "generated_expert"]'

## Create Feature DFs

### Define DF

In [44]:
# Frame the features below are derived for; copied so `gpt_de` stays unchanged.
df = gpt_de.copy()
df

Unnamed: 0,title_en,title_language,language,date,category,text,character_count,words_count,source,author,gpt_feature
8,Vienna,Wien,de,2023-04-10,geography,"Die Republik Österreich hat neun Bundesländer,...",1020,144,rephrase_base,ChatGPT,"Ja, dieser Text wurde von ChatGPT generiert."
9,Vienna,Wien,de,2023-04-10,geography,Wien ist Österreichs Bundeshauptstadt und eine...,1167,155,rephrase_expert,ChatGPT,"Ja, der Text wurde von ChatGPT generiert."
10,Vienna,Wien,de,2023-04-10,geography,Wien ist die Hauptstadt von Österreich und lie...,1840,264,generated_base,ChatGPT,Der Text wurde von einem menschlichen Autor ge...
11,Vienna,Wien,de,2023-04-10,geography,"Wien, die österreichische Hauptstadt, ist eine...",1297,189,generated_expert,ChatGPT,"Ja, dieser Text wurde von ChatGPT generiert."
16,Himalayas,Himalaya,de,2023-04-10,geography,"Asien beheimatet das höchste Gebirge der Erde,...",888,118,rephrase_base,ChatGPT,"Ja, der folgende Text wurde von ChatGPT generi..."
...,...,...,...,...,...,...,...,...,...,...,...
1583,Constitution,Verfassung,de,2023-04-11,politics,Die Verfassung! Was für ein wichtiges Thema! D...,1771,247,generated_expert,ChatGPT,"Ja, der folgende Text wurde von ChatGPT generi..."
1596,Electronegativity,Elektronegativität,de,2023-04-11,chemistry,"Die Fähigkeit von Atomen, bindende Elektronenp...",1517,196,rephrase_base,ChatGPT,"Ja, der folgende Text wurde von ChatGPT generi..."
1597,Electronegativity,Elektronegativität,de,2023-04-11,chemistry,"Die Elektronegativität gibt an, wie gut ein At...",949,129,rephrase_expert,ChatGPT,"Ja, dieser Text wurde von ChatGPT generiert."
1598,Electronegativity,Elektronegativität,de,2023-04-11,chemistry,Die Elektronegativität beschreibt die Fähigkei...,1522,195,generated_base,ChatGPT,"Ja, dieser Text wurde von ChatGPT generiert."


### Language

In [37]:
# Language code passed to the language-dependent helper_functions extractors.
lang = "de"
# Language/locale identifier passed to language_tool_python for the grammar check.
lang_tool_lang = "de-DE"

### Derive Features

In [None]:
import nltk
# FEATURE ChatGPT ANSWER
# The helper returns the frame that replaces `df`.
df = helper_functions.ordinal_gpt_feature(df)


# FEATURE TEXT LENGTH: number of characters, and number of tokens obtained by
# splitting on single blanks
df['character_count'] = df.text.str.len()
df['words_count'] = df.text.apply(lambda x: len(str(x).split(' ')))

# FEATURE TITLE OCCURENCE
df = helper_functions.title_occurence(df)

# FEATURES FOR OCCURENCE OF WORDS
#df = helper_functions.count_word_occurence(df, ["the", "it", "is", "nevertheless", "although", "however", "therefore"], add_blanks=True)

# FEATURE FOR NUMBER OF SENTENCES
df['sentence_count'] = helper_functions.count_sentences_raw_text(df, "hybrid")

# FEATURE AVERAGE NUMBER OF WORDS PER SENTENCE
#df["avg_words_per_sentence"] = helper_functions.words_per_sentence(df)

# FEATURE COUNT OF QUOTATION MARKS
# Counts the ASCII double quote only.
df['quotation_count'] = df['text'].str.count('\"')

# FEATURE COUNT OF UNIQUE WORDS ABSOLUTE
# Everything except letters, blanks and newlines is removed before the text is
# lower-cased and split. `\w` is Unicode-aware for str patterns, so umlauts,
# "ß" and accented letters survive; the former ASCII-only class [^A-Za-z \n]
# deleted them and thereby mangled non-English words ("für" -> "fr").
# For pure ASCII text the result is unchanged.
non_letter_pattern = re.compile(r'[^\w \n]|[\d_]')
df["unique_words_count"] = df.text.apply(
    lambda x: len(set(non_letter_pattern.sub('', x).lower().split()))
)

# FEATURE COUNT OF UNIQUE WORDS ABSOLUTE RELATIVE TO ALL WORDS IN TEXT
df["unique_words_relative"] = df["unique_words_count"] / df["words_count"]

# FEATURE COUNT OF SPECIAL CHARACTERS
# Characters matched by `pattern` are excluded from the count: they are removed
# from the lower-cased text and the length of the remainder is the feature.
pattern = r'[0-9a-z.?¿!¡,\n çñáãâàîïíìóôòéèêúûùäöüß]'
df["special_char_count"] = df["text"].apply(
    lambda text: len(re.sub(pattern, '', text.lower()))
)

# The helper returns the frame that replaces `df`.
df = helper_functions.add_flesch_scores(df)

# Personal pronouns, once in "rel" and once in "abs" mode
for pronoun_column, pronoun_mode in (
    ("personal_pronoun_relative", "rel"),
    ("personal_pronoun_count", "abs"),
):
    df[pronoun_column] = df["text"].apply(
        lambda text, mode=pronoun_mode: helper_functions.count_personal_pronouns(text, mode, lang)
    )

# Paragraph statistics: the helper result per text is spread over four columns
# via a temporary "stats" column, which is removed again afterwards.
paragraph_stat_columns = [
    "words_per_paragraph_mean",
    "words_per_paragraph_stdev",
    "sentences_per_paragraph_mean",
    "sentences_per_paragraph_stdev",
]
df["stats"] = df["text"].apply(helper_functions.calculate_paragraph_stats)
df[paragraph_stat_columns] = pd.DataFrame(df["stats"].tolist(), index=df.index)
df = df.drop(columns=["stats"])

df["punctuation_count"] = df["text"].apply(helper_functions.count_punctuation)

df["paragraph_count"] = df["text"].apply(helper_functions.count_paragraphs)

df["pos_per_sentence_mean"] = df["text"].apply(
    lambda text: helper_functions.get_avg_pos_types(text, lang)
)

# Sentence statistics: same temporary-column pattern as for the paragraphs
sentence_stat_columns = [
    "unique_words_per_sentence_mean",
    "unique_words_per_sentence_stdev",
    "words_per_sentence_mean",
    "words_per_sentence_stdev",
]
df["stats"] = df["text"].apply(helper_functions.get_sentence_stats)
df[sentence_stat_columns] = pd.DataFrame(df["stats"].tolist(), index=df.index)
df = df.drop(columns=["stats"])

df["uppercase_letters_relative"] = df["text"].apply(helper_functions.uppercase_percentage)
df["discourse_marker_count"] = df["text"].apply(
    lambda text: helper_functions.discourse_marker_count(text, lang)
)
df["stop_word_count"] = df["text"].apply(
    lambda text: helper_functions.count_stopwords(text, lang)
)
df["multi_blank_count"] = df["text"].apply(helper_functions.count_double_blanks)

import language_tool_python

# FEATURE LANGUAGE FINDINGS
if lang in ("en", "fr", "es"):
    # Check each text as a whole with a LanguageTool instance.
    tool = language_tool_python.LanguageTool(lang_tool_lang)
    df['grammar_error_count'] = df.text.apply(lambda x: len(tool.check(x)))
else:
    # Other languages go through the public API, one sentence per request;
    # the findings of all sentences of a text are added up.
    tool = language_tool_python.LanguageToolPublicAPI(lang_tool_lang)
    df["grammar_error_count"] = None
    for index, row in df.iterrows():
        print(index)  # progress indicator
        # NOTE(review): sent_tokenize is called with its default language
        # although `lang` may differ - confirm this is intended.
        sentences = sent_tokenize(row.text)
        df.at[index, 'grammar_error_count'] = sum(
            len(tool.check(sentence)) for sentence in sentences
        )
    # The column starts out as None (object dtype); cast it so both branches
    # deliver the same integer dtype for this feature.
    df["grammar_error_count"] = df["grammar_error_count"].astype(int)

# FEATURE SENTIMENT
# The helper result per text is spread over two columns via a temporary
# "sentiment" column, which is removed again afterwards.
sentiment_columns = ["sentiment_polarity", "sentiment_subjectivity"]
df["sentiment"] = df["text"].apply(
    lambda text: helper_functions.get_sentiment(text, lang)
)
df[sentiment_columns] = pd.DataFrame(df["sentiment"].tolist(), index=df.index)
df = df.drop(columns=["sentiment"])


# FEATURE PERPLEXITY
# The helper returns the frame that replaces `df`.
df = helper_functions.add_perplexity(df, lang)

# FEATURE SENTENCE EMBEDDINGS
# The helper receives each text together with the sentence-transformer model;
# its result is spread over "sentence_bert" and "sentence_bert_dist".
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')
sentence_vector_columns = ["sentence_bert", "sentence_bert_dist"]
df["sent_vec_stats"] = df["text"].apply(
    lambda text: helper_functions.sentence_vector_mean_vector_and_distance(text, model)
)
df[sentence_vector_columns] = pd.DataFrame(df["sent_vec_stats"].tolist(), index=df.index)
df = df.drop(columns=["sent_vec_stats"])


In [None]:
# Inspect the frame with all derived feature columns.
df

In [50]:
# Persist the derived features. The file name is built from `lang` so that a run
# for one language (e.g. "de") cannot overwrite the features of another one;
# the previously hard-coded "en" prefix did exactly that.
df.to_pickle(f"Data/{lang}_gpt_features_df.pkl")