# Feature Derivation for the Different Datasets

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils.helper_functions as helper_functions
from collections import Counter
import re
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize

### Load Dataset Files

In [None]:
# Dataset domain to process; the file names below are built from it.
# Supported values according to the original note: "wiki" or "news".
domain = "wiki"

# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load pickle files that come from a trusted source.
human_features_df =  pd.read_pickle("{}_human_generated.pkl".format(domain))
gpt_features_df =  pd.read_pickle("{}_chatgpt_generated.pkl".format(domain))

# Derive TF-IDF Feature
Process: the TF-IDF vectorizer is fitted on the human-generated texts together with one subset of the AI-generated texts (e.g., the basic AI-rephrased texts).

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_top_indices_and_vectorizer(texts, top_n=500):
    """Fit a uni-/bigram TF-IDF vectorizer and select its highest-scoring features.

    Args:
        texts: Iterable of raw text documents the vectorizer is fitted on.
        top_n: Number of features to keep, ranked by their TF-IDF score summed
            over all documents. Defaults to 500.

    Returns:
        A tuple ``(tfidf, top_indices, top_features)``:
        the fitted ``TfidfVectorizer``, the column indices of the selected
        features (ascending by summed score, so the strongest feature is
        last), and the n-gram strings belonging to those indices. When the
        vocabulary has fewer than ``top_n`` entries, all of them are returned.

    Raises:
        ValueError: If ``top_n`` is not positive.
    """
    if top_n <= 0:
        # A slice like [-0:] would silently select *every* feature.
        raise ValueError("top_n must be a positive integer")

    # Uni- and bigrams; fit_transform is equivalent to fit() followed by
    # transform() on the same texts, but avoids a second pass.
    tfidf = TfidfVectorizer(ngram_range=(1, 2))
    dtm = tfidf.fit_transform(texts)

    feature_names = tfidf.get_feature_names_out()

    # Summed TF-IDF score per feature as a flat 1-D array. The matrix is kept
    # sparse on purpose: densifying the whole document-term matrix is not
    # needed for the ranking and can be very large.
    summed_scores = np.asarray(dtm.sum(axis=0)).ravel()

    top_indices = summed_scores.argsort()[-top_n:].tolist()
    top_features = [feature_names[i] for i in top_indices]

    return tfidf, top_indices, top_features

def calc_tfidf(text, vectorizer, top_indices):
    """Return the TF-IDF scores of one text for the selected features.

    Args:
        text: The raw document to score.
        vectorizer: A fitted vectorizer whose ``transform`` returns a sparse
            matrix with one row per input document.
        top_indices: Column indices of the features to extract.

    Returns:
        A list with one score per entry of ``top_indices``, in the same order.
    """
    # Transform the single document into a 1 x n_features sparse matrix.
    doc_vector = vectorizer.transform([text])

    # Densify the single row once; indexing a sparse matrix element by
    # element is far slower than indexing a plain array.
    dense_row = doc_vector.toarray().ravel()

    return [dense_row[i] for i in top_indices]

#### Define Datasets

In [None]:
# Work on copies so the frames loaded from disk stay unmodified.
main_df = human_features_df.copy()
gpt_df = gpt_features_df.copy()
# Bare expression: shows gpt_df as the cell output.
gpt_df

#### Define Type of AI-written text

In [None]:
# Which kind of AI-written text to use (value of the "source" column) and
# which language to restrict both datasets to.
gpt_source = "generated_base"
language = "fr"

# Boolean-mask selections are slices of the original frames. New columns are
# assigned to main_df and gpt_filtered further down, so take explicit copies
# here to avoid chained-assignment problems (SettingWithCopyWarning).
main_df = main_df[main_df["language"] == language].copy()
gpt_df = gpt_df[gpt_df["language"] == language]

gpt_filtered = gpt_df[gpt_df["source"] == gpt_source].copy()

In [None]:
# Bare expression: shows the language-filtered human frame as the cell output.
main_df

#### Combine human-generated and AI-written texts
The combined list should contain 100 human-written and 100 AI-written texts (200 in total).

In [None]:
# Human-written texts first, followed by the AI-written texts of the selected source.
text_list = [*main_df["text"].tolist(), *gpt_filtered["text"].tolist()]
# Bare expression: shows the number of combined texts as the cell output.
len(text_list)

#### Get 500 top uni- and bi-grams

In [None]:
# Fit the vectorizer on the combined human + AI texts; keep the fitted
# vectorizer, the indices of the selected features and their n-gram names.
tfidf, top_indices, top_features = get_top_indices_and_vectorizer(text_list)

#### Calculate TF-IDF per text for human-generated texts

In [None]:
# One list of TF-IDF scores per human-written text, stored in a column
# named after the AI source the vectorizer was fitted with.
main_df["tfidf_{}".format(gpt_source)] = main_df["text"].apply(calc_tfidf, args=(tfidf, top_indices))
# Bare expression: shows the updated frame as the cell output.
main_df

In [None]:
#main_df.to_pickle("Data/de_wiki_features_df.pkl")

#### Calculate TF-IDF per text for AI-generated texts

In [None]:
# Add (or overwrite) the "tfidf" column with each AI text's TF-IDF scores.
# gpt_filtered is a boolean-mask selection of gpt_df; writing into it
# directly is chained assignment (SettingWithCopyWarning), so work on an
# explicit copy.
gpt_filtered = gpt_filtered.copy()

# gpt_filtered only holds rows whose source equals gpt_source (see the
# filter that creates it), so no per-row source check is needed.
gpt_filtered["tfidf"] = gpt_filtered.text.apply(lambda x: calc_tfidf(x, tfidf, top_indices))

In [None]:
# Bare expression: shows the AI-text frame as the cell output.
gpt_filtered

In [None]:
#gpt_df.to_pickle("Data/de_gpt_features_df.pkl")

## Create Sub-Datasets

In [None]:
# Filter df for language only
#language = "en"
human_final = main_df[main_df['language'] == language]

gpt_final = gpt_filtered[gpt_filtered['language'] == language]

# Disabled per-source splits, kept for reference. They were previously wrapped
# in a bare string literal, which the notebook echoed as the cell output.
# NOTE(review): news_feature_df is not defined anywhere in this notebook.
# gpt_en_rephr_b = news_feature_df[news_feature_df["source"] == "rephrase_base"]
# gpt_en_rephr_e = news_feature_df[news_feature_df["source"] == "rephrase_expert"]
# gpt_en_gen_b = news_feature_df[news_feature_df["source"] == "generated_base"]
# gpt_en_gen_e = news_feature_df[news_feature_df["source"] == "generated_expert"]

In [None]:
# Bare expression: shows the final human-text frame as the cell output.
human_final

## Create Feature DFs

### Define DF to create feature for

Note: Create the feature DataFrame for either the human-generated or the AI-generated texts, not for both at once.

In [None]:
# Frame the features are derived for; a copy, so human_final stays unchanged.
# Swap in the AI frame (e.g. gpt_final) here to derive features for AI texts.
df = human_final.copy()
# Bare expression: shows df as the cell output.
df

### Language

In [None]:
# Language code handed to the helper_functions feature helpers below.
# NOTE(review): duplicates `language` from the TF-IDF section; keep both in sync.
lang = "fr"
# Language/locale code handed to the LanguageTool constructor.
lang_tool_lang = "fr-FR"

### Derive Features

In [None]:
# Derive all text features for `df` in place. Requires `df` (with a "text"
# column), `lang` and `lang_tool_lang` to be defined by the cells above.
import nltk
# FEATURE ChatGPT ANSWER
df = helper_functions.ordinal_gpt_feature(df)


df['character_count'] = df.text.str.len()
# NOTE(review): splits on single blanks only, so newlines do not separate
# words and consecutive blanks produce empty "words". Left unchanged because
# other features are defined relative to this count.
df['words_count'] = df.text.apply(lambda x: len(str(x).split(' ')))

# FEATURE TITLE OCCURRENCE
df = helper_functions.title_occurence(df)

# FEATURES FOR OCCURRENCE OF WORDS
#df = helper_functions.count_word_occurence(df, ["the", "it", "is", "nevertheless", "although", "however", "therefore"], add_blanks=True)

# FEATURE FOR NUMBER OF SENTENCES
df['sentence_count'] = helper_functions.count_sentences_raw_text(df, "hybrid")

# FEATURE AVERAGE NUMBER OF WORDS PER SENTENCE
#df["avg_words_per_sentence"] = helper_functions.words_per_sentence(df)

# FEATURE COUNT OF QUOTATION MARKS
df['quotation_count'] = df['text'].str.count('\"')

# FEATURE COUNT OF UNIQUE WORDS ABSOLUTE
# Keep letters, blanks and newlines only, then count distinct lower-cased
# words. str.isalpha() is used instead of the ASCII class [A-Za-z] so that
# accented letters (é, ç, ñ, ü, ...) are not stripped out of non-English
# words; for pure ASCII text the result is unchanged.
df["unique_words_count"] = df.text.apply(
    lambda x: len(set("".join(ch for ch in x if ch.isalpha() or ch in " \n").lower().split()))
)

# FEATURE COUNT OF UNIQUE WORDS ABSOLUTE RELATIVE TO ALL WORDS IN TEXT
df["unique_words_relative"] = df["unique_words_count"] / df["words_count"]

# FEATURE COUNT OF SPECIAL CHARACTERS
pattern = r'[0-9a-z.?¿!¡,\n çñáãâàîïíìóôòéèêúûùäöüß]'  # those are excluded from count -> removed from text
df["special_char_count"] = df.text.apply(lambda x: len(re.sub(pattern,'', x.lower())))

df = helper_functions.add_flesch_scores(df)

df["personal_pronoun_relative"] = df.text.apply(lambda x: helper_functions.count_personal_pronouns(x, "rel", lang))
df["personal_pronoun_count"] = df.text.apply(lambda x: helper_functions.count_personal_pronouns(x, "abs", lang))

# The helper returns four values per text; expand them into four columns.
df["stats"] = df["text"].apply(helper_functions.calculate_paragraph_stats)
df[["words_per_paragraph_mean", "words_per_paragraph_stdev", "sentences_per_paragraph_mean", "sentences_per_paragraph_stdev"]] = pd.DataFrame(df["stats"].tolist(), index=df.index)
# Drop the original 'stats' column
df.drop(columns=["stats"], inplace=True)
df["punctuation_count"] = df.text.apply(helper_functions.count_punctuation)

df["paragraph_count"] = df.text.apply(helper_functions.count_paragraphs)

df["pos_per_sentence_mean"] = df.text.apply(lambda x: helper_functions.get_avg_pos_types(x, lang))

# The helper returns four values per text; expand them into four columns.
df["stats"] = df["text"].apply(helper_functions.get_sentence_stats)
df[["unique_words_per_sentence_mean", "unique_words_per_sentence_stdev", "words_per_sentence_mean", "words_per_sentence_stdev"]] = pd.DataFrame(df["stats"].tolist(), index=df.index)
# Drop the original 'stats' column
df.drop(columns=["stats"], inplace=True)

df["uppercase_letters_relative"] = df.text.apply(helper_functions.uppercase_percentage)
df["discourse_marker_count"] = df.text.apply(lambda x: helper_functions.discourse_marker_count(x, lang))
df["stop_word_count"] = df.text.apply(lambda x: helper_functions.count_stopwords(x, lang))
df["multi_blank_count"] = df.text.apply(helper_functions.count_double_blanks)


import language_tool_python

if lang in ("en", "fr", "es"):
    # FEATURE LANGUAGE FINDINGS
    # Whole texts are checked with a locally constructed LanguageTool instance.
    tool = language_tool_python.LanguageTool(lang_tool_lang)
    try:
        df['grammar_error_count'] = df.text.apply(lambda x: len(tool.check(x)))
    finally:
        # Release the LanguageTool backend even if a check fails; otherwise
        # every re-run of this cell leaves one behind.
        tool.close()
else:
    # Other languages go through the public API and are checked sentence by
    # sentence, summing the findings per text.
    tool = language_tool_python.LanguageToolPublicAPI(lang_tool_lang)
    try:
        df["grammar_error_count"] = None
        for index, text in df["text"].items():
            print(index)  # progress indicator
            error_count = 0
            for sentence in sent_tokenize(text):
                error_count += len(tool.check(sentence))
            df.at[index, 'grammar_error_count'] = error_count
    finally:
        tool.close()

# The helper returns a (polarity, subjectivity) pair per text.
df["sentiment"] = df.text.apply(lambda x: helper_functions.get_sentiment(x, lang))
df[["sentiment_polarity", "sentiment_subjectivity"]] = pd.DataFrame(df["sentiment"].tolist(), index=df.index)
# Drop the original 'sentiment' column
df.drop(columns=["sentiment"], inplace=True)


# FEATURE PERPLEXITY
df = helper_functions.add_perplexity(df, lang)

# FEATURE SENTENCE EMBEDDINGS
# The helper returns two values per text, stored as "sentence_bert" and
# "sentence_bert_dist".
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')
df["sent_vec_stats"] = df.text.apply(lambda x: helper_functions.sentence_vector_mean_vector_and_distance(x, model))
df[["sentence_bert", "sentence_bert_dist"]] = pd.DataFrame(df["sent_vec_stats"].tolist(), index=df.index)
df.drop(columns=["sent_vec_stats"], inplace=True)


In [None]:
# Bare expression: shows the frame with all derived features as the cell output.
df

In [None]:
# df.to_pickle("Data/en_gpt_features_df.pkl")