In diesem Notebook wird der Datensatz merged_fake_real.csv verwendet und untersucht, wie man Fake-Reviews von echten Reviews unterscheiden kann. Zuerst wird der Datensatz so vorbereitet, dass daraus Features generiert werden können.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import urllib


# Speziell für Text-Feature-Extraktion
import emoji
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import string
from nltk import pos_tag, word_tokenize
from nltk.corpus import *
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')
from textblob import TextBlob
from transformers import pipeline
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt

# Speziell für Bild-Feature-Extraktion
from PIL import Image
import requests
from fastai.vision.all import *
from fastdownload import download_url
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn as nn
import numpy as np
from sklearn.decomposition import PCA


pd.set_option("display.max_rows", None)

### Datensatz für die Feature Extraction vorbereiten

Finale Zusammensetzung für base_for_feature_extraction.csv aus real und fake Datensatz


In [None]:
# Load the merged fake/real review data and report its size.
df = pd.read_csv("merged_fake_real.csv")
print(f"Base Datensatz: {df.shape}")

# Columns that are not needed for feature extraction.
# drop() raises a KeyError if one of them is missing from the input file.
columns_to_drop = [
    "index_fake",
    "org_text",
    "org_stars",
    "sent_score_0",
    "sent_v2",
    "sent_v3",
    "sent_v3.1",
    "prompt_v3",
    "website",
    "dalle_prompt",
    "prompt_v2",
    "gpt3_v2",
    "gpt3_v3",
    "gpt3_v3.1",
    "prompt_v4",
    "org_reviewId",
    "sent_v4",
    "keywords",
    "keywords_only",
    "text_length",
    "reviewImageUrls/3",
    "reviewerPhotoUrl",
    "reviewerUrl",
    "reviewerId",
    "temporarilyClosed",
    "reviewsCount",
    "street",
    "state",
    "totalScore",
    "subTitle",
    "description",
    "price",
    "sentiment",
]
df = df.drop(columns=columns_to_drop)

# Final column order of the base data set. reindex() keeps only these
# columns and fills any column that does not exist with NaN.
base_columns = [
    "label",
    "reviewId",
    "placeId",
    "reviewUrl",
    "url",
    "title",
    "categoryName",
    "genre",
    "text",
    "stars",
    "publishedAtDate",
    "likesCount",
    "name",
    "isLocalGuide",
    "reviewerNumberOfReviews",
    "reviewImageUrls/0",
    "reviewImageUrls/1",
    "reviewImageUrls/2",
]
df = df.reindex(columns=base_columns)

print(f"Reduzierter Datensatz als neue Basis für FE: {df.shape}")
print(df.columns)
print(df.dtypes)

# Persist the reduced data set in both formats.
for writer_name, target in (
    ("to_csv", "base_for_feature_extraction.csv"),
    ("to_excel", "base_for_feature_extraction.xlsx"),
):
    getattr(df, writer_name)(target, index=False)
df.to_excel("base_for_feature_extraction.xlsx", index=False)


# Tabular: Feature Generierung

1. aus publishedAt das bestmögliche rausholen

when_on_day_4hbin:
Midnight: 0-4 hours
Early morning: 4-8 hours
Morning: 8-12 hours
Early afternoon: 12-16 hours
Late afternoon: 16-20 hours
Evening: 20-24 hours


In [None]:
# Derive calendar/time features from the review publication timestamp.
df = pd.read_csv("base_for_feature_extraction.csv")
print(df["publishedAtDate"][977])

df["publishedAtDate"] = pd.to_datetime(
    df["publishedAtDate"], format="%Y-%m-%dT%H:%M:%S"
)
published = df["publishedAtDate"]

df["year"] = published.dt.year
df["month"] = published.dt.month
df["dayofweek"] = published.dt.dayofweek
# pd.Timestamp.today() is used instead of datetime.today(): `datetime` is not
# imported by name in this notebook, while pandas is.
# NOTE: this feature depends on the day the notebook is executed.
df["elapsed_days"] = (pd.Timestamp.today() - published).dt.days
# Six equally sized, left-closed 4-hour bins: [0,4) -> 0, [4,8) -> 1, ...,
# [20,24) -> 5. The previous right-closed bins starting at -1 put hours 0-4
# (five hours) into bin 0 and only hours 21-23 into bin 5.
df["when_on_day_4hbin"] = pd.cut(
    published.dt.hour,
    bins=[0, 4, 8, 12, 16, 20, 24],
    labels=[0, 1, 2, 3, 4, 5],
    right=False,
)

df["when_on_day_hour"] = published.dt.hour

# Sanity check: rows whose hour could not be assigned to a bin.
print(df["when_on_day_4hbin"].isna().sum())
print(df.loc[df["when_on_day_4hbin"].isna(), "publishedAtDate"])

print(
    df[
        [
            "publishedAtDate",
            "year",
            "month",
            "dayofweek",
            "elapsed_days",
            "when_on_day_4hbin",
            "when_on_day_hour",
        ]
    ].head(10)
)

df.to_csv("feature_enriched_tab.csv", index=False)
df.to_excel("feature_enriched_tab.xlsx", index=False)


# Bild: Feature Generierung

Features über eine pretrained ResNet-18-Architektur extrahieren und in tabellarischer Form im Dataframe abspeichern.


In [None]:
# Reload the tabular features and select the column holding the first review image URL.
df = pd.read_csv("feature_enriched_tab.csv")
image_urls = df["reviewImageUrls/0"]

In [None]:
# Load a pretrained ResNet-18.
resnet = models.resnet18(pretrained=True)

# Remove the last child module (the classification layer) so the network
# outputs features instead of class scores, then switch to inference mode.
modules = list(resnet.children())[:-1]
resnet = torch.nn.Sequential(*modules)
resnet.eval()

def extract_image_features(image_url):
    """Download an image and return its feature vector from the global ``resnet``.

    Args:
        image_url: HTTP(S) URL of the image.

    Returns:
        A NumPy array with the squeezed output of ``resnet`` for the image
        (the calling cell expects 512 values).

    Raises:
        requests.RequestException: on network errors, timeouts or HTTP error
            status codes.
        OSError: if the downloaded payload is not a readable image.
    """
    from io import BytesIO  # local import: keeps the cell self-contained

    # A timeout prevents one hanging download from blocking the whole loop;
    # raise_for_status() turns HTTP errors into exceptions instead of trying
    # to decode an error page as an image.
    response = requests.get(image_url, timeout=30)
    response.raise_for_status()

    # Force three channels: RGBA or grayscale images would otherwise fail in
    # the 3-channel Normalize below.
    img = Image.open(BytesIO(response.content)).convert("RGB")
    # NOTE(review): the image is not resized/cropped before the forward pass;
    # confirm that feeding the original resolution is intended.
    tensor = transforms.ToTensor()(img)
    tensor = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])(tensor)
    batch = tensor.unsqueeze(0)

    # Feature extraction happens here; no gradients are needed.
    with torch.no_grad():
        features = resnet(batch)

    return features.squeeze().numpy()

In [None]:
# Extract one feature vector per image URL. A failed download or decode
# yields a NaN vector so the rows stay aligned with `df`.
counter = 0
feature_vectors = []
for counter, image_url in enumerate(image_urls, start=1):
    print(f'{counter}:\tExtracting features from {image_url}')
    try:
        features = extract_image_features(image_url)
    except Exception as err:
        # `Exception` instead of a bare except: Ctrl+C (KeyboardInterrupt)
        # can still abort the long-running loop, and the cause is reported.
        print(f'Error extracting features from {image_url}. Replaces with NaN. ({err!r})')
        features = np.full((512,), np.nan)
    feature_vectors.append(features)

feature_df = pd.DataFrame(feature_vectors, columns=[f'feature_{i}' for i in range(512)])

In [None]:
# Append the image feature columns to the tabular data column-wise
# (rows are aligned on the index) and persist the result as CSV and Excel.
new_df = pd.concat([df, feature_df], axis=1)
print("added features to the original dataset.")

new_df.to_csv('feature_enriched_tab_img.csv', index=False)
new_df.to_excel('feature_enriched_tab_img.xlsx', index=False)

## Feature für die 2. Bilder werden ebenfalls extrahiert

Ziel: Prüfen, ob die Detektion auch mit den Bildern aus reviewImageUrls/2 genauso gut funktioniert, die mit einem anderen Prompt für Dalle-2 erzeugt wurden. 
Antwort: Ja! Auch hier ähnlich hohe Vorhersagekraft

In [2]:
# Load the preprocessed feature table once (it was previously read twice).
df = pd.read_csv("features_enriched_tab_img_text_preproc.csv")

# Widen the pandas display limits for inspecting long URLs and many columns.
pd.set_option("display.width", 1000)
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", 1000)

print(df.columns[:20])
print(df["label"][719])

# Rows with label == 0 (presumably the real reviews - confirm) get the URL of
# their first image copied into the second-image column, so every row has a
# URL in "reviewImageUrls/2".
mask = df["label"] == 0
df.loc[mask, "reviewImageUrls/2"] = df.loc[mask, "reviewImageUrls/0"]
image_urls = df["reviewImageUrls/2"]

print(df["reviewImageUrls/2"])

Index(['label', 'reviewId', 'placeId', 'reviewUrl', 'url', 'title', 'categoryName', 'genre', 'text', 'tab_star', 'publishedAtDate', 'tab_likesCount', 'name', 'tab_isLocalGuide', 'tab_reviewerNumberOfReviews', 'reviewImageUrls/0', 'reviewImageUrls/1', 'reviewImageUrls/2', 'tab_year', 'tab_month'], dtype='object')
0
0        https://raw.githubusercontent.com/MichaelSeitz98/seminararbeit_review_detection/main/02_Images/fake_image_url_2/0_ChZDSUhNMG9nS0VJQ0FnSURJanBQb1pBEAEF_fake_reviewImageUrls_2.png
1       https://raw.githubusercontent.com/MichaelSeitz98/seminararbeit_review_detection/main/02_Images/fake_image_url_2/1_ChdDSUhNMG9nS0VJQ0FnSUNZLWRHY3VRRRABF_fake_reviewImageUrls_2.png
2       https://raw.githubusercontent.com/MichaelSeitz98/seminararbeit_review_detection/main/02_Images/fake_image_url_2/2_ChdDSUhNMG9nS0VJQ0FnSUNtc04zQXNRRRABF_fake_reviewImageUrls_2.png
3        https://raw.githubusercontent.com/MichaelSeitz98/seminararbeit_review_detection/main/02_Images/fake_image_url_2/3_

In [3]:
# Pretrained ResNet-18 without its last child module (the classification
# layer), switched to inference mode.
resnet = models.resnet18(pretrained=True)
modules = list(resnet.children())[:-1]
resnet = torch.nn.Sequential(*modules)
resnet.eval()

def extract_image_features(image_url):
    """Download an image and return its feature vector from the global ``resnet``.

    Args:
        image_url: HTTP(S) URL of the image.

    Returns:
        A NumPy array with the squeezed output of ``resnet`` for the image
        (the calling cell expects 512 values).

    Raises:
        requests.RequestException: on network errors, timeouts or HTTP error
            status codes.
        OSError: if the downloaded payload is not a readable image.
    """
    from io import BytesIO  # local import: keeps the cell self-contained

    # A timeout prevents one hanging download from blocking the whole loop;
    # raise_for_status() turns HTTP errors into exceptions instead of trying
    # to decode an error page as an image.
    response = requests.get(image_url, timeout=30)
    response.raise_for_status()

    # Force three channels: RGBA or grayscale images would otherwise fail in
    # the 3-channel Normalize below.
    img = Image.open(BytesIO(response.content)).convert("RGB")
    tensor = transforms.ToTensor()(img)
    tensor = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])(tensor)
    batch = tensor.unsqueeze(0)
    with torch.no_grad():
        features = resnet(batch)

    return features.squeeze().numpy()



**Die Features der zweiten Bilder, die unter reviewImageUrls/2 gespeichert sind, werden als "img_feature_0" bis "img_feature_511" abgelegt.**

In [4]:
# Extract one feature vector per second-image URL. A failed download or
# decode yields a NaN vector so the rows stay aligned with `df`.
counter = 0
feature_vectors = []
for counter, image_url in enumerate(image_urls, start=1):
    print(f'{counter}:\tExtracting features from {image_url}')
    try:
        features = extract_image_features(image_url)
    except Exception as err:
        # `Exception` instead of a bare except: Ctrl+C (KeyboardInterrupt)
        # can still abort the long-running loop, and the cause is reported.
        print(f'Error extracting features from {image_url}. Replaces with NaN. ({err!r})')
        features = np.full((512,), np.nan)
    feature_vectors.append(features)

feature_df = pd.DataFrame(feature_vectors, columns=[f'img_feature_{i}' for i in range(512)])

1:	Extracting features from https://raw.githubusercontent.com/MichaelSeitz98/seminararbeit_review_detection/main/02_Images/fake_image_url_2/0_ChZDSUhNMG9nS0VJQ0FnSURJanBQb1pBEAEF_fake_reviewImageUrls_2.png
2:	Extracting features from https://raw.githubusercontent.com/MichaelSeitz98/seminararbeit_review_detection/main/02_Images/fake_image_url_2/1_ChdDSUhNMG9nS0VJQ0FnSUNZLWRHY3VRRRABF_fake_reviewImageUrls_2.png
3:	Extracting features from https://raw.githubusercontent.com/MichaelSeitz98/seminararbeit_review_detection/main/02_Images/fake_image_url_2/2_ChdDSUhNMG9nS0VJQ0FnSUNtc04zQXNRRRABF_fake_reviewImageUrls_2.png
4:	Extracting features from https://raw.githubusercontent.com/MichaelSeitz98/seminararbeit_review_detection/main/02_Images/fake_image_url_2/3_ChZDSUhNMG9nS0VJQ0FnSUNvb05hU1pREAEF_fake_reviewImageUrls_2.png
5:	Extracting features from https://raw.githubusercontent.com/MichaelSeitz98/seminararbeit_review_detection/main/02_Images/fake_image_url_2/4_ChZDSUhNMG9nS0VJQ0FnSUQyazc3ZE53

In [5]:
# Append the second-image feature columns column-wise (rows aligned on the
# index) and persist the combined table.
new_df = pd.concat([df, feature_df], axis=1)
print("added features to the original dataset.")
new_df.to_csv('feature_enriched_tab_img2_text_preproc.csv', index=False)

added features to the original dataset.
label                                                                                                                                                                                                               0
reviewId                                                                                                                                                                         ChdDSUhNMG9nS0VJQ0FnSURXaV9mQ2lnRRAB
placeId                                                                                                                                                                                   ChIJ4XfXK_G8woARbUYxTMpyDo8
reviewUrl          https://www.google.com/maps/reviews/data=!4m8!14m7!1m6!2m5!1sChdDSUhNMG9nS0VJQ0FnSURXaV9mQ2lnRRAB!2m1!1s0x0:0x8f0e72ca4c31466d!3m1!1s2@1:CIHM0ogKEICAgIDWi_fCigE%7CCgwI8c2qkgYQiNvSywE%7C?hl=en-US
url                 https://www.google.de/maps/place/Hotel+Bel-Air/@34.0049838,-118.5864972,10z/data=!4m

## Verschiedene Visualisierung der Bild-Feature-Extraktion

1. Visualisierung des 512-dimensionalen Feature-Vektors mit PCA für Intuition 
2. Visualisierung des Extraktionsprozesses eines gefakten Beispielbildes (von reviewImageUrls/2) während des CNN-Durchlaufs 
3. Visualisierung der extrahierten Features
4. Visualisierung der Filteroperationen des CNNs 

### Visualisierung des 512-dimensionalen Feature-Vektors mit PCA für Intuition 

In [None]:
df = pd.read_csv('feature_enriched_tab_img.csv')

# Drop this row: its image URL is corrupt, so no features could be extracted.
df = df.drop(764)
# Keep only the label (first column) and the image features (column 24 onwards).
features = df.iloc[:, 0:1].join(df.iloc[:, 24:])
features = features.dropna(subset=features.columns[1:], how='all')

color_map = {'fake': 'red', 'real': 'green'}
colors = features['label'].apply(lambda x: color_map[x])

# Run PCA to reduce the features to 2 dimensions.
pca = PCA(n_components=2)
pca_features = pca.fit_transform(features.iloc[:, 1:])

plt.figure(figsize=(8, 6))
plt.scatter(pca_features[:, 0], pca_features[:, 1], c=colors, alpha=0.5, s=50)
plt.xlabel('PCA Komponente 1', fontsize=14)
plt.ylabel('PCA Komponente 2', fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=12)
plt.tick_params(axis='both', which='minor', labelsize=10)
plt.grid(alpha=0.2)
# Empty scatter plots serve as legend handles for the two classes.
plt.legend(handles=[plt.scatter([], [], c='red', label='Fake Bild', alpha=0.5, s=50),
                     plt.scatter([], [], c='green', label='Echtes Bild', alpha=0.5, s=50)],
           loc='upper right', fontsize=12)
# Build the path with os.path.join: the former literal used backslashes,
# which are invalid escape sequences in a normal string and only form a
# directory path on Windows.
graphics_dir = os.path.join('02_Images', 'graphics')
os.makedirs(graphics_dir, exist_ok=True)
plt.savefig(os.path.join(graphics_dir, 'pca_feature_extraction.svg'), format='svg')
plt.show()

### Visualisierung des Extraktionsprozesses während CNN-Durchlauf 

Visualisierung der Bilder nach den angewendeten Operationen für jede Layer anhand eines Beispielbildes


In [None]:
resnet = models.resnet18(pretrained=True)
# Inference mode: otherwise BatchNorm uses (and updates) batch statistics
# during the visualisation forward passes.
resnet.eval()


def visualize_layer_features(layer_num, input_img):
    """Save the first 64 feature maps of one ResNet stage as an 8x8 grid.

    Args:
        layer_num: Zero-based stage index; ``0..3`` select ``resnet.layer1``
            to ``resnet.layer4``. (Indexing ``resnet.layer1[layer_num]`` as
            before raised an IndexError for the indices 2 and 3 used by the
            calling loop.)
        input_img: Preprocessed image batch that is fed through ``resnet``.

    Side effects:
        Writes ``layerWW{layer_num}.png`` to the current working directory.
    """
    layer = getattr(resnet, f"layer{layer_num + 1}")

    outputs = []

    def hook(module, input, output):
        outputs.append(output)

    handle = layer.register_forward_hook(hook)
    try:
        # Forward pass through the model; gradients are not needed.
        with torch.no_grad():
            resnet(input_img)
    finally:
        # Remove the hook so repeated calls do not stack hooks on the model.
        handle.remove()
    feature_maps = outputs[0].detach().numpy()

    # Plot the feature maps as a grid.
    fig, axs = plt.subplots(nrows=8, ncols=8, figsize=(12, 12))
    for channel, ax in enumerate(axs.flatten()):
        ax.imshow(feature_maps[0, channel, :, :], cmap='gray')
        ax.axis('off')
    plt.tight_layout()

    fig.savefig(f"layerWW{layer_num}.png")
    plt.close(fig)


# Anhand eines durch Dalle-2 erzeugten Beispielbildes die Feature Maps visualisieren

# Visualise the feature maps for one example image generated by Dalle-2.
img_path = '02_Images/fake_image_url_2/43_ChZDSUhNMG9nS0VJQ0FnSURRdTh6UEJnEAEF_fake_reviewImageUrls_2.png'
img = Image.open(img_path)

# Resize, centre-crop, convert to a tensor and normalise with the given
# channel means and standard deviations.
preprocessing_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(preprocessing_steps)
# Add the batch dimension expected by the model.
input_img = transform(img).unsqueeze(0)

if not os.path.exists('feature_maps'):
    os.makedirs('feature_maps')

# Visualise the feature maps for each of the four layer indices.
for layer_idx in range(4):
    visualize_layer_features(layer_idx, input_img)


### Visualisierung der extrahierten Features


Ergebnis: siehe 02_Images\visualizations_image_features. Für menschliches Auge keine klaren Muster erkennbar. 

In [None]:
def visualize_and_save_feature(model, feature_index, save_dir, num_iterations=500):
    """Optimise a random image to maximise one output feature and save it.

    Starting from Gaussian noise, the input image is updated with Adam so
    that the mean of ``model(image)[0, feature_index]`` increases.

    Args:
        model: Network applied to a ``(1, 3, 224, 224)`` input tensor.
        feature_index: Index of the output feature to visualise.
        save_dir: Existing directory the PNG is written to.
        num_iterations: Number of optimisation steps.

    Side effects:
        Puts ``model`` into eval mode and writes
        ``feature_{feature_index}.png`` into ``save_dir``.
    """
    model.eval()
    input_image = torch.randn(1, 3, 224, 224, requires_grad=True)
    optimizer = torch.optim.Adam([input_image], lr=0.1)

    for _ in range(num_iterations):
        optimizer.zero_grad()
        output = model(input_image)
        # The optimizer minimises its objective, so the activation is negated
        # to perform gradient ascent. Back-propagating the raw activation, as
        # before, drove the feature towards its minimum instead.
        loss = -output[0, feature_index].mean()
        loss.backward()
        optimizer.step()

    # Undo the channel normalisation and clip to the displayable [0, 1] range.
    channel_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
    channel_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
    image = input_image.detach().numpy()[0]
    image = np.clip(image * channel_std + channel_mean, 0, 1)

    save_path = os.path.join(save_dir, f'feature_{feature_index}.png')
    # imsave expects height x width x channels.
    plt.imsave(save_path, np.transpose(image, (1, 2, 0)))

# Output directory for the generated feature visualisations.
save_dir = '02_Images/visualizations_image_features'
os.makedirs(save_dir, exist_ok=True)

# NOTE(review): `model` is not defined in the visible cells of this notebook -
# presumably the feature extractor is meant; confirm before running.
# NOTE(review): the loop runs once with i == 511 and passes i+1 == 512 as the
# feature index; for a 512-dimensional output (indices 0..511) that is out of
# range - confirm the intended index range.
for i in range(511, 512):
    visualize_and_save_feature(model, i+1, save_dir)

### Visualisierung der Filteroperationen des CNNs 

In [None]:
# Plot the 64 filters of the first convolution layer as an 8x8 grid.
resnet = torch.hub.load('pytorch/vision', 'resnet18', pretrained=True)

conv1 = resnet.conv1
fig, axs = plt.subplots(8, 8, figsize=(10, 10))

# axs.flat iterates row by row, i.e. position (i, j) shows filter i * 8 + j.
for filter_idx, ax in enumerate(axs.flat):
    # Reorder channels x height x width to height x width x channels.
    filter_img = conv1.weight.data[filter_idx].cpu().numpy().transpose(1, 2, 0)
    # Rescale each filter to [0, 1]: imshow clips float RGB data outside that
    # range, which hides negative weights and most of the filter structure.
    lowest, highest = filter_img.min(), filter_img.max()
    if highest > lowest:
        filter_img = (filter_img - lowest) / (highest - lowest)
    else:
        filter_img = np.zeros_like(filter_img)
    ax.imshow(filter_img)
    ax.axis('off')

plt.show()

# Text: Feature Extraktion

1. Syntaktische Features über das NLTK-Paket extrahieren, z. B. Länge, Adverbien, Groß-/Kleinschreibung, Rechtschreibung, Smiley-Nutzung usw.
2. Semantische Features wie Sentiment (XLM-RoBERTa) oder LDA-Topics


In [None]:
# Hugging Face sentiment-analysis pipeline; model and tokenizer are both
# loaded from the same checkpoint name.
model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)

In [None]:
# Reload the table with tabular and image features as the basis for the text features.
df = pd.read_csv("feature_enriched_tab_img.csv")

def entire_capitalized_percentage(text):
    """Return the share of whitespace-separated words written fully in upper case.

    The pronoun "I" is never counted as capitalised, but it still counts
    towards the total number of words.

    Args:
        text: Review text.

    Returns:
        Number of upper-case words divided by the number of words, or 0 for
        text without any words (this case raised ZeroDivisionError before).
    """
    words = text.split()
    if not words:
        return 0
    num_capitalized = sum(
        1 for word in words if word.lower() != "i" and word.isupper()
    )
    return num_capitalized / len(words)

def count_emojis(text):
    """Count the characters of ``text`` that are keys of ``emoji.EMOJI_DATA``."""
    return sum(1 for character in text if character in emoji.EMOJI_DATA)

def emojji_per_word_ratio(text):
    """Return emoji characters per whitespace-separated word (0 if there are no words)."""
    n_emojis = sum(1 for character in text if character in emoji.EMOJI_DATA)
    n_words = len(text.split())
    return n_emojis / n_words if n_words else 0
    
def avg_word_length(text):
    """Return the mean length of the whitespace-separated words, or 0 if there are none."""
    tokens = text.split()
    if not tokens:
        return 0
    return sum(map(len, tokens)) / len(tokens)

def avg_sentence_length(text):
    """Return the mean number of whitespace-separated words per sentence.

    Sentences are split with ``nltk.sent_tokenize``; 0 is returned when no
    sentence is found.
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return 0
    words_per_sentence = [len(sentence.split()) for sentence in sentences]
    return sum(words_per_sentence) / len(sentences)

def extract_pos_tags(text):
    """Tokenise ``text`` with NLTK and return the result of ``nltk.pos_tag`` on the tokens."""
    tokens = nltk.word_tokenize(text)
    return nltk.pos_tag(tokens)

def perform_sentiment_analysis(text):
    """Return the label of the first prediction of ``sentiment_task`` for ``text``.

    If the pipeline raises a RuntimeError, the text is printed and the
    placeholder label ``"check_manually"`` is returned instead.
    """
    try:
        prediction = sentiment_task(text)
        return prediction[0]['label']
    except RuntimeError:
        print(f"RuntimeError bei Text:    {text}")
        return "check_manually"
   
def text_spelling_error_quota(text):
    """Return the share of words that TextBlob's spellcheck flags.

    A word is counted when the confidence of the first ``spellcheck()``
    candidate is falsy (0.0).
    NOTE(review): words whose top suggestion differs from the word itself are
    therefore presumably not counted as errors - confirm this is intended.

    Args:
        text: Review text.

    Returns:
        Flagged words divided by all words, or 0 for text without any words
        (this case raised ZeroDivisionError before).
    """
    words = TextBlob(text).words
    if not words:
        return 0
    num_errors = sum(not word.spellcheck()[0][1] for word in words)
    return num_errors / len(words)

def calculate_punctuation_ratio(text):
    """Return the per-sentence punctuation share, averaged over all sentences.

    For each sentence the tokens found in ``string.punctuation`` are divided
    by the number of tokens; sentences without tokens are skipped. Returns 0
    when no sentence contributes a ratio.
    """
    sentence_ratios = []
    for sentence in sent_tokenize(text):
        tokens = word_tokenize(sentence)
        if not tokens:
            continue
        punctuation_hits = sum(1 for token in tokens if token in string.punctuation)
        sentence_ratios.append(punctuation_hits / len(tokens))
    if not sentence_ratios:
        return 0
    return sum(sentence_ratios) / len(sentence_ratios)

def count_nouns(text):
    """Count the tokens whose POS tag starts with ``N``."""
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    return sum(1 for _, tag in tagged if tag.startswith('N'))

def count_adjectives(text):
    """Count the tokens tagged ``JJ``, ``JJR`` or ``JJS``."""
    adjective_tags = ('JJ', 'JJR', 'JJS')
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    return sum(1 for _, tag in tagged if tag in adjective_tags)

def count_verbs(text):
    """Count verb-tagged tokens that are not English stop words."""
    english_stopwords = set(stopwords.words('english'))
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    total = 0
    for token, tag in pos_tag(word_tokenize(text)):
        if tag in verb_tags and token.lower() not in english_stopwords:
            total += 1
    return total

def count_adverbs(text):
    """Count adverb-tagged tokens (``RB``, ``RBR``, ``RBS``) that are not English stop words."""
    english_stopwords = set(stopwords.words('english'))
    adverb_tags = ('RB', 'RBR', 'RBS')
    total = 0
    for token, tag in pos_tag(word_tokenize(text)):
        if tag in adverb_tags and token.lower() not in english_stopwords:
            total += 1
    return total

def count_pronouns(text):
    """Count the tokens tagged ``PRP``, ``PRP$``, ``WP`` or ``WP$``."""
    pronoun_tags = ('PRP', 'PRP$', 'WP', 'WP$')
    tagged = pos_tag(word_tokenize(text))
    return sum(1 for _, tag in tagged if tag in pronoun_tags)

def calculate_not_stopword_ratio(text):
    """Return the share of tokens that are not English stop words.

    Args:
        text: Review text.

    Returns:
        Non-stop-word tokens divided by all tokens, or 0 for text without any
        tokens (this case raised ZeroDivisionError before).
    """
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    if not words:
        return 0
    not_stopword_count = sum(1 for word in words if word.lower() not in stop_words)
    return not_stopword_count / len(words)

def calculate_stopword_ratio(text):
    """Return the share of tokens (of the lower-cased text) that are English stop words.

    Args:
        text: Review text.

    Returns:
        Stop-word tokens divided by all tokens, or 0 for text without any
        tokens (this case raised ZeroDivisionError before).
    """
    words = nltk.word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    if not words:
        return 0
    num_stopwords = sum(1 for word in words if word in stop_words)
    return num_stopwords / len(words)

def calulate_stopword_to_nostopword_ratio(text):
    """Return the stop-word ratio divided by the non-stop-word ratio.

    Args:
        text: Review text.

    Returns:
        The quotient of the two ratios. When it is undefined (the text has no
        tokens, or every token is a stop word) 0 is returned, like the other
        ratio features in this notebook do for undefined values, instead of
        raising ZeroDivisionError.
    """
    try:
        return calculate_stopword_ratio(text) / calculate_not_stopword_ratio(text)
    except ZeroDivisionError:
        return 0

def compute_modal_verb_ratio(text):
    """Return modal verbs per non-stop-word token of the lower-cased text.

    Only modal verbs that are not English stop words are counted.
    NOTE(review): some modals (e.g. 'can', 'will', 'should') appear to be part
    of NLTK's English stop-word list and would then never be counted -
    confirm whether the stop-word filter is intended here.

    Args:
        text: Review text.

    Returns:
        Counted modal verbs divided by the number of non-stop-word tokens, or
        0 when no such token exists (this case raised ZeroDivisionError
        before).
    """
    tokens = nltk.word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    modal_verbs = ['can', 'could', 'may', 'might', 'shall', 'should', 'will', 'would', 'must']
    content_tokens = [word for word in tokens if word not in stop_words]
    if not content_tokens:
        return 0
    modal_verb_count = sum(1 for word in content_tokens if word in modal_verbs)
    return modal_verb_count / len(content_tokens)

def compute_uncertain_ratio(text):
    """Return the share of tokens that are listed uncertainty words.

    Tokens come from the lower-cased text; a token counts only if it is in the
    uncertainty list and not an English stop word. Returns 0.0 for text
    without tokens.
    """
    tokens = nltk.word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))
    uncertain_words = ['yet', 'careful', 'hesitant', 'tendency', 'hit', 'undefined', 'ambivalent', 'confused', 'equivocal', 'fuzzy', 'inconclusive', 'indeterminate', 'unclear', 'uncertain', 'unsettled', 'vague']
    if not tokens:
        return 0.0
    hits = sum(
        1 for token in tokens
        if token in uncertain_words and token not in english_stopwords
    )
    return hits / len(tokens)

def count_individual_words(text):
    """Count first-person-singular words (i, me, my, mine, myself), ignoring case.

    Args:
        text: Review text.

    Returns:
        Number of matching tokens.
    """
    # Tokens are lower-cased before the lookup, so the list must be lower case
    # too: the former entry 'I' could never match and "I" was never counted.
    individual_words = ['i', 'me', 'my', 'mine', 'myself']
    tokens = word_tokenize(text)
    return sum(1 for word in tokens if word.lower() in individual_words)

def count_group_words(text):
    """Count first-person-plural words (we, us, our, ours, ourselves), ignoring case."""
    group_words = ['we', 'us', 'our', 'ours', 'ourselves']
    return sum(1 for token in word_tokenize(text) if token.lower() in group_words)

def count_self_words(text):
    """Count the tokens 'self', 'myself' and 'ourselves', ignoring case."""
    self_words = ['self', 'myself', 'ourselves']
    return sum(1 for token in word_tokenize(text) if token.lower() in self_words)

def individual_ratio(text):
    """Return ``count_individual_words(text)`` divided by the number of tokens.

    Returns 0 for text without tokens (this case raised ZeroDivisionError
    before).
    """
    total_words = len(word_tokenize(text))
    if total_words == 0:
        return 0
    return count_individual_words(text) / total_words

def group_ratio(text):
    """Return ``count_group_words(text)`` divided by the number of tokens.

    Returns 0 for text without tokens (this case raised ZeroDivisionError
    before).
    """
    total_words = len(word_tokenize(text))
    if total_words == 0:
        return 0
    return count_group_words(text) / total_words

def self_ratio(text):
    """Return ``count_self_words(text)`` divided by the number of tokens.

    Returns 0 for text without tokens (this case raised ZeroDivisionError
    before).
    """
    total_words = len(word_tokenize(text))
    if total_words == 0:
        return 0
    return count_self_words(text) / total_words

# Column that every text feature is derived from.
review_texts = df['text']

# Text features grouped into stages, in the order they are computed.
# Each stage is (list of (target column, feature function), write a CSV
# checkpoint after the stage?).
feature_stages = [
    (
        [
            ('text_length', lambda x: len(str(x).split())),
            ('text_length_char', len),
            ('text_punctuation', lambda x: len([c for c in str(x) if c in string.punctuation])),
            ('text_avg_word_length', avg_word_length),
            ('text_avg_sentence_length', avg_sentence_length),
            ('text_sentiment', perform_sentiment_analysis),
        ],
        False,
    ),
    (
        [
            ('text_emoji_count', count_emojis),
            ('text_count_verbs', count_verbs),
            ('text_count_adjectives', count_adjectives),
            ('text_count_adverbs', count_adverbs),
            ('text_count_pronouns', count_pronouns),
            ('text_count_nouns', count_nouns),
        ],
        True,
    ),
    (
        [
            ('text_no_stopword_Ratio', calculate_not_stopword_ratio),
            ('text_stopword_ratio', calculate_stopword_ratio),
            ('text_stopword_to_nostopword_ratio', calulate_stopword_to_nostopword_ratio),
            ('text_entired_capitalized_ratio', entire_capitalized_percentage),
            ('text_punctuation_ratio', calculate_punctuation_ratio),
        ],
        True,
    ),
    (
        [
            ('text_spelling_error_quota', text_spelling_error_quota),
            ('text_modal_verb_ratio', compute_modal_verb_ratio),
            ('text_uncertain_ratio', compute_uncertain_ratio),
            ('text_individual_count', count_individual_words),
        ],
        False,
    ),
    (
        [
            ('text_group_count', count_group_words),
            ('text_self_count', count_self_words),
            ('text_individual_ratio', individual_ratio),
            ('text_group_ratio', group_ratio),
            ('text_self_ratio', self_ratio),
        ],
        False,
    ),
]

for stage_number, (column_specs, write_checkpoint) in enumerate(feature_stages, start=1):
    for column_name, feature_fn in column_specs:
        df[column_name] = review_texts.apply(feature_fn)
    if write_checkpoint:
        # Intermediate save so finished stages survive a later failure.
        df.to_csv('features_enriched_tab_img_text.csv', index=False)
    print(f'Fertig mit {stage_number}. Block.')

# Final save with all text features.
df.to_csv('features_enriched_tab_img_text.csv', index=False)

Nachträglich hinzugefügtes Feature: **text_exclamation_mark_count**  und **text_exclamation_mark_ratio** 

In [None]:
# Reload the text-feature table and inspect its shape and content.
df = pd.read_csv('features_enriched_tab_img_text.csv')
print(df.shape)
print(df.columns)
print(df.head(20))

# Number of '!' characters per review, and that count relative to the number
# of words stored in 'text_length'.
# NOTE(review): rows with text_length == 0 produce inf/NaN in the ratio.
df['text_exclamation_mark_count'] = df['text'].apply(lambda x: str(x).count('!'))
df['text_exclamation_mark_ratio'] = df['text_exclamation_mark_count'] / df['text_length']

# Spot checks: all rows, one specific review, and the rows containing at least one '!'.
print(df[['text_exclamation_mark_count', 'text', 'text_exclamation_mark_ratio']])
print(df[df['reviewId'] == 'ChZDSUhNMG9nS0VJQ0FnSUMweU5uV1BnEAEF']['text'])
print(df [df['text_exclamation_mark_count'] > 0][['text_exclamation_mark_count', 'text', 'text_exclamation_mark_ratio']])
print(df.shape)

# NOTE(review): the save calls below are commented out, so the two new columns
# are not written back to disk by this cell - confirm whether that is intended.
# df.to_csv('features_enriched_tab_img_text.csv', index=False)
# df.to_excel('features_enriched_tab_img_text.xlsx', index=False)

**Latent Dirichlet Allocation (LDA)** angewendet um Texte in Topics einzuteilen. Anschließende Visualisierung mit *pyLDAvis*

In [None]:
df = pd.read_csv('features_enriched_tab_img_text.csv')


def preprocess_text(text):
    """Tokenise ``text`` with gensim's ``simple_preprocess`` and drop gensim stop words."""
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]


# Bag-of-words corpus of all review texts.
processed_texts = [preprocess_text(text) for text in df['text']]
dictionary = corpora.Dictionary(processed_texts)
bow_corpus = [dictionary.doc2bow(text) for text in processed_texts]
n_topics = 10
# A fixed random_state makes the topics (and therefore the text_topic_*
# features) reproducible between runs.
lda_model = gensim.models.ldamodel.LdaModel(
    bow_corpus, num_topics=n_topics, id2word=dictionary, random_state=42
)

# One row per document with the weight of every topic; topics the model does
# not report for a document get weight 0.
topic_distributions = []
for bow in bow_corpus:
    topic_weights = dict(lda_model[bow])
    topic_distributions.append(
        [topic_weights.get(topic_id, 0) for topic_id in range(n_topics)]
    )

topic_columns = [f"text_topic_{i}" for i in range(n_topics)]
topic_df = pd.DataFrame(topic_distributions, columns=topic_columns)
# This cell reads and writes the same CSV: drop topic columns left over from
# an earlier run so that re-running does not create duplicate columns.
df = df.drop(columns=topic_columns, errors='ignore')
df = pd.concat([df, topic_df], axis=1)

print(df[topic_columns].head(20))

vis = gensimvis.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

pyLDAvis.save_html(vis, '02_Images/graphics/lda_visualization3.html')
print(df.shape)
df.to_csv('features_enriched_tab_img_text.csv', index=False)