In diesem Notebook wird der Datensatz base_fake_real.csv verwendet und untersucht, wie man Fake-Reviews von echten Reviews unterscheiden kann.

In [None]:
import os
import urllib
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
from fastai.vision.all import *
from fastdownload import download_url

## Text:

Über Sprach-Features aus PNLP

## Bild:
Erkennung, ob das Bild ein Fake ist oder nicht, über eine Bildklassifizierung.

In [None]:
# Build the additional training set: reviews that carry at least one image
# and whose reviewId does not occur in the base dataset. The result is written
# to CSV and Excel.
df = pd.read_csv('base_keywords_sentiment_reduced.csv')
print(df.shape)

df_all = pd.read_csv('01_Data/raw_data/dataset_weitere_forschung.csv')
print(df_all.shape)

# The six image-URL columns; defined once and reused for both the column
# selection and the URL rewrite below.
image_urls_cols = [f"reviewImageUrls/{i}" for i in range(6)]

df_all = df_all[['text', 'reviewId', 'url', 'placeId', 'categoryName', 'stars', 'title', *image_urls_cols, 'genre']]
# Keep only reviews that have a first image URL.
df_all = df_all[df_all['reviewImageUrls/0'].notna()]
print(df_all.shape)
# Drop reviews already present in the base dataset. The explicit .copy() makes
# the filtered frame independent, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
df_all = df_all[~df_all['reviewId'].isin(df['reviewId'])].copy()
print(df_all.shape)
print(df_all.columns)

for col in image_urls_cols:
    print(f"Processing col {col}")
    # Swap the 150px size suffix for the 256px one. regex=False forces a
    # literal match regardless of the pandas version's default.
    df_all[col] = df_all[col].str.replace("=w150-h150-k-no-p", "=w256-h256-p-k-no", regex=False)

print(df_all.shape)

df_all.to_csv('train_weitere_forschung_ohne_base.csv', index=False)
df_all.to_excel('train_weitere_forschung_ohne_base.xlsx', index=False)

In [None]:
# Download the first images of each review in the additional training set.
# Relies on `import urllib.request` in the notebook's import cell: a plain
# `import urllib` does not guarantee that the `request` submodule is loaded.
df = pd.read_csv('train_weitere_forschung_ohne_base.csv')

# Rows with an index below this value are skipped as already downloaded.
RESUME_FROM_INDEX = 1509
# Only the first N image-URL columns of each review are fetched.
IMAGES_PER_REVIEW = 2

download_dir = os.path.join('02_Images', 'train_real_images_not_in_base')
# urlretrieve does not create missing directories; without this every
# download would fail when the folder does not exist yet.
os.makedirs(download_dir, exist_ok=True)

for index, row in df.iterrows():
    if index < RESUME_FROM_INDEX:
        print(f"Index {index} schon heruntergeladen, skipped ..")
        continue
    for i in range(IMAGES_PER_REVIEW):
        url = row[f'reviewImageUrls/{i}']
        if pd.isna(url):
            continue
        print(f"Downloading image for {index}...")
        # The last URL path segment is used as the file name.
        filename = f"{url.split('/')[-1]}.png"
        path = os.path.join(download_dir, filename)
        if os.path.exists(path):
            # Already on disk from a previous run.
            continue
        try:
            urllib.request.urlretrieve(url, path)
        except Exception as err:
            # Best effort: report the failure and continue with the next image.
            # `Exception` (instead of a bare except) lets Ctrl-C / kernel
            # interrupts stop the loop.
            print(f'Error with {url}: {err}')

print("Done with downloading real images for training set.")

### Start der Umsetzung mit fastai Visual Learning als Klassifizierungsproblem

In [None]:
# Root folder of the training images used by the cells below.
path = Path('02_Images/train')
# Collect the image files found under that folder.
fns = get_image_files(path)
# Bare expression so the notebook displays the collected file list.
fns

In [None]:
# Run fastai's image verification over the collected files and delete every
# file it returns as failed.
failed = verify_images(fns)
print(failed)
if len(failed) == 0:
    print("All images verified successfully.")
else:
    for broken_file in failed:
        broken_file.unlink()

In [None]:
# Blueprint for the image-classification data:
#   - inputs are images, targets are categories
#   - items come from get_image_files, labels from parent_label
#   - 20% of the items go to the validation set (fixed seed for the split)
#   - every item is resized to 256
classes = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_items=get_image_files,
    get_y=parent_label,
    splitter=RandomSplitter(valid_pct=0.2, seed=42),
    item_tfms=Resize(256),
)

print("Done with creating DataBlock.")

In [None]:
# Build the DataLoaders from the DataBlock, using `path` as the data source.
dls = classes.dataloaders(path)

In [None]:
# Display up to four samples from the validation loader in a single row.
dls.valid.show_batch(max_n=4, nrows=1)
# Print how many items the validation set contains.
print(len(dls.valid.items))

In [47]:
# Configure the transforms used for training: a random resized crop (224,
# at least half of the image) per item, plus the batch-level augmentations
# returned by aug_transforms().
# The transforms passed to new() replace the block's previous item/batch
# transforms, so this is the only configuration whose DataLoaders are built.
classes = classes.new(
    item_tfms=RandomResizedCrop(224, min_scale=0.5),
    batch_tfms=aug_transforms())
dls = classes.dataloaders(path)

In [48]:
# Create a vision learner with a resnet18 architecture on the DataLoaders,
# reporting the error rate as metric.
learn = vision_learner(dls, resnet18, metrics=error_rate)
# Fine-tune with freeze_epochs=1 and epochs=4.
learn.fine_tune(epochs = 4, freeze_epochs=1)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to C:\Users\michi/.cache\torch\hub\checkpoints\resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:02<00:00, 17.7MB/s]

## Tabellarische Daten:



## Erst Text, dann Bildverifizierung