# Real or AI generated images classifier model


In [39]:
from img2vec_pytorch import Img2Vec
from PIL import Image
from arrow import now
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
from os.path import basename

from zipfile import ZipFile

In [32]:
img2vec = Img2Vec(cuda=True, model='resnet-18', layer='default', layer_output_size=512)

FOLDER_AI = 'train/FAKE/'
FOLDER_REAL = 'train/REAL/'

def img_to_array(tag: str, zipped_folder: str = None) -> list:
    result = []
    _IMG_COUNT = 100
    count = 1
    zip_object = ZipFile('datasets/dataset2.zip', 'r')
    zip_object.extractall()
    # zip_object.extract(zipped_folder, path='./datasets')
    for input_file in zip_object.namelist():
        if input_file.startswith(zipped_folder):
            name = basename(input_file)
            try:
                with Image.open(fp=input_file, mode='r') as image:
                    vector = img2vec.get_vec(image, tensor=True).numpy().reshape(512,)
                    result.append(pd.Series(data=[tag, name, vector], index=['tag', 'name', 'value']))
            except Exception as error:
                print("Runtime Error : ", error)
                # pass
            if count >= _IMG_COUNT:
                break # Stops processing images after 10 000 images processed. 
            count += 1
    zip_object.close()
    return result

time_start = now()
ai = img_to_array(zipped_folder=FOLDER_AI, tag='ai')
print('done encoding the AI images in {}'.format(now() - time_start))
real = img_to_array(zipped_folder=FOLDER_REAL, tag='real')
df = pd.DataFrame(data=ai + real)
print('done in {}'.format(now() - time_start))



done encoding the AI images in 0:00:09.314780
done in 0:00:18.429971


In [33]:
df.head()

Unnamed: 0,tag,name,value
0,ai,1000 (10).jpg,"[0.15953541, 1.9148567, 0.6372986, 0.016049122..."
1,ai,1000 (2).jpg,"[0.14412837, 0.27126953, 0.5012313, 0.22757447..."
2,ai,1000 (3).jpg,"[0.8852316, 0.03856516, 0.70647436, 0.7004094,..."
3,ai,1000 (4).jpg,"[0.43705535, 0.38578257, 0.3709294, 0.3298432,..."
4,ai,1000 (5).jpg,"[0.023255296, 0.343142, 2.1546018, 1.9247802, ..."


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tag     200 non-null    object
 1   name    200 non-null    object
 2   value   200 non-null    object
dtypes: object(3)
memory usage: 4.8+ KB


In [35]:
df.shape

(200, 3)

In [36]:
data_columns = df.columns
print(data_columns)
# df.head()

Index(['tag', 'name', 'value'], dtype='object')


In [37]:
for column in data_columns:
    if column != 'value':
        df[column] = df[column].str.strip()

In [38]:
df.head()

Unnamed: 0,tag,name,value
0,ai,1000 (10).jpg,"[0.15953541, 1.9148567, 0.6372986, 0.016049122..."
1,ai,1000 (2).jpg,"[0.14412837, 0.27126953, 0.5012313, 0.22757447..."
2,ai,1000 (3).jpg,"[0.8852316, 0.03856516, 0.70647436, 0.7004094,..."
3,ai,1000 (4).jpg,"[0.43705535, 0.38578257, 0.3709294, 0.3298432,..."
4,ai,1000 (5).jpg,"[0.023255296, 0.343142, 2.1546018, 1.9247802, ..."


In [None]:
# test = []
# for column in dataColumns:
#     test.append() df[column].value_counts()
# # groupedColumns = df.groupby(dataColumns)

In [None]:
df.describe()

Unnamed: 0,tag,name,value
count,200,200,200
unique,2,200,200
top,ai,1000 (10).jpg,"[0.15953541, 1.9148567, 0.6372986, 0.016049122..."
freq,100,1,1


In [None]:
# Heat map
plt.figure(figsize=[10,10])

ct_counts = df.groupby()

<Figure size 1000x1000 with 0 Axes>

<Figure size 1000x1000 with 0 Axes>

### Next task
- Check if the pictures extensions are the same