In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("alessandrasala79/ai-vs-human-generated-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/alessandrasala79/ai-vs-human-generated-dataset?dataset_version_number=4...


100%|██████████| 9.76G/9.76G [06:20<00:00, 27.5MB/s]

Extracting files...





Path to dataset files: /Users/shrijansshetty/.cache/kagglehub/datasets/alessandrasala79/ai-vs-human-generated-dataset/versions/4


In [2]:
import pandas as pd
import os

def csvToDataframe(csv_path: str) -> pd.DataFrame:
    """
    Return a panda dataframe from a CSV file path. Print errors if reading is unsuccessful.

    Parameters:
    csv path (str): File path of a CSV file.

    Returns:
    df - Panda Dataframe containing CSV file
    """

    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        print(f"Error reading the CSV file: {e}")
        df = pd.DataFrame()  # To not break the following code

    return df

def displayCsv(csv_path: str) -> None:
    """
    Display the name and shape of the CSV file.
    Display the labels and first rows of CSV file.

    Parameters:
    csv path (str): File path of a CSV file.

    Returns:
    None - prints file name, shape, labels and sample rows.

    """

    df = csvToDataframe(csv_path)
    file_name = os.path.basename(csv_path)
    print(f"\n{file_name} has shape: {df.shape}")
    display(df.head())

In [3]:
train_csv = ("./train.csv")
test_csv = ("./test.csv")
df_train = csvToDataframe(train_csv)    # save panda dataframe
df_test = csvToDataframe(test_csv)    # save panda dataframe
displayCsv(train_csv)
displayCsv(test_csv)
class_counts = df_train['label'].value_counts()
print("Class 0 count:", class_counts.get(0, 0))
print("Class 1 count:", class_counts.get(1, 0))


train.csv has shape: (79950, 3)


Unnamed: 0.1,Unnamed: 0,file_name,label
0,0,train_data/a6dcb93f596a43249135678dfcfc17ea.jpg,1
1,1,train_data/041be3153810433ab146bc97d5af505c.jpg,0
2,2,train_data/615df26ce9494e5db2f70e57ce7a3a4f.jpg,1
3,3,train_data/8542fe161d9147be8e835e50c0de39cd.jpg,0
4,4,train_data/5d81fa12bc3b4cea8c94a6700a477cf2.jpg,1



test.csv has shape: (5540, 1)


Unnamed: 0,id
0,test_data_v2/1a2d9fd3e21b4266aea1f66b30aed157.jpg
1,test_data_v2/ab5df8f441fe4fbf9dc9c6baae699dc7.jpg
2,test_data_v2/eb364dd2dfe34feda0e52466b7ce7956.jpg
3,test_data_v2/f76c2580e9644d85a741a42c6f6b39c0.jpg
4,test_data_v2/a16495c578b7494683805484ca27cf9f.jpg


Class 0 count: 39975
Class 1 count: 39975


In [3]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df_train, test_size=0.2, random_state=42)

print("Training set shape:", train_df.shape)
print("Validation set shape:", val_df.shape)

Training set shape: (63960, 3)
Validation set shape: (15990, 3)


In [4]:
from PIL import Image
import random

def check_image_size(df, dataset_name):
    """
    Open a random image from the dataframe and print its size.

    Parameters:
    df (pd.DataFrame): DataFrame containing image file paths.
    dataset_name (str): Name of the dataset (e.g., 'train' or 'validation').

    Returns:
    None
    """
    random_index = random.randint(0, len(df) - 1)
    image_path = df.iloc[random_index]['file_name']
    image = Image.open(image_path)
    print(f"Random image from {dataset_name} set: {image_path}")
    print(f"Image size: {image.size}")

# Check image sizes for train and validation sets
check_image_size(train_df, "train")
check_image_size(val_df, "validation")

Random image from train set: train_data/7f8a3c2dbdf041dd855852784b76ddb8.jpg
Image size: (768, 512)
Random image from validation set: train_data/6918f56a9d2a4786911ae19a590b50c1.jpg
Image size: (768, 512)


In [5]:
import numpy as np 
from tensorflow.keras.layers import Input, Dense, Flatten, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import img_to_array, load_img

# Preprocess images
def preprocess_images(df, image_size=(768, 512)):
    images = []
    for _, row in df.iterrows():
        image_path = row['file_name']
        image = load_img(image_path, target_size=image_size)
        image = img_to_array(image) / 255.0  # Normalize to [0, 1]
        images.append(image)
    return np.array(images)

In [6]:
# Load and preprocess images
train_images = preprocess_images(train_df[:2000])
val_images = preprocess_images(val_df[:500])

In [7]:
# Define the autoencoder model
input_img = Input(shape=(768, 512, 3))
x = Flatten()(input_img)
encoded = Dense(256*256*3, activation='relu')(x)
encoded = Dense(128*128*3, activation='relu')(encoded)
encoded = Dense(64*64*3, activation='relu')(encoded)
decoded = Dense(128 * 128 * 3, activation='relu')(encoded)
decoded = Dense(256 * 256 * 3, activation='relu')(decoded)
decoded = Dense(768 * 512 * 3, activation='sigmoid')(decoded)
decoded = Reshape((768, 512, 3))(decoded)

autoencoder = Model(input_img, decoded)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

# Train the autoencoder
autoencoder.fit(train_images, train_images, epochs=500, batch_size=256, shuffle=True, validation_data=(val_images, val_images))

# Evaluate the autoencoder on the training dataset
decoded_images = autoencoder.predict(train_images)

# Display some original and reconstructed images
import matplotlib.pyplot as plt

n = 10  # Number of images to display
plt.figure(figsize=(20, 4))
for i in range(n):
    # Display original
    ax = plt.subplot(2, n, i + 1)
    plt.imshow(train_images[i])
    plt.title("Original")
    plt.axis('off')

    # Display reconstruction
    ax = plt.subplot(2, n, i + 1 + n)
    plt.imshow(decoded_images[i])
    plt.title("Reconstructed")
    plt.axis('off')
plt.show()

: 

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ashwingupta3012/human-faces")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/ashwingupta3012/human-faces?dataset_version_number=1...


100%|██████████| 1.82G/1.82G [01:13<00:00, 26.6MB/s]

Extracting files...





Path to dataset files: /Users/shrijansshetty/.cache/kagglehub/datasets/ashwingupta3012/human-faces/versions/1


In [4]:
#!kaggle competitions download -c ai-mathematical-olympiad-progress-prize-2

# Download latest version
path = kagglehub.dataset_download("ai-mathematical-olympiad-progress-prize")

print("Path to dataset files:", path)

ValueError: Invalid dataset handle: ai-mathematical-olympiad-progress-prize