In [5]:
import kagglehub
import cv2
import numpy as np
from skimage.feature import graycomatrix, graycoprops
from scipy.stats import skew, kurtosis
from skimage import measure
import pandas as pd

# Download latest version
path = kagglehub.dataset_download("alessandrasala79/ai-vs-human-generated-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/ai-vs-human-generated-dataset


In [6]:
def analyze_image(image_path):
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
 
    blurred_image = cv2.GaussianBlur(image, (5, 5), 0)

    # Calculer la moyenne et l'écart type des valeurs des pixels
    mean_pixel_value, stddev_pixel_value = cv2.meanStdDev(blurred_image)

    # Définir les seuils pour la détection d'artéfacts (moyenne ± 2*écart type)
    lower_threshold = mean_pixel_value - 2 * stddev_pixel_value
    upper_threshold = mean_pixel_value + 2 * stddev_pixel_value

    # Calculer l'asymétrie (skewness) et la kurtose
    flattened_pixels = image.flatten()
    image_skewness = skew(flattened_pixels)
    image_kurtosis = kurtosis(flattened_pixels)
    
    # Effectuer l'analyse de Fourier
    def perform_fourier_analysis(image_path):
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        fourier_transform = np.fft.fft2(image)
        fourier_shift = np.fft.fftshift(fourier_transform)
        magnitude_spectrum = np.abs(fourier_shift)
        log_magnitude_spectrum = np.log(1 + magnitude_spectrum)
        mean_magnitude = np.mean(magnitude_spectrum)
        max_magnitude = np.max(magnitude_spectrum)
        return mean_magnitude, max_magnitude

    mean_fourier_magnitude, max_fourier_magnitude = perform_fourier_analysis(image_path)

    # Calcul de la cohérence de texture via GLCM
    def calculate_texture_coherence(image_path):
        image = cv2.imread(image_path)  
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        glcm = graycomatrix(gray_image, [1], [0], symmetric=True, normed=True)
        correlation = graycoprops(glcm, 'correlation')[0, 0]
        return correlation

    texture_coherence_correlation = calculate_texture_coherence(image_path)

    # Calcul de la non-uniformité des niveaux de gris (GLNU)
    def calculate_gray_level_non_uniformity(image):
        hist = cv2.calcHist([image], [0], None, [256], [0, 256])
        hist = hist / hist.sum()  
        glnu = np.sum(hist**2)
        return glnu

    # Calcul de la dimension fractale via la méthode de comptage par boîte
    def calculate_fractal_dimension(image):
        _, binary_image = cv2.threshold(image, 128, 255, cv2.THRESH_BINARY)
        binary_image = cv2.resize(binary_image, (128, 128))
        
        def box_count(img, box_size):
            count = 0
            for i in range(0, img.shape[0], box_size):
                for j in range(0, img.shape[1], box_size):
                    if np.sum(img[i:i+box_size, j:j+box_size] > 0):
                        count += 1
            return count

        sizes = np.arange(2, 20)
        counts = [box_count(binary_image, size) for size in sizes]
        log_sizes = np.log(sizes)
        log_counts = np.log(counts)
        coeffs = np.polyfit(log_sizes, log_counts, 1)
        fractal_dimension = -coeffs[0]
        return fractal_dimension

    gray_level_non_uniformity = calculate_gray_level_non_uniformity(image)
    fractal_dimension_value = calculate_fractal_dimension(image)

    # Calcul de l'entropie de la texture
    def calculate_texture_entropy(image):
        pixel_values = image.flatten() / 255.0  
        histogram, _ = np.histogram(pixel_values, bins=256, range=(0, 1))
        histogram = histogram / histogram.sum()
        entropy = -np.sum(histogram * np.log2(histogram + 1e-10))  
        return entropy
    
    texture_entropy_value = calculate_texture_entropy(image)

    def calculate_short_run_emphasis(glrlm):
        num = np.sum(glrlm * (np.arange(1, glrlm.shape[1] + 1) ** 2))
        den = np.sum(glrlm)
        return num / den if den != 0 else 0

    def calculate_glrlm(image):
        glrlm = np.zeros((256, image.shape[0] * image.shape[1]), dtype=int)
        for i in range(image.shape[0]):
            row = image[i, :]
            run_length = 1
            for j in range(1, len(row)):
                if row[j] == row[j - 1]:
                    run_length += 1
                else:
                    glrlm[row[j - 1], run_length - 1] += 1
                    run_length = 1
            glrlm[row[-1], run_length - 1] += 1
        return glrlm

    gray_level_run_length_matrix = calculate_glrlm(image)
    short_run_emphasis_value = calculate_short_run_emphasis(gray_level_run_length_matrix)

    # Variance spatiale
    def calculate_spatial_variance(image):
        return np.var(image)

    spatial_variance_value = calculate_spatial_variance(image)

    # Densité des bords (Edge Density)
    def calculate_edge_density(image):
        sobel_x = cv2.Sobel(image, cv2.CV_64F, 1, 0, ksize=3)
        sobel_y = cv2.Sobel(image, cv2.CV_64F, 0, 1, ksize=3)
        magnitude = cv2.magnitude(sobel_x, sobel_y)
        edge_density_value = np.sum(magnitude > 50) / (image.size)
        return edge_density_value

    edge_density_value = calculate_edge_density(image)

    # Netteté (Sharpness)
    def calculate_sharpness(image):
        laplacian = cv2.Laplacian(image, cv2.CV_64F)
        sharpness_value = np.var(laplacian)
        return sharpness_value

    sharpness_value = calculate_sharpness(image)
    
    # Distribution des bords (Edge Distribution)
    def compute_edge_distribution(image):
        edges = cv2.Canny(image, 100, 200)
        edge_count = np.sum(edges > 0)
        edge_density = edge_count / (image.shape[0] * image.shape[1])
        edge_orientation = np.gradient(edges.astype(float))
        return edge_count, edge_density, np.std(edge_orientation)
        
    edge_count, edge_density, edge_orientation_std = compute_edge_distribution(image)

    

    # Retourner les résultats sous forme de dictionnaire
    results = {
        'artifact_detection_lower_threshold': lower_threshold[0][0],
        'artifact_detection_upper_threshold': upper_threshold[0][0],
        'skewness': image_skewness,
        'kurtosis': image_kurtosis,
        'fourier_mean_magnitude': mean_fourier_magnitude,
        'fourier_max_magnitude': max_fourier_magnitude,
        'texture_coherence_correlation': texture_coherence_correlation,
        'gray_level_non_uniformity': gray_level_non_uniformity,
        'fractal_dimension': fractal_dimension_value,
        'texture_entropy': texture_entropy_value,
        'short_run_emphasis': short_run_emphasis_value,
        'spatial_variance': spatial_variance_value,
        'sharpness': sharpness_value,
        'edge_count': edge_count,
        'edge_density': edge_density_value,
        'edge_orientation_std': edge_orientation_std,
    }

    return results


In [8]:
# Chargement des datasets
train_data = pd.read_csv('/kaggle/input/ai-vs-human-generated-dataset/train.csv')
test_data = pd.read_csv('/kaggle/input/ai-vs-human-generated-dataset/test.csv')
test_data.info()
# Chemins vers les répertoires d'images
image_dir = '/kaggle/input/ai-vs-human-generated-dataset'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19986 entries, 0 to 19985
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      19986 non-null  object
dtypes: object(1)
memory usage: 156.3+ KB


In [9]:

def preprocess_data(data, image_dir, max_samples=None):
    """
    Prétraite les données et applique l'analyse d'image.

    Args:
        data (pd.DataFrame): Le DataFrame contenant les informations sur les images.
        image_dir (str): Le répertoire contenant les images.
        max_samples (int, optional): Le nombre d'images à traiter. Par défaut, toutes les images sont traitées.
        
    Returns:
        pd.DataFrame: Un DataFrame avec les caractéristiques extraites de chaque image.
    """
    results = []

    data_subset = data.head(max_samples) if max_samples else data

    for idx, row in data_subset.iterrows():
        file_name = row['file_name'] if 'file_name' in row else row['id']
        image_path = f"{image_dir}/{file_name}"

        # Appliquer l'analyse de l'image
        try:
            analysis_results = analyze_image(image_path)
        except Exception as e:
            print(f"Erreur lors de l'analyse de l'image {file_name}: {e}")
            continue 

        result_dict = {'file_name': file_name, 'label': row.get('label', None)}  
        result_dict.update(analysis_results)
        print(result_dict)
        results.append(result_dict)

    # Convertir les résultats en DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df

evaluation_data = train_data.sample(n=500, random_state=42)

train_data_remaining = train_data.drop(evaluation_data.index)

train_data_processed = preprocess_data(train_data_remaining, image_dir, max_samples=400)

evaluation_data_processed = preprocess_data(evaluation_data, image_dir, max_samples=400)



# Afficher les résultats des datasets prétraités
print(f"Ensemble d'entraînement (train_data) après prétraitement :\n{train_data_processed.head()}")
print(f"Ensemble d'évaluation (evaluation_data) après prétraitement :\n{evaluation_data_processed.head()}")


{'file_name': 'train_data/19b29cb2dd6c45a69999e1e3608b0982.jpg', 'label': 1, 'artifact_detection_lower_threshold': 27.121674127431817, 'artifact_detection_upper_threshold': 284.08900397315415, 'skewness': 0.3456089195297866, 'kurtosis': -1.614319457561353, 'fourier_mean_magnitude': 4157.4278672816245, 'fourier_max_magnitude': 61186191.0, 'texture_coherence_correlation': 0.9970015861770803, 'gray_level_non_uniformity': 0.01744541, 'fractal_dimension': 1.805766596524861, 'texture_entropy': 6.614374220916731, 'short_run_emphasis': 50.175106306755346, 'spatial_variance': 4161.592548344071, 'sharpness': 98.28561400226317, 'edge_count': 4281, 'edge_density': 0.043469746907552086, 'edge_orientation_std': 16.813539154702}
{'file_name': 'train_data/76d24375c7ce4aa19fa50092ce335afd.jpg', 'label': 0, 'artifact_detection_lower_threshold': 120.98838516741523, 'artifact_detection_upper_threshold': 297.8802061411785, 'skewness': -2.642034700699907, 'kurtosis': 6.557784660049117, 'fourier_mean_magnitu

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split


# Préparer les données
X_train = train_data_processed.drop(columns=['file_name', 'label'])  
y_train = train_data_processed['label']

X_eval = evaluation_data_processed.drop(columns=['file_name', 'label'])  
y_eval = evaluation_data_processed['label']

X_eval = X_eval.dropna()
y_eval = y_eval.loc[X_eval.index] 


from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=500,            
    max_depth=10,               
    min_samples_leaf=10,        
    max_features='sqrt',         
    max_samples=0.9,            
    n_jobs=-1,                  
    random_state=42,          
    verbose=1                   
)

rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_eval)

accuracy = accuracy_score(y_eval, y_pred)
report = classification_report(y_eval, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    4.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.2s finished


Accuracy: 0.8145363408521303
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.81      0.80       186
           1       0.83      0.82      0.82       213

    accuracy                           0.81       399
   macro avg       0.81      0.81      0.81       399
weighted avg       0.81      0.81      0.81       399

