In [None]:
# Mount Google Drive at /content/drive so later cells can read and write
# files through paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import os
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dropout
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input

# Build the feature extractor: VGG16 with ImageNet weights and without its
# classification head, followed by global average pooling over the last
# convolutional feature map.
base_model = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)
x = GlobalAveragePooling2D()(base_model.output)
feature_extractor = Model(inputs=base_model.input, outputs=x)

# Print the layer-by-layer architecture and parameter counts.
feature_extractor.summary()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)    

In [None]:
from PIL import Image, UnidentifiedImageError

def preprocess_image(img_path, target_size=(224, 224)):
    """Load an image file and prepare it as a one-image batch for the model.

    The image is loaded at ``target_size``, converted to an array, given a
    leading batch axis and passed through ``preprocess_input``.

    Args:
        img_path: Path of the image file to load.
        target_size: ``(height, width)`` the image is resized to while
            loading. Defaults to ``(224, 224)``, the input size the feature
            extractor in this notebook is built with.

    Returns:
        The preprocessed array with a leading batch axis of size 1, or
        ``None`` when the file cannot be read as an image (a warning is
        printed in that case).
    """
    try:
        img = image.load_img(img_path, target_size=target_size)
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)
        return preprocess_input(img_array)
    except UnidentifiedImageError:
        print(f"Warning: Unidentified image file at path {img_path}")
        return None
    except OSError as exc:
        # Truncated, corrupt or otherwise unreadable files surface as a plain
        # OSError rather than UnidentifiedImageError. Skip them too, so one
        # bad file does not abort a long extraction run.
        print(f"Warning: Could not read image file at path {img_path}: {exc}")
        return None


In [None]:
import json

def save_user_features(users, image_dir, feature_extractor, save_dir,
                       preprocess_fn=None):
    """Compute one averaged image-feature vector per user and save them as JSON.

    For every user, all ``.jpg``/``.jpeg``/``.png`` files found (recursively)
    under ``<image_dir>/<username>`` are preprocessed, passed through
    ``feature_extractor`` and averaged. Users with no usable image get an
    all-zero vector. The result is written to
    ``<save_dir>/user_features.json`` as ``{user_id: [float, ...]}``.

    Every saved vector is a flat list of ``feature_extractor.output_shape[1]``
    floats, for users with and without images alike.

    Args:
        users: Iterable of mappings with at least ``'username'`` and
            ``'user_id'`` keys; must support ``len()``.
        image_dir: Directory containing one sub-folder per username.
        feature_extractor: Object exposing ``predict(batch, verbose=0)`` and
            ``output_shape`` (the second entry is used as the feature size).
        save_dir: Directory for the output file; created if missing.
        preprocess_fn: Callable mapping an image path to a one-image batch,
            or to ``None`` when the file should be skipped. Defaults to the
            notebook's ``preprocess_image``.

    Returns:
        None. The features are only written to disk.
    """
    if preprocess_fn is None:
        preprocess_fn = preprocess_image

    os.makedirs(save_dir, exist_ok=True)
    image_extensions = ('.jpg', '.jpeg', '.png')
    feature_dim = feature_extractor.output_shape[1]
    all_user_features = {}

    for i, user in enumerate(users):
        print(f"Processing user {i+1}/{len(users)}")
        user_image_folder = os.path.join(image_dir, user['username'])
        user_features = []

        if os.path.isdir(user_image_folder):
            for root, _, files in os.walk(user_image_folder):
                for file in files:
                    if not file.lower().endswith(image_extensions):
                        continue
                    img_array = preprocess_fn(os.path.join(root, file))
                    if img_array is None:
                        # Unreadable image: already reported by preprocess_fn.
                        continue
                    # verbose=0 silences the per-call progress bar, which
                    # otherwise prints once per image and floods the output.
                    image_feature = feature_extractor.predict(img_array, verbose=0)
                    # Drop the batch axis of the one-image prediction so the
                    # averaged vector is flat, like the zero-vector fallback.
                    user_features.append(np.asarray(image_feature).reshape(-1))

        if user_features:
            aggregated_features = np.mean(user_features, axis=0)
        else:
            aggregated_features = np.zeros((feature_dim,))

        all_user_features[user['user_id']] = aggregated_features.tolist()

    with open(os.path.join(save_dir, "user_features.json"), 'w') as f:
        json.dump(all_user_features, f)


In [None]:
# Load the data
# data_path is the JSON file describing the users; image_dir is the folder
# that is later searched for one sub-folder of images per username.
data_path = '/content/drive/Shareddrives/PFA/users.json'
image_dir = '/content/drive/Shareddrives/PFA/pfa2/images'

# Read as UTF-8 explicitly instead of relying on the platform's default
# text encoding.
with open(data_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:

# Extract and save the per-user features.
# `data` and `image_dir` are defined in the data-loading cell; only the
# output location is set here so the image path has a single definition.
save_dir = '/content/drive/Shareddrives/PFA/user_embedding'
save_user_features(data, image_dir, feature_extractor, save_dir)


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Processing user 77/184
Processing user 78/184
Processing user 79/184
Processing user 80/184
Processing user 81/184
Processing user 82/184
Processing user 83/184
Processing user 84/184
Processing user 85/184
Processing user 86/184
Processing user 87/184
Processing user 88/184
Processing user 89/184
Processing user 90/184
Processing user 91/184
Processing user 92/184
Processing user 93/184
Processing user 94/184
Processing user 95/184
Processing user 96/184
Processing user 97/184
Processing user 98/184
Processing user 99/184
Processing user 100/184
Processing user 101/184
Processing user 102/184
Processing user 103/184
Processing user 104/184
Processing user 105/184
Processing user 106/184
Processing user 107/184
Processing user 108/184
Processing user 109/184
Processing user 110/184
Processing user 111/184
Processing user 112/184
Processing user 113/184
Processing user 114/184
Processing user 115