In [3]:
from google.colab import drive
import os
import gdown # Library to download files from Google Drive
!gdown 1JdGaXmUnItfCn8ucn_5kBfDeh3NzrmMs # Google Drive ID of the zip file to be downloaded


Downloading...
From (original): https://drive.google.com/uc?id=1JdGaXmUnItfCn8ucn_5kBfDeh3NzrmMs
From (redirected): https://drive.google.com/uc?id=1JdGaXmUnItfCn8ucn_5kBfDeh3NzrmMs&confirm=t&uuid=4d122176-9728-40c1-8b12-bc46f85e2176
To: /content/images.zip adlı dosyanın kopyası
100% 128M/128M [00:01<00:00, 74.9MB/s]


In [6]:
!unzip -oq images # Unzip the file downloaded. Options -o and -q overwrites the files if exists already and disables printing out the extracted files, respectively.

In [7]:

root_dir = '/content/images'


In [8]:
from PIL import Image, UnidentifiedImageError
import os

root_dir = '/content/images'
output_dir = '/content/cropped_images'

# Function to calculate average resolution of images in a directory
def calculate_average_resolution(directory):
    total_width = 0
    total_height = 0
    num_images = 0

    # Loop through each file in the directory
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        try:
            # Check if the file is an image
            if os.path.isfile(filepath) and any(filename.lower().endswith(ext) for ext in ['.png', '.jpg', '.jpeg']):
                # Open the image and get its dimensions
                with Image.open(filepath) as img:
                    width, height = img.size
                    total_width += width
                    total_height += height
                    num_images += 1
        except (UnidentifiedImageError, OSError):
            print(f"Unable to process image file: {filename}")
            continue

    if num_images == 0:
        print("No images found in the directory.")
        return None
    else:
        avg_width = total_width // num_images
        avg_height = total_height // num_images
        return (avg_width, avg_height, num_images)

# Function to create the output directory if it doesn't exist
def create_output_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

# Function to crop images in a directory to a specified resolution
def crop_images_to_resolution(directory, target_resolution):
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        try:
            # Check if the file is an image
            if os.path.isfile(filepath) and any(filename.lower().endswith(ext) for ext in ['.png', '.jpg', '.jpeg']):
                # Open the image and crop it to the target resolution
                with Image.open(filepath) as img:
                    img = img.crop((0, 0, target_resolution[0], target_resolution[1]))
                    img.save(os.path.join(output_dir, filename))
        except (UnidentifiedImageError, OSError):
            print(f"Unable to process image file: {filename}")
            continue

# Calculate average resolution and total number of images
result = calculate_average_resolution(root_dir)

if result:
    avg_resolution, num_images = result[:2], result[2]
    print(f"Total number of images: {num_images}")
    print(f"Average resolution: {avg_resolution[0]}x{avg_resolution[1]} pixels")

    # Create the output directory if it doesn't exist
    create_output_directory(output_dir)

    # Crop images to average resolution
    crop_images_to_resolution(root_dir, avg_resolution)
    print("Images cropped to average resolution.")
else:
    print("Unable to calculate average resolution.")


Total number of images: 636
Average resolution: 1194x867 pixels
Unable to process image file: 309519166320626.jpg
Unable to process image file: 285424975595779.jpg
Unable to process image file: 186076198984051.jpg
Unable to process image file: 2167198373551810.jpg
Unable to process image file: 695766190760630.jpg
Unable to process image file: 320619618717364.jpg
Unable to process image file: 676687789384754.jpg
Unable to process image file: 430743550786558.jpg
Unable to process image file: 1723492444426167.jpg
Unable to process image file: 499615317172296.jpg
Unable to process image file: 728334204189811.jpg
Unable to process image file: 253935178579446.jpg
Unable to process image file: 346700996063593.jpg
Unable to process image file: 1684090145051513.jpg
Unable to process image file: 176574166615555.jpg
Unable to process image file: 1212019935620496.jpg
Unable to process image file: 936632113212421.jpg
Unable to process image file: 1969722553066031.jpg
Images cropped to average resol

In [9]:
import gdown

# Define the Google Drive file ID for the responses.json file
file_id = '1yXFCvwFZamVwoiW5QFEdAtS_CT-2CZr-'

# Define the output path for the downloaded file
output_path = '/content/responses.json'

# Download the file using gdown
gdown.download(f'https://drive.google.com/uc?id={file_id}', output_path, quiet=False)


Downloading...
From: https://drive.google.com/uc?id=1yXFCvwFZamVwoiW5QFEdAtS_CT-2CZr-
To: /content/responses.json
100%|██████████| 11.5M/11.5M [00:00<00:00, 98.5MB/s]


'/content/responses.json'

In [10]:
import json
import os
from PIL import Image
import numpy as np

# Load the JSON data
responses_file = '/content/responses.json'
with open(responses_file, 'r') as f:
    data = json.load(f)

# Create a dictionary to map '_id' to ad_info for faster lookup
ad_info_dict = {info['_id']: info for info in data.values() if '_id' in info}

# Define the root directory for images
images_dir = '/content/images'

# Define the target dimensions for resizing images (lower resolution for efficiency)
target_width = 512   # Lower resolution
target_height = 512  # Lower resolution

# Initialize lists to store calculated and extracted features
campaign_data = []

# Iterate through each image in the images directory
for filename in os.listdir(images_dir):
    if filename.endswith(".jpg"):  # Check if the file is a JPEG image
        image_id = filename.split('.')[0]  # Extract ID from filename
        if image_id in ad_info_dict:
            ad_info = ad_info_dict[image_id]
            image_path = os.path.join(images_dir, filename)

            # Load and preprocess the image
            with Image.open(image_path) as img:
                img = img.convert('RGB')  # Convert image to RGB format
                img = img.resize((target_width, target_height))  # Resize image to lower resolution
                img_array = np.array(img)  # Convert image to numpy array

            # Process only the necessary data to reduce memory load
            avg_spend = np.mean([int(ad_info.get("spend", {}).get("lower_bound", 0)), int(ad_info.get("spend", {}).get("upper_bound", 0))])
            avg_impressions = np.mean([int(ad_info.get("impressions", {}).get("lower_bound", 0)), int(ad_info.get("impressions", {}).get("upper_bound", 0))])
            cost_per_impression = avg_spend / avg_impressions if avg_impressions > 0 else float('inf')

            # Append only necessary info to reduce memory footprint
            campaign_data.append({
                "ad_id": ad_info.get('_id', ''),
                "cpi": cost_per_impression,
                "image_data": img_array.flatten().tolist()  # Flatten and convert to list to minimize size
            })

# Example: Print or save to file
# for campaign in campaign_data[:3]:
#     print(json.dumps(campaign, indent=2))

# Instead of printing, consider saving processed data to disk to free memory
import pickle
with open('/content/campaign_data.pkl', 'wb') as f:
    pickle.dump(campaign_data, f)

# Clear the large variables from memory
del campaign_data
del data


In [11]:
import pickle

# Load the processed data from disk
with open('/content/campaign_data.pkl', 'rb') as f:
    campaign_data = pickle.load(f)

# Print the first few entries of the campaign data to verify
for campaign in campaign_data[:3]:  # Adjust the range as needed for more samples
    print("Ad ID:", campaign["ad_id"])
    print("Cost Per Impression (CPI):", campaign["cpi"])
    print("Sample Image Data (flattened):", campaign["image_data"][:10])  # Show only the first 10 pixels to keep output manageable
    print("\n")


Ad ID: 1900522980255582
Cost Per Impression (CPI): 0.06361933853342788
Sample Image Data (flattened): [195, 195, 195, 196, 196, 196, 198, 198, 198, 199]


Ad ID: 679507235777236
Cost Per Impression (CPI): 0.01266008440056267
Sample Image Data (flattened): [3, 37, 21, 4, 36, 21, 7, 35, 20, 8]


Ad ID: 890876337968115
Cost Per Impression (CPI): 0.02271838834898499
Sample Image Data (flattened): [224, 217, 198, 225, 218, 199, 225, 218, 199, 226]




In [12]:
from tensorflow.keras.utils import Sequence
from tensorflow.keras.preprocessing.image import img_to_array, load_img
import numpy as np
import os

class DataGenerator(Sequence):
    'Generates data for Keras'
    def __init__(self, image_filenames, labels, batch_size, image_directory, dim=(512, 512), n_channels=3, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.image_filenames = image_filenames
        self.image_directory = image_directory
        self.n_channels = n_channels
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.image_filenames) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

        # Find list of IDs
        image_filenames_temp = [self.image_filenames[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(image_filenames_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.image_filenames))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, image_filenames_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size), dtype=float)

        # Generate data
        for i, image_filename in enumerate(image_filenames_temp):
            # Store sample
            img = load_img(os.path.join(self.image_directory, image_filename + '.jpg'), target_size=self.dim)
            X[i,] = img_to_array(img) / 255.0

            # Store class
            y[i] = self.labels[image_filename]

        return X, y


In [13]:
import tensorflow as tf

# Check TensorFlow GPU usage
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
if tf.config.list_physical_devices('GPU'):
    print("TensorFlow will run on GPU")
else:
    print("TensorFlow will run on CPU")

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)


Num GPUs Available:  1
TensorFlow will run on GPU
1 Physical GPUs, 1 Logical GPUs


In [17]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

def create_model(input_shape):
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D(2, 2),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D(2, 2),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(1)  # Assuming CPI prediction is a regression task
    ])
    return model



# Assuming you have a dictionary mapping from image IDs to CPI and file names
image_filenames = [campaign['ad_id'] for campaign in campaign_data]  # ID is used as filename
labels = {campaign['ad_id']: campaign['cpi'] for campaign in campaign_data}
image_directory = '/content/images'

# Splitting the dataset
image_filenames_train, image_filenames_test, labels_train, labels_test = train_test_split(
    image_filenames, [labels[id] for id in image_filenames], test_size=0.2, random_state=42)

# Create the generator
batch_size = 10  # Depends on your available memory
train_generator = DataGenerator(image_filenames=image_filenames_train, labels=dict(zip(image_filenames_train, labels_train)), batch_size=batch_size, image_directory=image_directory)
test_generator = DataGenerator(image_filenames=image_filenames_test, labels=dict(zip(image_filenames_test, labels_test)), batch_size=batch_size, image_directory=image_directory)

# Define your model's input shape based on the data dimensions you will train on
input_shape = (512, 512, 3)  # Modify according to your resized image dimensions

# Create the model
model = create_model(input_shape)

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

# Show the model structure
model.summary()
# Now use this generator directly in model.fit
model.fit(train_generator, epochs=10, validation_data=test_generator)



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_2 (Conv2D)           (None, 510, 510, 32)      896       
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 255, 255, 32)      0         
 g2D)                                                            
                                                                 
 conv2d_3 (Conv2D)           (None, 253, 253, 64)      18496     
                                                                 
 max_pooling2d_3 (MaxPoolin  (None, 126, 126, 64)      0         
 g2D)                                                            
                                                                 
 flatten_1 (Flatten)         (None, 1016064)           0         
                                                                 
 dense_2 (Dense)             (None, 128)              

<keras.src.callbacks.History at 0x7f793baea3b0>

In [16]:
from sklearn.model_selection import train_test_split

# Example dataset
image_filenames = [f'{i}.jpg' for i in range(100)]  # Mock filenames
labels = {f'{i}.jpg': np.random.randint(0, 2) for i in range(100)}  # Mock binary labels

# Split the data into training and testing sets
image_filenames_train, image_filenames_test, _, _ = train_test_split(
    image_filenames, list(labels.keys()), test_size=0.2, random_state=42)

# Extract the labels for the training and testing sets
labels_train = {filename: labels[filename] for filename in image_filenames_train}
labels_test = {filename: labels[filename] for filename in image_filenames_test}


# Create DataGenerator instances for training and testing
train_generator = DataGenerator(
    image_filenames=image_filenames_train,
    labels=labels_train,
    batch_size=10,
    image_directory=image_directory
)

test_generator = DataGenerator(
    image_filenames=image_filenames_test,
    labels=labels_test,
    batch_size=10,
    image_directory=image_directory
)


In [18]:
# Evaluate the model on the test set
test_loss, test_mae = model.evaluate(test_generator)
print(f"Test Loss: {test_loss}")
print(f"Mean Absolute Error (MAE): {test_mae}")


Test Loss: 0.0009903039317578077
Mean Absolute Error (MAE): 0.022174907848238945


In [20]:
# Save the model
model.save('content/modelCPIComparison.h5')

# Load the model
#from tensorflow.keras.models import load_model
#prod_model = load_model('modelCPIComparison.h5')
