In [7]:
# Imports
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from tqdm.notebook import tqdm
warnings.filterwarnings('ignore')
%matplotlib inline

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense,Conv2D,Dropout, Flatten, MaxPooling2D
from keras.utils import load_img




## Load dataset


In [8]:
# load dataset directory
TRAIN_DIR = '../data/Challenge_Data/Assets/'

In [9]:
def load_dataset(directory):
    image_paths = []
    labels = []

    for label in os.listdir(directory):
        for filename in os.listdir(directory+label):
            image_path = os.path.join(directory,label,filename)
            image_paths.append(image_path)
            labels.append(label)
        print(label, "completed")

    return image_paths, labels

In [10]:
#convert data folder structure to df 

train = pd.DataFrame()
train['image'], train['label'] = load_dataset(TRAIN_DIR)
train = train.sample(frac=1).reset_index(drop=True)
train.head(5)

ed3071a667a11cc56e88ae0489bfe6aa completed
4799763419d621cd41e7fb8abbcdd45d completed
bf1f7af46eec0e92939a8b3ba51cbacd completed
e68e20f592457b875ce29757ab855dfe completed
6a8e741867d4f893afad015b77b52c39 completed
65299151bf4cbd1825d28e4a43d113ac completed
42c5b4b367187c29ac8fdd78c70837d6 completed
adunit-heineken-pure-malt-mob completed
f08d8e575fa2c9929dfaf2d1bfa553a2 completed
adunit-mouser-user-choice-cpe-av-us-mob completed
dbe499749ecc456991332fc5d847ad2e completed
6b7ddb8af0b0d4b5e042ac9469300177 completed
adunit-lionsgate-moonfall-parallax-dec-mpu completed
adunit-jaguar-nameplate-refresh-rfb-mpu completed
adunit-chevy-600-1200-hotspot-explorer-cpe-av-mob completed
f0a8b52312fce8d57f34bc6441dea750 completed
adunit-milk-user-slide-v2-mpu completed
adunit-western-union-kroger-scrub-mob completed
424a4a0bfde5a836ed93394a8c099b17 completed
4baa12ddba0c06b618225f8191a6d722 completed
adunit-heineken-pure-malt-bio completed
3cea7861b4a43a0695ae177cf0829267 completed
adunit-milk-ctc-s

Unnamed: 0,image,label
0,../data/Challenge_Data/Assets/8b6e13cab4903dff...,8b6e13cab4903dffde5588ea2ab0c2a2
1,../data/Challenge_Data/Assets/8f5834e2093bee23...,8f5834e2093bee23d3b01f8d05265765
2,../data/Challenge_Data/Assets/adunit-hitmans-w...,adunit-hitmans-wifes-body-guard-user-choice-un...
3,../data/Challenge_Data/Assets/3f749b8fd9e49bca...,3f749b8fd9e49bca68b5fdc0ca391cd8
4,../data/Challenge_Data/Assets/adunit-windhoek-...,adunit-windhoek-beer-year-end-2021-bio-v2-mob


In [11]:
## separate cta images for cta related features and keep information in a separate df
df = train.copy(deep = True)

In [12]:
df.insert(2, "is_cta_image", np.nan)
df.head()

Unnamed: 0,image,label,is_cta_image
0,../data/Challenge_Data/Assets/8b6e13cab4903dff...,8b6e13cab4903dffde5588ea2ab0c2a2,
1,../data/Challenge_Data/Assets/8f5834e2093bee23...,8f5834e2093bee23d3b01f8d05265765,
2,../data/Challenge_Data/Assets/adunit-hitmans-w...,adunit-hitmans-wifes-body-guard-user-choice-un...,
3,../data/Challenge_Data/Assets/3f749b8fd9e49bca...,3f749b8fd9e49bca68b5fdc0ca391cd8,
4,../data/Challenge_Data/Assets/adunit-windhoek-...,adunit-windhoek-beer-year-end-2021-bio-v2-mob,


In [13]:
types =['cta', 'cta.png', 'cta.jpeg']

pattern = '|'.join(types)

df['is_cta_image'] = df['image'].str.contains(pattern, case=False)

df.head()

Unnamed: 0,image,label,is_cta_image
0,../data/Challenge_Data/Assets/8b6e13cab4903dff...,8b6e13cab4903dffde5588ea2ab0c2a2,True
1,../data/Challenge_Data/Assets/8f5834e2093bee23...,8f5834e2093bee23d3b01f8d05265765,False
2,../data/Challenge_Data/Assets/adunit-hitmans-w...,adunit-hitmans-wifes-body-guard-user-choice-un...,False
3,../data/Challenge_Data/Assets/3f749b8fd9e49bca...,3f749b8fd9e49bca68b5fdc0ca391cd8,True
4,../data/Challenge_Data/Assets/adunit-windhoek-...,adunit-windhoek-beer-year-end-2021-bio-v2-mob,False


In [14]:
## use this for cta data
cta_data = df.loc[df['is_cta_image'] == True]

In [15]:
cta_data['is_cta_image'].value_counts()

True    1490
Name: is_cta_image, dtype: int64

In [16]:
## use this for face data
face_data = df.loc[df['is_cta_image'] == False]

In [17]:

face_data['is_cta_image'].value_counts()

False    15428
Name: is_cta_image, dtype: int64

In [18]:
## save separated cta data
cta_data.to_csv("../data/cta.csv", index=False)

In [19]:
types2 =['.mp4']

pattern2 = '|'.join(types2)

face_data['is_vedio'] = face_data['image'].str.contains(pattern2, case=False)

In [20]:
face_data.head()

Unnamed: 0,image,label,is_cta_image,is_vedio
1,../data/Challenge_Data/Assets/8f5834e2093bee23...,8f5834e2093bee23d3b01f8d05265765,False,False
2,../data/Challenge_Data/Assets/adunit-hitmans-w...,adunit-hitmans-wifes-body-guard-user-choice-un...,False,False
4,../data/Challenge_Data/Assets/adunit-windhoek-...,adunit-windhoek-beer-year-end-2021-bio-v2-mob,False,False
5,../data/Challenge_Data/Assets/adunit-lionsgate...,adunit-lionsgate-uwomt-user-slider-sensory-vid...,False,True
6,../data/Challenge_Data/Assets/adunit-windhoek-...,adunit-windhoek-beer-year-end-2021-bio-v3-mob,False,False


In [21]:
## remove vedios
face_data = face_data.loc[face_data['is_vedio'] == False]

In [22]:
face_data['is_vedio'].value_counts()

False    14796
Name: is_vedio, dtype: int64

In [23]:
import face_recognition

def check_face_exist(paths):
    check = []
    for path in paths:
    #test_image = path 
        image = face_recognition.load_image_file(path)
        face_locations = face_recognition.face_locations(image)
        if len(face_locations) <= 0:
            message = 'No face'

        #elif len(face_locations) > 1:
         #   message = 'There are multiple faces in the provided image, Please select another image.'

        else:
            message = 'has face'

        check.append(message)

    return message

In [None]:
check_face_exist(face_data['image'])

In [24]:
def extract_features(images):
    features = []
    for image in tqdm(images):
        img = load_img(image)
        img = np.array(img)
        features.append(img)
    features = np.array(features)
    features = Features.reshape(len(features), 48, 48, 1)
    return features

In [None]:
face_emotions = extract_features(train['image'])

## CTA FEATURES

In [25]:
##necessary imports 
from pytesseract import pytesseract
import cv2
import numpy as np
from typing import List, Tuple
import matplotlib.pyplot as plt
import matplotlib.image as matimg
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

from scipy.cluster.vq import whiten, kmeans

import os
import sys

In [26]:
##read the csv file that contains the path info of the cta images
cta_df = pd.read_csv('../data/cta.csv')
cta_df.head()

Unnamed: 0,image,label,is_cta_image
0,../data/Challenge_Data/Assets/8b6e13cab4903dff...,8b6e13cab4903dffde5588ea2ab0c2a2,True
1,../data/Challenge_Data/Assets/3f749b8fd9e49bca...,3f749b8fd9e49bca68b5fdc0ca391cd8,True
2,../data/Challenge_Data/Assets/adunit-femsa-coc...,adunit-femsa-coca-cola-new-sugar-free-mpu,True
3,../data/Challenge_Data/Assets/c9ec2d041cd03c48...,c9ec2d041cd03c489b2ee97c5f7ba400,True
4,../data/Challenge_Data/Assets/bf1f7af46eec0e92...,bf1f7af46eec0e92939a8b3ba51cbacd,True


In [27]:
#drop is_cta_image_column
cta_df = cta_df.drop(['is_cta_image'], axis=1)
cta_df.head()

Unnamed: 0,image,label
0,../data/Challenge_Data/Assets/8b6e13cab4903dff...,8b6e13cab4903dffde5588ea2ab0c2a2
1,../data/Challenge_Data/Assets/3f749b8fd9e49bca...,3f749b8fd9e49bca68b5fdc0ca391cd8
2,../data/Challenge_Data/Assets/adunit-femsa-coc...,adunit-femsa-coca-cola-new-sugar-free-mpu
3,../data/Challenge_Data/Assets/c9ec2d041cd03c48...,c9ec2d041cd03c489b2ee97c5f7ba400
4,../data/Challenge_Data/Assets/bf1f7af46eec0e92...,bf1f7af46eec0e92939a8b3ba51cbacd


In [28]:
#retrive image name
import ntpath
def path_leaf(im_path):
    image_n = []
    for pt in im_path:
        head, tail = ntpath.split(pt)
        image_n.append(tail)
    
    return image_n 

In [29]:
img_name = path_leaf(cta_df['image'])

In [30]:
cta_df['image_name'] = img_name
cta_df.head(12)

Unnamed: 0,image,label,image_name
0,../data/Challenge_Data/Assets/8b6e13cab4903dff...,8b6e13cab4903dffde5588ea2ab0c2a2,cta.png
1,../data/Challenge_Data/Assets/3f749b8fd9e49bca...,3f749b8fd9e49bca68b5fdc0ca391cd8,cta_ar.png
2,../data/Challenge_Data/Assets/adunit-femsa-coc...,adunit-femsa-coca-cola-new-sugar-free-mpu,end-cta-nao.png
3,../data/Challenge_Data/Assets/c9ec2d041cd03c48...,c9ec2d041cd03c489b2ee97c5f7ba400,cta.png
4,../data/Challenge_Data/Assets/bf1f7af46eec0e92...,bf1f7af46eec0e92939a8b3ba51cbacd,F3-CTA-REV-TOMORROW.png
5,../data/Challenge_Data/Assets/adunit-lionsgate...,adunit-lionsgate-moonfall-parallax-cta4-mob,video-cta-rev.png
6,../data/Challenge_Data/Assets/adunit-yamaha-x2...,adunit-yamaha-x2-x4-userchoice-cpe-av-mpu,cta.jpg
7,../data/Challenge_Data/Assets/adunit-city-squa...,adunit-city-square-mall-cny-mob,cta_1.png
8,../data/Challenge_Data/Assets/adunit-mouser-us...,adunit-mouser-user-choice-v3-canada-cpe-av-mob,cta.png
9,../data/Challenge_Data/Assets/c5d9fdf57215e5db...,c5d9fdf57215e5dbec5cc42419090749,f1-movie-info-cta-3.png


In [31]:
## now drop rows that doesnt contain the substring 'cta' in them

types2 =['cta']

pattern2 = '|'.join(types2)

cta_df['is_cta'] = cta_df['image_name'].str.contains(pattern2, case=False)

cta_df = cta_df[cta_df.is_cta != False]

cta_df.head(12)

Unnamed: 0,image,label,image_name,is_cta
0,../data/Challenge_Data/Assets/8b6e13cab4903dff...,8b6e13cab4903dffde5588ea2ab0c2a2,cta.png,True
1,../data/Challenge_Data/Assets/3f749b8fd9e49bca...,3f749b8fd9e49bca68b5fdc0ca391cd8,cta_ar.png,True
2,../data/Challenge_Data/Assets/adunit-femsa-coc...,adunit-femsa-coca-cola-new-sugar-free-mpu,end-cta-nao.png,True
3,../data/Challenge_Data/Assets/c9ec2d041cd03c48...,c9ec2d041cd03c489b2ee97c5f7ba400,cta.png,True
4,../data/Challenge_Data/Assets/bf1f7af46eec0e92...,bf1f7af46eec0e92939a8b3ba51cbacd,F3-CTA-REV-TOMORROW.png,True
5,../data/Challenge_Data/Assets/adunit-lionsgate...,adunit-lionsgate-moonfall-parallax-cta4-mob,video-cta-rev.png,True
6,../data/Challenge_Data/Assets/adunit-yamaha-x2...,adunit-yamaha-x2-x4-userchoice-cpe-av-mpu,cta.jpg,True
7,../data/Challenge_Data/Assets/adunit-city-squa...,adunit-city-square-mall-cny-mob,cta_1.png,True
8,../data/Challenge_Data/Assets/adunit-mouser-us...,adunit-mouser-user-choice-v3-canada-cpe-av-mob,cta.png,True
9,../data/Challenge_Data/Assets/c5d9fdf57215e5db...,c5d9fdf57215e5dbec5cc42419090749,f1-movie-info-cta-3.png,True


In [32]:
cta_df.value_counts('is_cta')

is_cta
True    1310
dtype: int64

In [33]:
# now that we have further refined our data, lets remove the is_cta and image_name columns which are unnecessary
cta_df = cta_df.drop(['is_cta'], axis=1)

In [34]:
cta_df = cta_df.drop(['image_name'], axis=1)
cta_df.head()

Unnamed: 0,image,label
0,../data/Challenge_Data/Assets/8b6e13cab4903dff...,8b6e13cab4903dffde5588ea2ab0c2a2
1,../data/Challenge_Data/Assets/3f749b8fd9e49bca...,3f749b8fd9e49bca68b5fdc0ca391cd8
2,../data/Challenge_Data/Assets/adunit-femsa-coc...,adunit-femsa-coca-cola-new-sugar-free-mpu
3,../data/Challenge_Data/Assets/c9ec2d041cd03c48...,c9ec2d041cd03c489b2ee97c5f7ba400
4,../data/Challenge_Data/Assets/bf1f7af46eec0e92...,bf1f7af46eec0e92939a8b3ba51cbacd


## 1. Extract texts on buttons: CTA Feature 1, 'cta_text'

In [35]:
## removes unnecessary characters if any
def get_pure_list(text:str):
    text_list = (text).split('\n')

    while ' ' in text_list:
        text_list.remove(' ')

    while '' in text_list:
        text_list.remove('')

    while '\x0c' in text_list:
        text_list.remove('\x0c')
        
    return text_list

In [36]:
## retrives text from cta buttons
def get_text(img_path:str,convert_to_gray:bool=True,plot:bool=False):
    
    img = cv2.imread(img_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    if plot:
        plt.subplot(1,2,1)
        plt.imshow(img)

        plt.subplot(1,2,2)
        plt.imshow(gray)

        plt.show()
    if convert_to_gray:
        return get_pure_list(pytesseract.image_to_string(gray))
    else:
        return get_pure_list(pytesseract.image_to_string(img))

In [37]:
cta_text = []

for text in cta_df['image']:
    all_text = get_text(text)
    cta_text.append(all_text)



In [38]:
cta_df['cta_text'] = cta_text


In [39]:
cta_df.cta_text.head(5)


0                                 [LEARN MORE]
1                                           []
2                                        [tow]
3                                 [LEARN MORE]
4    [IN THEATERS EVERYWHERE STARTING TONIGHT]
Name: cta_text, dtype: object

In [40]:
cta_df.to_csv("../data/cta_text.csv")

### 2. Extract colors on buttons: CTA Feature 2, 'cta_dominant_colors_in_rgb'

In [68]:
import cv2
from sklearn.cluster import KMeans

class DominantColors:

    CLUSTERS = None
    IMAGE = None
    COLORS = None
    LABELS = None

    def __init__(self, image, clusters=3):
        self.CLUSTERS = clusters
        self.IMAGE = image

    def dominantColors(self):

        ##read image
        img = cv2.imread(self.IMAGE)

        ##convert to rgb from bgr
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        #reshape to a list of pixels
        img = img.reshape((img.shape[0] * img.shape[1], 3))

        ## save image after reshaping and conversions
        self.IMAGE = img

        ##apply k-means to cluster pixels
        kmeans = KMeans(n_clusters = self.CLUSTERS)
        kmeans.fit(img)

        ## now the centers of the cluster will be the dominant color
        self.COLORS = kmeans.cluster_centers_

        ## save labels
        self.LABELS = kmeans.labels_

        return self.COLORS.astype(int)

In [84]:
## find 3 dominant colors for all cta buttons
img = cta_df['image']

domcol = []
for ig in img:
    dc = DominantColors(ig, 3)
    domc = dc.dominantColors()
    domcol.append(domc)




In [93]:
cta_df['cta_dominant_colors_in_rgb'] = domcol
cta_df.head(5)


Unnamed: 0,image,label,cta_text,dominant_colors_in_rgb,cta_dominant_colors_in_rgb
0,../data/Challenge_Data/Assets/8b6e13cab4903dff...,8b6e13cab4903dffde5588ea2ab0c2a2,[LEARN MORE],"[[70, 112, 75], [0, 0, 0], [0, 0, 0]]","[[70, 112, 75], [0, 0, 0], [0, 0, 0]]"
1,../data/Challenge_Data/Assets/3f749b8fd9e49bca...,3f749b8fd9e49bca68b5fdc0ca391cd8,[],"[[72, 111, 74], [236, 210, 42], [27, 29, 27]]","[[72, 111, 74], [236, 210, 42], [27, 29, 27]]"
2,../data/Challenge_Data/Assets/adunit-femsa-coc...,adunit-femsa-coca-cola-new-sugar-free-mpu,[tow],"[[0, 0, 0], [246, 246, 246], [113, 113, 113]]","[[0, 0, 0], [246, 246, 246], [113, 113, 113]]"
3,../data/Challenge_Data/Assets/c9ec2d041cd03c48...,c9ec2d041cd03c489b2ee97c5f7ba400,[LEARN MORE],"[[0, 0, 0], [71, 112, 75], [0, 0, 0]]","[[0, 0, 0], [71, 112, 75], [0, 0, 0]]"
4,../data/Challenge_Data/Assets/bf1f7af46eec0e92...,bf1f7af46eec0e92939a8b3ba51cbacd,[IN THEATERS EVERYWHERE STARTING TONIGHT],"[[71, 112, 76], [245, 190, 0], [71, 112, 76]]","[[71, 112, 76], [245, 190, 0], [71, 112, 76]]"


In [94]:
cta_df = cta_df.drop(['dominant_colors_in_rgb'], axis=1)
cta_df.to_csv("../data/cta_color_v2.csv")

In [95]:
## 3. Extract button sizes: CTA Feature 3, 'cta_size'

cta_w = []
cta_h = []

for dim in cta_df['image']:
    pic = cv2.imread(dim)
    pic_w, pic_h = pic.shape[0], pic.shape[1]
    cta_w.append(pic_w)
    cta_h.append(pic_h)




In [96]:
cta_df['cta_width'] = cta_w
cta_df['cta_height'] = cta_h
cta_df.head(5)

Unnamed: 0,image,label,cta_text,cta_dominant_colors_in_rgb,cta_width,cta_height
0,../data/Challenge_Data/Assets/8b6e13cab4903dff...,8b6e13cab4903dffde5588ea2ab0c2a2,[LEARN MORE],"[[70, 112, 75], [0, 0, 0], [0, 0, 0]]",50,234
1,../data/Challenge_Data/Assets/3f749b8fd9e49bca...,3f749b8fd9e49bca68b5fdc0ca391cd8,[],"[[72, 111, 74], [236, 210, 42], [27, 29, 27]]",94,271
2,../data/Challenge_Data/Assets/adunit-femsa-coc...,adunit-femsa-coca-cola-new-sugar-free-mpu,[tow],"[[0, 0, 0], [246, 246, 246], [113, 113, 113]]",48,197
3,../data/Challenge_Data/Assets/c9ec2d041cd03c48...,c9ec2d041cd03c489b2ee97c5f7ba400,[LEARN MORE],"[[0, 0, 0], [71, 112, 75], [0, 0, 0]]",64,300
4,../data/Challenge_Data/Assets/bf1f7af46eec0e92...,bf1f7af46eec0e92939a8b3ba51cbacd,[IN THEATERS EVERYWHERE STARTING TONIGHT],"[[71, 112, 76], [245, 190, 0], [71, 112, 76]]",900,600


In [100]:
## Save to csv, cta_size_v3.csv contains size, color and text information about cta buttons

print("### CTA features saved###")
cta_df.to_csv("../data/cta_size_v3.csv")
print("#########################")

### CTA features saved###
#########################


In [101]:
##Load performance data
perform = pd.read_csv("/home/amanuel_zewdu/creative_image_optimization/data/Challenge_Data/performance_data.csv")

In [103]:
## merge the relevant columns from cta_size_v3.csv with performance data on cta_df.label and perform.gameId

## first check for distnict labels in the cta dataframe

##check
boolean = cta_df["label"].is_unique
boolean

False