In [1]:
from pytesseract import pytesseract
import cv2
import numpy as np
from typing import List, Tuple
import matplotlib.pyplot as plt
from PIL import Image

import os
import sys

# Tell pytesseract which Tesseract executable to run.
# NOTE(review): this absolute path is machine-specific — adjust it if tesseract
# is installed somewhere else on the machine running the notebook.
pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

In [2]:
# Extend the import search path before the import below. The path is relative, so it
# is resolved against the current working directory of the kernel.
# NOTE(review): presumably load_data.py lives in ../scripts — confirm.
sys.path.append("../scripts")
from load_data import Loader

# Loader instance reused by a later cell to read the performance CSV.
loader = Loader()

In [3]:
# constants

# Upper-case file extensions (no leading dot) that count as image assets.
IMAGES_EXT = [
    "JPG", "PNG", "GIF", "WEBP", "TIFF", "PSD",
    "RAW", "BMP", "HEIF", "INDD", "JPEG",
]

# Upper-case file extensions (no leading dot) that count as video assets.
VIDEO_EXT = [
    "WEBM", "MPG", "MP2", "MPEG", "MPE", "MPV", "OGG", "MP4",
    "M4P", "M4V", "AVI", "WMV", "MOV", "QT", "FLV", "SWF",
]

In [4]:
def get_files_name(directory:str, filter_extension:list=None)->list:
    """Return the names of the regular files directly inside ``directory``.

    Sub-directories are never returned and the listing is not recursive.

    Args:
        directory: Path of the folder to list.
        filter_extension: Optional collection of extensions to keep, e.g.
            ``["PNG", "JPG"]``. Matching is case-insensitive and a leading dot
            is tolerated (``".png"`` works too). ``None`` disables filtering;
            an empty collection matches nothing.

    Returns:
        File names (not full paths), sorted alphabetically so that the result
        does not depend on the arbitrary order of ``os.listdir``.

    Raises:
        OSError: propagated from ``os.listdir`` (e.g. ``FileNotFoundError``
            when ``directory`` does not exist).
    """
    wanted = None
    if filter_extension is not None:
        # Normalise once so the per-file check below is a plain set lookup.
        wanted = {str(ext).lstrip('.').upper() for ext in filter_extension}

    files = []
    for filename in sorted(os.listdir(directory)):
        if not os.path.isfile(os.path.join(directory, filename)):
            continue

        if wanted is not None:
            # splitext() yields '' for names without a real extension, so a file
            # literally called "png" (or a dotfile ".png") is not mistaken for
            # an image the way filename.split('.')[-1] would.
            extension = os.path.splitext(filename)[1].lstrip('.').upper()
            if not extension or extension not in wanted:
                continue

        files.append(filename)

    return files

In [14]:

# usage

# One sample asset folder, listed three ways to exercise get_files_name.
sample_asset_dir = "/home/amanuel_zewdu/creative_image_optimization/data/Challenge_Data/Assets/0301a5865ecfd7c5a1653dde10a5aedf/"

# Image assets only.
img_files = get_files_name(sample_asset_dir, IMAGES_EXT)

# Video assets only.
vid_files = get_files_name(sample_asset_dir, VIDEO_EXT)

# Every regular file, no extension filter.
all_files = get_files_name(sample_asset_dir)

In [5]:
def get_pure_list(text:str):
    r"""Split ``text`` on newlines and keep only the lines that carry content.

    A line is dropped when it is empty or consists solely of whitespace. That
    covers the single space, empty string and form feed (``\x0c``) entries the
    OCR output is known to contain, and also runs of spaces, tabs or a stray
    ``\r``. Kept lines are returned unchanged (they are not stripped).

    Args:
        text: Raw text, typically the string returned by the OCR engine.

    Returns:
        List of non-blank lines in their original order.
    """
    # str.strip() treats the form feed as whitespace, so a lone '\x0c' line
    # becomes '' and is filtered out together with the other blank lines.
    return [line for line in text.split('\n') if line.strip()]

In [6]:
def get_text(img_path:str,convert_to_gray:bool=True,plot:bool=False):
    """Run Tesseract OCR on an image file and return its non-blank text lines.

    Args:
        img_path: Path of the image to read with ``cv2.imread``.
        convert_to_gray: When True (default) the grayscale version of the image
            is passed to the OCR engine, otherwise the colour image is.
        plot: When True, show the colour and grayscale images side by side.

    Returns:
        The OCR output as cleaned by ``get_pure_list``. An empty list is
        returned, with a warning, when the image cannot be read.
    """
    import warnings  # local import so this cell does not depend on the import cell

    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None rather
        # than raising; without this guard cvtColor fails with an opaque cv2.error
        # and a single bad asset aborts a whole batch run.
        warnings.warn(f"get_text: could not read image {img_path!r}; returning no text")
        return []

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # OpenCV loads pixels as BGR, while matplotlib and pytesseract expect RGB.
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    if plot:
        plt.subplot(1,2,1)
        plt.imshow(rgb)

        plt.subplot(1,2,2)
        # Without an explicit colormap a single-channel image is drawn in false colour.
        plt.imshow(gray, cmap='gray')

        plt.show()

    ocr_input = gray if convert_to_gray else rgb
    return get_pure_list(pytesseract.image_to_string(ocr_input))

In [33]:
# Try the OCR helper on one sample engagement-instruction image.
all_text = get_text(
    img_path="/home/amanuel_zewdu/creative_image_optimization/data/Challenge_Data/Assets/0301a5865ecfd7c5a1653dde10a5aedf/engagement_instruction_1.png"
)

In [34]:
# Bare expression: shows the OCR result of the sample image as the cell output.
all_text

['TAP THE SCREEN', 'to find the nearest Lexus dealership']

In [7]:
# all in one

def add_feature(game_id:str, assets_dir:str='/home/amanuel_zewdu/creative_image_optimization/data/Challenge_Data/Assets'):
    """Collect the OCR text of every image asset belonging to one game.

    Args:
        game_id: Name of the game's sub-folder inside ``assets_dir``.
        assets_dir: Root folder holding one sub-folder per game. It defaults to
            the location this notebook was developed against; pass another
            path to run on a different machine.

    Returns:
        One flat list with the text lines of all images in the game's folder
        (files are selected with ``IMAGES_EXT``), concatenated file after file.

    Raises:
        OSError: propagated from listing the folder, e.g. when no folder
            exists for ``game_id``.
    """
    game_dir = os.path.join(assets_dir, str(game_id))

    all_texts = []
    for file_name in get_files_name(game_dir, IMAGES_EXT):
        # os.path.join keeps this correct whether or not game_dir ends with a slash.
        all_texts.extend(get_text(os.path.join(game_dir, file_name)))
    return all_texts

    

In [8]:
# S3 location of the challenge archive and the CSV to read.
# NOTE(review): file_path looks like a path inside the zip archive — confirm
# against Loader.load_csv.
bucket = "s3://10ac-batch-6/data/w11/Challenge_Data.zip"
file_path = "Challenge_Data/performance_data.csv"

# presumably a pandas DataFrame: later cells call .sample, .apply and .to_csv on it
df = loader.load_csv(bucket,file_path)

In [16]:
# Dry run: extract the text feature for five randomly drawn rows only.
test_df = df.sample(n=5)
test_df["all_text"] = test_df["game_id"].apply(add_feature)


In [9]:
# Full run: OCR the image assets of every game and store the lines in a new column.
df["all_text"] = df["game_id"].apply(add_feature)



In [10]:
# Persist the frame next to the notebook, without the index column.
# NOTE: when all_text holds Python lists (as assigned from add_feature), CSV stores
# their string representation; reading the file back yields strings, not lists.
df.to_csv('game_id_with_all_text.csv',index=False)

In [None]:
def add_engagement_type(all_text:list):
    """Derive the engagement gesture mentioned in a creative's OCR text.

    Args:
        all_text: List of text lines. A single string is accepted as well and
            is treated as one line; this is what the column contains after
            being written to CSV and read back.

    Returns:
        The first matching label in this priority order: "swipe right",
        "swipe left", "tap and hold", "scrub", "tap". When none of these
        keywords is present the chain below falls through, i.e. the result is
        None.
    """
    # " ".join() on a bare string would interleave spaces between its characters
    # ("t a p"), so no keyword could ever match; wrap it in a list instead.
    if isinstance(all_text, str):
        all_text = [all_text]

    # Collapse runs of whitespace so that e.g. "swipe   right" still matches.
    joined_text = " ".join(" ".join(all_text).split())
    joined_text = joined_text.lower()

    # NOTE: these are plain substring checks, so "tap" also matches inside longer
    # words such as "laptop". "tap and hold" must be tested before "tap" because
    # the shorter keyword is contained in the longer one.
    if ("swipe right" in joined_text) or ("swipe to the right" in joined_text):
        return "swipe right"
    elif ("swipe left" in joined_text) or ("swipe to the left" in joined_text):
        return "swipe left"
    elif "tap and hold" in joined_text:
        return "tap and hold"
    elif "scrub" in joined_text:
        return "scrub"
    elif "tap" in joined_text:
        return "tap"