## Kernel: PyTorch

 - Create DataFrame that contains image name and a list of all words found with easyOCR

In [1]:
import easyocr
import os
import pandas as pd
from PIL import Image
import numpy as np
import time
import spacy
from spacy_langdetect import LanguageDetector

import de_core_news_sm
import en_core_web_sm

from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0

import torch
torch.cuda.is_available()

True

In [2]:
df = pd.DataFrame(columns=["Path", "Words"])

reader = easyocr.Reader(['en', 'de'], gpu=True) 

PATH = "../../Data/PlakateBayreuth"
ROTATIONS = [0, 90, 180, 270]

In [7]:
func = np.vectorize(lambda t: len(t) > 2)

def find_most_likely_words_array(all_words):
    all_words_len = [0, 0, 0, 0]
    
    for i, words in enumerate(all_words):
        
        try:
            long_words = np.array(words)[func(words)]
        except:
            long_words = np.array([])
        
        try:
            for word in long_words:
                try:
                    word = word.strip()
                    lan = detect(word)
                    if lan == "de" or lan == "en" or lan == "fr":
                        all_words_len[i] += 1
                except:
                    pass
        except:
            pass
        
    return np.argmax(all_words_len)

In [8]:
all_paths = []

for name in os.listdir(PATH):
    file_path = os.path.join(PATH, name)
    all_paths.append(file_path)

In [9]:
all_paths = all_paths
print(len(all_paths))

17540


In [10]:
start_time = time.time() 

for i, name in enumerate(all_paths):        
    file_path = os.path.join(PATH, name)
    all_words = []
    
    for angle in ROTATIONS:
        im = Image.open(file_path)
        out = im.rotate(angle)
        out.save(file_path)
    
        result = reader.readtext(file_path)
        words = [r[1] for r in result]
        all_words.append(words)
    
    index = find_most_likely_words_array(all_words)
    
    df = df.append({"Path": file_path,
                    "Words_0": all_words[0],
                    "Words_90": all_words[1],
                    "Words_180": all_words[2],
                    "Words_270": all_words[3],
                    "Most_Likely": all_words[index]}, ignore_index=True)
     
    if i % 500 == 0:
        elapsed_time = time.time() - start_time
        strf_time = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
        print(f"[{i}/{len(all_paths)}] Processed - Duration: {strf_time}")

[0/17540] Processed - Duration: 00:00:03
[500/17540] Processed - Duration: 00:37:32
[1000/17540] Processed - Duration: 01:21:17
[1500/17540] Processed - Duration: 01:57:05
[2000/17540] Processed - Duration: 02:26:10
[2500/17540] Processed - Duration: 02:57:33
[3000/17540] Processed - Duration: 03:32:39
[3500/17540] Processed - Duration: 04:08:38
[4000/17540] Processed - Duration: 04:53:41
[4500/17540] Processed - Duration: 05:36:12
[5000/17540] Processed - Duration: 06:13:48
[5500/17540] Processed - Duration: 06:46:11
[6000/17540] Processed - Duration: 07:22:10
[6500/17540] Processed - Duration: 07:55:33
[7000/17540] Processed - Duration: 08:32:41
[7500/17540] Processed - Duration: 09:01:58
[8000/17540] Processed - Duration: 09:32:02
[8500/17540] Processed - Duration: 10:04:16
[9000/17540] Processed - Duration: 10:33:38
[9500/17540] Processed - Duration: 11:07:24
[10000/17540] Processed - Duration: 11:40:35
[10500/17540] Processed - Duration: 12:09:03
[11000/17540] Processed - Duration

In [11]:
df.to_csv("poster_text.csv", encoding='utf-8', index=False)

In [12]:
df.head()

Unnamed: 0,Path,Words,Most_Likely,Words_0,Words_180,Words_270,Words_90
0,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"[FORUM, Kirchner, Das, expressionistische Expe...","[0, adhvie irquur? Ma SchnrerrfrWnrm (orluin, ...","[SmnquH 7qeusneqJeY, vI0Z '6'4 '$*62, JUMUI3d...","[], 1, |, p, o;, 4, |, &, TAdfse '777haat) env...","[FORUM, Kirchner, Das, expressionistische Expe..."
1,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"[SEST, I4, g, ' ^ ., 1472, Das Pa on Daumier,...","[K0, 0, (, 8, 3, 4, 4, 91, 2, `, 3]","[SEST, I4, g, ' ^ ., 1472, Das Pa on Daumier,...","[3, 9, ~a, 5, 5, 7, p, K, 5]","[anneTesnos, pun, Iowumnpd, UOAe sea, ss, ' '..."
2,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"[U, M 1 V R (; E R, KUNSTHALLE, LICHTWARK, REV...","[4, 2, 2, 2, ~, 5, ~, 1, =]","[U, M 1 V R (; E R, KUNSTHALLE, LICHTWARK, REV...","[=, 1, 2, [, =, 0]","[aaLISIAH&, XdVMLH)I7, FTIVHLSNNY, M a ') M .1..."
3,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"[PRIS O NS, AUSSTELLUNGE, DER FRIEDRICH-SCHLLE...","[3, 5, 2, I, 9, 7, 2, ~, 1, 1, 5, 9]","[g, Aunau, +lo7 I7nf ` Z7e & IVW '0 1, VNJC LV...","[2, %, p, 1, , 2, ~, 8a, I/, =, 5, 2]","[PRIS O NS, AUSSTELLUNGE, DER FRIEDRICH-SCHLLE..."
4,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"[Christoph Brech, it's about time, 10 Mai, 14 ...","[8g, ;, 3, 1, "", 9, 98, 6, 9, 2, 8, 1]","[Christoph Brech, it's about time, 10 Mai, 14 ...","[1, 5, 6, 3, 6, 85, 2, 1, 4, 5, g, 8, g, 8g, []","[qu uoqejedooy ui, a4o1xsIuUF vor ""IS pun, Suo..."


In [13]:
df.shape

(17786, 7)

In [14]:
df2 = pd.read_csv("poster_text.csv")

In [15]:
df2.shape

(17786, 7)

In [16]:
df2.head()

Unnamed: 0,Path,Words,Most_Likely,Words_0,Words_180,Words_270,Words_90
0,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"['FORUM', 'Kirchner', 'Das', 'expressionistisc...","['0', 'adhvie irquur? Ma SchnrerrfrWnrm (orlui...","['SmnquH 7qeusneqJeY', ""vI0Z '6'4 '$*62"", 'JU...","[']', '1', '|', 'p', 'o;', '4', '|', '&', ""TAd...","['FORUM', 'Kirchner', 'Das', 'expressionistisc..."
1,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"['SEST', 'I4', 'g', "" ' ^ ."", '1472', 'Das Pa ...","['K0', '0', '(', '8', '3', '4', '4', '91', '2'...","['SEST', 'I4', 'g', "" ' ^ ."", '1472', 'Das Pa ...","['3', '9', '~a', '5', '5', '7', 'p', 'K', '5']","['anneTesnos', 'pun', 'Iowumnpd', 'UOAe sea',..."
2,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"['U', 'M 1 V R (; E R', 'KUNSTHALLE', 'LICHTWA...","['4', '2', '2', '2', '~', '5', '~', '1', '=']","['U', 'M 1 V R (; E R', 'KUNSTHALLE', 'LICHTWA...","['=', '1', '2', '[', '=', '0']","['aaLISIAH&', 'XdVMLH)I7', 'FTIVHLSNNY', ""M a ..."
3,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"['PRIS O NS', 'AUSSTELLUNGE', 'DER FRIEDRICH-S...","['3', '5', '2', 'I', '9', '7', '2', '~', '1', ...","['g', 'Aunau', ""+lo7 I7nf ` Z7e & IVW '0 1"", '...","['2', '%', 'p', '1', '', '2', '~', '8a', 'I/',...","['PRIS O NS', 'AUSSTELLUNGE', 'DER FRIEDRICH-S..."
4,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"['Christoph Brech', ""it's about time"", '10 Mai...","['8g', ';', '3', '1', '""', '9', '98', '6', '9'...","['Christoph Brech', ""it's about time"", '10 Mai...","['1', '5', '6', '3', '6', '85', '2', '1', '4',...","['qu uoqejedooy ui', 'a4o1xsIuUF vor ""IS pun',..."


In [17]:
df2

Unnamed: 0,Path,Words,Most_Likely,Words_0,Words_180,Words_270,Words_90
0,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"['FORUM', 'Kirchner', 'Das', 'expressionistisc...","['0', 'adhvie irquur? Ma SchnrerrfrWnrm (orlui...","['SmnquH 7qeusneqJeY', ""vI0Z '6'4 '$*62"", 'JU...","[']', '1', '|', 'p', 'o;', '4', '|', '&', ""TAd...","['FORUM', 'Kirchner', 'Das', 'expressionistisc..."
1,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"['SEST', 'I4', 'g', "" ' ^ ."", '1472', 'Das Pa ...","['K0', '0', '(', '8', '3', '4', '4', '91', '2'...","['SEST', 'I4', 'g', "" ' ^ ."", '1472', 'Das Pa ...","['3', '9', '~a', '5', '5', '7', 'p', 'K', '5']","['anneTesnos', 'pun', 'Iowumnpd', 'UOAe sea',..."
2,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"['U', 'M 1 V R (; E R', 'KUNSTHALLE', 'LICHTWA...","['4', '2', '2', '2', '~', '5', '~', '1', '=']","['U', 'M 1 V R (; E R', 'KUNSTHALLE', 'LICHTWA...","['=', '1', '2', '[', '=', '0']","['aaLISIAH&', 'XdVMLH)I7', 'FTIVHLSNNY', ""M a ..."
3,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"['PRIS O NS', 'AUSSTELLUNGE', 'DER FRIEDRICH-S...","['3', '5', '2', 'I', '9', '7', '2', '~', '1', ...","['g', 'Aunau', ""+lo7 I7nf ` Z7e & IVW '0 1"", '...","['2', '%', 'p', '1', '', '2', '~', '8a', 'I/',...","['PRIS O NS', 'AUSSTELLUNGE', 'DER FRIEDRICH-S..."
4,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"['Christoph Brech', ""it's about time"", '10 Mai...","['8g', ';', '3', '1', '""', '9', '98', '6', '9'...","['Christoph Brech', ""it's about time"", '10 Mai...","['1', '5', '6', '3', '6', '85', '2', '1', '4',...","['qu uoqejedooy ui', 'a4o1xsIuUF vor ""IS pun',..."
...,...,...,...,...,...,...,...
17781,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"['MIKE', 'BIDLO', '<NOT WARHOL>', '0', '24', '...","['MIKE', 'BIDLO', '<NOT WARHOL>', '0', '24', '...","['Grauskala #13', 'M', 'BIG.', '28', '=', '~',...","['HDIJOZ', '67 IvnOLn', ""S66l II*8L -'vl"", '43...","['69 /iu', '79 8UU 81', '82 83', '84 85', ..."
17782,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"['GH0w PALASTI', 'von und mit', 'Dieter Thomas...","['GH0w PALASTI', 'von und mit', 'Dieter Thomas...","['Grauskala #13', 'M', 'Big.', '5', ';', ' ', ...","['J7Z@', '""1SS8q ueu }491S Ua}aMZ wap }W', '08...","['79 860 81', '89*9091', '#', '8', ';', '1', '..."
17783,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"['Ein neuer deutscherAbenteuer-Farbfilm', 'mit...","['Ein neuer deutscherAbenteuer-Farbfilm', 'mit...","['Grauskala #13', 'M', 'BIg.', '8 5', 'H', '5'...","['sni €', '""98""41nf9199', 'q1@H21 w! uouynpoud...","['8GU 81', '85 86', '910 91', '9', 'Hh', 'H'..."
17784,../../Data/PlakateBayreuth/../../Data/PlakateB...,,"['Jiu', 'u', '72', '79 8U 81', '89 9/0 91', 'J...","['Jiu', 'u', '72', '79 8U 81', '89 9/0 91', 'J...","['[', '1', '2', '5', '1', '1', '7', '1', '2', ...","['Grauskala #13', 'M', 'BIG', 'Suh1oua_Syoh)in...","['2', '11', '1', '94 7', '1', '6', '2', '2', '..."
