## Resource azure:
1. resource group: jakakolwiek nazwa (u mnie CaptionGeneration), region west europe
2. Azure translator: resourceGroup z 1. -> przycisk Add -> wyszukać Translator
  - region west europe
  - nazwa jakakolwiek (U mnie CaptionGenerationTranslator - możesz dodać 2 na końcu)
  - pricing tier - F0 - **WAŻNE!**
  - Create

3. Po wejściu w translator w Azure Portal, w menu po lewej wybrać „Keys and Endpoint” i uzupełnić poniżej `subscription_key` kluczem nr 1
4. uruchomić po kolei wszystkie komórki 


In [1]:
from google.colab import drive
# Mount Google Drive under /content/drive so the dataset and the translation
# pickles used by the cells below are reachable as regular files.
# force_remount=True requests a fresh mount even if one already exists.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
def load_doc(filename):
    """Read a whole text file into memory and return its contents.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete file contents as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises, which an
    # explicit close() after the read does not guarantee.
    with open(filename, 'r') as file:
        return file.read()

def load_descriptions(doc):
    """Group captions from a token document by image id.

    Every usable line is split on whitespace. The first token identifies the
    image (everything from its first '.' onward is discarded) and the remaining
    tokens, re-joined with single spaces, form the caption. A trailing ' .' is
    removed from the caption.

    Args:
        doc: Full text of the token file, one caption per line.

    Returns:
        dict mapping image id -> list of caption strings in order of appearance.
    """
    mapping = dict()

    for line in doc.split('\n'):
        # Lines shorter than two characters cannot hold an id and a caption.
        if len(line) < 2:
            continue

        tokens = line.split()
        # A whitespace-only line passes the length check but splits to nothing;
        # skip it instead of failing with IndexError on tokens[0].
        if not tokens:
            continue

        image_id = tokens[0].split('.')[0]
        image_desc = ' '.join(tokens[1:])

        # Drop the detached sentence-final period.
        if image_desc.endswith(' .'):
            image_desc = image_desc[:-2]

        mapping.setdefault(image_id, []).append(image_desc)
    return mapping

# Root folder of the mounted Google Drive that holds the dataset.
dataset_dir = "/content/drive/My Drive/"
descriptions_filename = f"{dataset_dir}Flickr8k_text/Flickr8k.token.txt"

# Read the raw token file, then group its captions per image id.
descriptions_doc = load_doc(descriptions_filename)
descriptions = load_descriptions(descriptions_doc)

print('Loaded %d descriptions' % len(descriptions))

Loaded 8092 descriptions


In [3]:
# Number of batches the descriptions are split into for translation.
n_parts = 28

def get_batch(batch_number):
  """Return the part of `descriptions` that belongs to one batch.

  Batches are numbered from 1 to `n_parts`. Each one holds
  len(descriptions) // n_parts consecutive images; the last batch additionally
  takes whatever is left over when the total is not divisible by `n_parts`.

  Args:
    batch_number: 1-based index of the requested batch.

  Returns:
    dict of image id -> list of captions for the batch, or None (after printing
    a message) when `batch_number` is outside 1..n_parts.
  """
  if batch_number > n_parts or batch_number <= 0:
    print("Invalid batch_number")
    return None
  batch_size = len(descriptions)//n_parts
  batch_start_idx = batch_size * (batch_number - 1)
  # batch_size * n_parts never exceeds len(descriptions), so clamping with
  # min() could not pick up the remainder; extend the final batch explicitly
  # so no image is silently left out.
  if batch_number == n_parts:
    batch_end_idx = len(descriptions)
  else:
    batch_end_idx = batch_size * batch_number
  processing_descriptions = dict(list(descriptions.items())[batch_start_idx:batch_end_idx])
  print("processing %d images" % len(processing_descriptions))
  processing_all_descriptions = [description for image_descriptions in processing_descriptions.values() for description in image_descriptions]
  print("processing %d descriptions" % len(processing_all_descriptions))
  # Reuse the flattened list rather than walking the nested dict a second time.
  characters_count = sum(len(desc) for desc in processing_all_descriptions)
  print("Processing %d characters" % characters_count)
  return processing_descriptions

In [4]:
import time
import os, requests, uuid, json

# Translator resource key. Prefer supplying it through the AZURE_TRANSLATOR_KEY
# environment variable so the secret is not stored in the notebook; the
# placeholder is kept as a fallback and can still be replaced by hand.
subscription_key = os.environ.get('AZURE_TRANSLATOR_KEY', '***')
endpoint = 'https://api.cognitive.microsofttranslator.com'

# Translate endpoint of API version 3.0, English -> Polish.
path = '/translate?api-version=3.0'
params = '&from=en&to=pl'
url = endpoint + path + params
headers = {
    'Ocp-Apim-Subscription-Key': subscription_key,
    # Must match the region the Translator resource was created in.
    'Ocp-Apim-Subscription-Region': 'westeurope',
    'Content-type': 'application/json',
    # Generated once when this cell runs, so every request that sends this
    # dict unchanged carries the same trace id.
    'X-ClientTraceId': str(uuid.uuid4())
}

def make_request(body, response_parser = None):
  """POST `body` to the translator URL and return the decoded JSON reply.

  Args:
    body: JSON-serialisable request payload (sent via the `json=` argument).
    response_parser: Optional callable applied to the decoded reply; its
      return value is returned instead of the raw reply.

  Returns:
    The decoded JSON reply, or `response_parser(reply)` when a parser is given.

  Raises:
    requests.HTTPError: When the service answers with a 4xx/5xx status.
    requests.Timeout: When the service does not answer within the timeout.
  """
  # Give every call its own trace id instead of reusing the one generated
  # when the shared headers dict was built.
  request_headers = dict(headers)
  request_headers['X-ClientTraceId'] = str(uuid.uuid4())

  # The timeout keeps a stalled connection from blocking a whole batch.
  reply = requests.post(url, headers=request_headers, json=body, timeout=60)
  # Fail here with the HTTP status rather than later inside the parser, which
  # expects the shape of a successful reply.
  reply.raise_for_status()

  response = reply.json()
  if response_parser is not None:
    response = response_parser(response)
  return response

def get_only_translations(response):
  """Collect the text of the first translation of every item in `response`.

  Args:
    response: Iterable of items, each with a 'translations' list whose entries
      carry a 'text' field.

  Returns:
    List with one translated string per item, in the same order.
  """
  texts = []
  for item in response:
    first_translation = item['translations'][0]
    texts.append(first_translation['text'])
  return texts

In [5]:
import pickle

def save_result(result, batch_number):
  """Pickle `result` into the per-batch translations file on Google Drive.

  Args:
    result: Object to serialise.
    batch_number: Batch index used as the file name suffix.
  """
  target_path = '/content/drive/My Drive/Translations/translated_descriptions_' + str(batch_number) + '.pkl'
  translation_pickle = open(target_path, "wb")
  try:
    pickle.dump(result, translation_pickle)
  finally:
    translation_pickle.close()
  

In [6]:
from tqdm.notebook import tqdm

def process_batch(batch_number, interval = 1):
  """Translate every caption of one batch and pickle the result.

  Captions of one image are sent together in a single request; the function
  sleeps `interval` seconds after each request. Results are saved only after
  the whole batch has been translated.

  Args:
    batch_number: 1-based batch index passed to get_batch and save_result.
    interval: Seconds to wait after each request.
  """
  processing_descriptions = get_batch(batch_number)
  if processing_descriptions is None:
    # get_batch returns None for an out-of-range batch number (and reports
    # it); stop here instead of failing with AttributeError on .items().
    return

  translations_result = {}

  for image, image_descriptions in tqdm(processing_descriptions.items()):
    # One request per image: all of its captions go into a single body.
    body = [{ 'text': desc } for desc in image_descriptions]
    translations = make_request(body, get_only_translations)
    translations_result[image] = translations
    time.sleep(interval)

  print('Saving results...')
  save_result(translations_result, batch_number)
  print('Pack %d done!' % batch_number)


In [None]:
# ALREADY DOWNLOADED

# number of batches: 28
# <1, 14> - Maciek
# <15, 28> - Iza

# for i in range(15, 29):
#   print('Processing %d' % i)
#   process_batch(i)
#   print('Waiting 1 min...')
#   time.sleep(60)

Processing 15
processing 289 images
processing 1445 descriptions
Processing 75982 characters


HBox(children=(FloatProgress(value=0.0, max=289.0), HTML(value='')))


Saving results...
Pack 15 done!
Waiting 1 min...
Processing 16
processing 289 images
processing 1445 descriptions
Processing 76586 characters


HBox(children=(FloatProgress(value=0.0, max=289.0), HTML(value='')))


Saving results...
Pack 16 done!
Waiting 1 min...
Processing 17
processing 289 images
processing 1445 descriptions
Processing 76403 characters


HBox(children=(FloatProgress(value=0.0, max=289.0), HTML(value='')))


Saving results...
Pack 17 done!
Waiting 1 min...
Processing 18
processing 289 images
processing 1445 descriptions
Processing 76588 characters


HBox(children=(FloatProgress(value=0.0, max=289.0), HTML(value='')))


Saving results...
Pack 18 done!
Waiting 1 min...
Processing 19
processing 289 images
processing 1445 descriptions
Processing 75732 characters


HBox(children=(FloatProgress(value=0.0, max=289.0), HTML(value='')))


Saving results...
Pack 19 done!
Waiting 1 min...
Processing 20
processing 289 images
processing 1445 descriptions
Processing 76995 characters


HBox(children=(FloatProgress(value=0.0, max=289.0), HTML(value='')))


Saving results...
Pack 20 done!
Waiting 1 min...
Processing 21
processing 289 images
processing 1445 descriptions
Processing 77690 characters


HBox(children=(FloatProgress(value=0.0, max=289.0), HTML(value='')))


Saving results...
Pack 21 done!
Waiting 1 min...
Processing 22
processing 289 images
processing 1445 descriptions
Processing 78275 characters


HBox(children=(FloatProgress(value=0.0, max=289.0), HTML(value='')))


Saving results...
Pack 22 done!
Waiting 1 min...
Processing 23
processing 289 images
processing 1445 descriptions
Processing 77450 characters


HBox(children=(FloatProgress(value=0.0, max=289.0), HTML(value='')))


Saving results...
Pack 23 done!
Waiting 1 min...
Processing 24
processing 289 images
processing 1445 descriptions
Processing 77920 characters


HBox(children=(FloatProgress(value=0.0, max=289.0), HTML(value='')))


Saving results...
Pack 24 done!
Waiting 1 min...
Processing 25
processing 289 images
processing 1445 descriptions
Processing 75936 characters


HBox(children=(FloatProgress(value=0.0, max=289.0), HTML(value='')))


Saving results...
Pack 25 done!
Waiting 1 min...
Processing 26
processing 289 images
processing 1445 descriptions
Processing 76426 characters


HBox(children=(FloatProgress(value=0.0, max=289.0), HTML(value='')))


Saving results...
Pack 26 done!
Waiting 1 min...
Processing 27
processing 289 images
processing 1445 descriptions
Processing 77823 characters


HBox(children=(FloatProgress(value=0.0, max=289.0), HTML(value='')))


Saving results...
Pack 27 done!
Waiting 1 min...
Processing 28
processing 289 images
processing 1445 descriptions
Processing 78602 characters


HBox(children=(FloatProgress(value=0.0, max=289.0), HTML(value='')))


Saving results...
Pack 28 done!
Waiting 1 min...


In [7]:
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were written by this notebook.
# The with-block closes the file handle, which pickle.load(open(...)) leaves open.
with open("/content/drive/My Drive/Translations/translated_descriptions_28.pkl", "rb") as translation_pickle:
  loaded_translation = pickle.load(translation_pickle)

# Total number of translated captions stored in this batch file.
sum([len(v) for v in loaded_translation.values()])

1445

In [8]:
# Merge the per-batch pickles (numbered 1..28) into one dictionary.
all_translations = {}

for n in range(28):
  filename = '/content/drive/My Drive/Translations/translated_descriptions_' + str(n+1) + '.pkl'
  # Close each batch file right after reading it instead of leaving 28
  # handles open.
  with open(filename, 'rb') as batch_pickle:
    loaded_translations = pickle.load(batch_pickle)
  all_translations.update(loaded_translations)

# Sanity check: every image must have exactly five translated captions.
all([len(v) == 5 for v in all_translations.values()])

True

In [9]:
# Persist the merged translations of all batches as a single pickle.
combined_path = '/content/drive/My Drive/Translations/translated_descriptions.pkl'
with open(combined_path, "wb") as translation_pickle:
    pickle.dump(all_translations, translation_pickle)

In [None]:
# Flatten the per-image caption lists into one list of translated captions.
all_translated_descriptions = []
for image_descriptions in all_translations.values():
  all_translated_descriptions.extend(image_descriptions)

# Whitespace-separated tokens of every caption, original casing preserved.
tokens = []
for desc in all_translated_descriptions:
  tokens.extend(desc.split())

# Case-insensitive frequency of every token.
words_counts = {}
for token in tokens:
  token = token.lower()
  words_counts[token] = words_counts.get(token, 0) + 1

# Most frequent first; the stable sort keeps first-seen order among ties.
dict(sorted(words_counts.items(), key=lambda c: c[1], reverse=True))

{'w': 23199,
 'na': 17535,
 'z': 9131,
 'i': 7497,
 'pies': 7009,
 'się': 5860,
 'mężczyzna': 4714,
 'kobieta': 3002,
 'przez': 2989,
 'chłopiec': 2803,
 'do': 2417,
 'po': 2326,
 'dziewczyna': 2235,
 'człowiek': 1965,
 'jest': 1873,
 'dziecko': 1871,
 'psy': 1871,
 'podczas': 1857,
 'gdy': 1745,
 'stoi': 1708,
 'dwie': 1705,
 'dwa': 1677,
 'brązowy': 1590,
 'skacze': 1559,
 'dzieci': 1519,
 'przed': 1518,
 'czarny': 1499,
 'biegnie': 1432,
 'dwóch': 1391,
 'osoba': 1389,
 'biały': 1312,
 'piłkę': 1224,
 'siedzi': 1223,
 'koszuli': 1193,
 'mały': 1170,
 'obok': 1156,
 'pobliżu': 1155,
 'grupa': 1101,
 ',': 1100,
 'powietrzu': 1098,
 'trawie': 1085,
 'ubrany': 1078,
 'nad': 1072,
 'osoby': 1025,
 'młody': 1002,
 'śniegu': 965,
 'trzyma': 960,
 'plaży': 950,
 'mężczyzn': 925,
 'za': 915,
 'wodzie': 883,
 'ma': 860,
 'rowerze': 825,
 'robi': 821,
 'patrzy': 796,
 'trzy': 778,
 'bawi': 772,
 'ustach': 769,
 'młoda': 760,
 'ubrana': 758,
 'dziewczynka': 749,
 'polu': 749,
 'ludzi': 725,
 'w