### primary data processing

- removing reactions
- removing the `text` field, which duplicates the content of `text_entities`

In [1]:
import json
import loadotenv
import os
from openai import OpenAI
import re

# Load environment variables via loadotenv before reading the key
loadotenv.load_env()

# None when OPENAI_API_KEY is not set in the environment
token = os.environ.get("OPENAI_API_KEY")

In [None]:
# Create the OpenAI API client. No key is passed explicitly here, so it
# presumably reads OPENAI_API_KEY from the environment — TODO confirm.
client = OpenAI()

In [2]:
# Read the export into memory; undecodable bytes are dropped (errors='ignore')
with open('TGDat/result.json', mode='r', encoding='utf-8', errors='ignore') as export_file:
    data = json.loads(export_file.read())

In [56]:
# Write an indented, non-ASCII-escaped copy of the export that is easy to read by eye
with open('TGDat/result_h.json', mode='w', encoding='utf-8', errors='ignore') as readable_file:
    readable_file.write(json.dumps(data, ensure_ascii=False, indent=2))

In [3]:
# Show every entry whose type is not 'message', together with its list index
for i, message in enumerate(data['messages']):
    if message['type'] == 'message':
        continue
    print(i)
    print(json.dumps(message, ensure_ascii=False, indent=4))

In [4]:
# Clean the export in place:
#   * drop the 'reactions' and 'text' keys from every message,
#   * remove every entry whose type is not 'message'.
to_delete = []

for i, message in enumerate(data['messages']):
    if message['type'] != 'message':
        to_delete.append(i)
    # pop() with a default is a no-op when the key is absent
    message.pop('reactions', None)
    message.pop('text', None)

# Delete from the end of the list: removing an element shifts every later
# element down by one, so deleting in ascending order would remove the wrong
# messages (or raise IndexError) as soon as there is more than one index.
for i in reversed(to_delete):
    del data['messages'][i]

In [5]:
# Save the current state of `data` (after the clean-up cell) as an indented JSON file
with open('TGDat/result_1.json', mode='w', encoding='utf-8', errors='ignore') as cleaned_file:
    cleaned_file.write(json.dumps(data, ensure_ascii=False, indent=2))

In [None]:
# Tally messages by the first attachment key they carry, checked in the order
# photo -> file -> video; each bucket records how many and which list indices.
types = {kind: {'count': 0, 'ids': []} for kind in ('photo', 'file', 'video')}

for i, message in enumerate(data['messages']):
    matched_kind = next((kind for kind in types if kind in message), None)
    if matched_kind is None:
        # No attachment key at all: print the index so it can be inspected
        print(i)
        continue
    bucket = types[matched_kind]
    bucket['count'] += 1
    bucket['ids'].append(i)


In [None]:
# Inspect one photo message: the entry at position 30 of the recorded photo indices
sample_photo_index = types['photo']['ids'][30]
print(data['messages'][sample_photo_index])

### Edit format from telegram to my own

format:
```json
{
    "messages": [
        {
            "id": 1,
            "content": {
                "plain_text": "some text",
                "md_text": "some *md* text",
                "image": {"file": "photos/photo_32", "width": 1280, "height": 720},
                "file": {"file_name": "SQR.pdf", "file": "files/file_1.pdf", "mime_type": "application/pdf"},
                "video": {"file_name": "video.MP4", "file": "videos/video_1", "media_type": "video_file", "duration_seconds": 60, "width": 1280, "height": 720}
            }
        }
    ]
}
```

In [6]:
def convert_to_plain_text(text_entities):
    """Concatenate the 'text' of every entity, ignoring all formatting.

    Args:
        text_entities: iterable of dicts, each with a 'type' and a 'text' key.

    Returns:
        The entity texts joined in order; an empty string for no entities.

    Raises:
        KeyError: if an entity lacks 'type' or 'text'.
    """
    pieces = []
    for entity in text_entities:
        entity['type']  # every entity is expected to carry a type, as before
        pieces.append(entity['text'])
    return "".join(pieces)

In [7]:
def convert_to_md_text(text_entities):
    """Render a list of text entities as a single Markdown-style string.

    Each entity's 'text' is wrapped according to its 'type'; entity types
    that are not listed below are emitted as their raw text.

    Args:
        text_entities: iterable of dicts with 'type' and 'text' keys. 'pre'
            entities must also have 'language', 'text_link' entities 'href'.

    Returns:
        The rendered fragments joined in order; an empty string for no entities.

    Raises:
        KeyError: if an entity lacks a key its type requires.
    """
    renderers = {
        'plain': lambda e: e['text'],
        'bold': lambda e: f"**{e['text']}**",
        'italic': lambda e: f"*{e['text']}*",
        'underline': lambda e: f"__{e['text']}__",
        'strikethrough': lambda e: f"~~{e['text']}~~",
        'blockquote': lambda e: f"> {e['text']}",
        'pre': lambda e: f"```{e['language']}\n{e['text']}```",
        'spoiler': lambda e: f"||{e['text']}||",
        'text_link': lambda e: f"[{e['text']}]({e['href']})",
    }

    fragments = []
    for entity in text_entities:
        render = renderers.get(entity['type'])
        # Unknown entity types fall back to their unformatted text
        fragments.append(entity['text'] if render is None else render(entity))
    return "".join(fragments)

In [20]:
# Convert every cleaned Telegram message into the custom format: an id plus a
# 'content' dict holding the plain/markdown text and any attached media.
new_data = {"messages": []}

for message in data['messages']:
    entities = message['text_entities']
    content = {
        'plain_text': convert_to_plain_text(entities),
        'md_text': convert_to_md_text(entities),
    }

    # NOTE(review): each attachment is wrapped under a repeated key
    # (content['image']['image'], content['video']['video'], ...), which differs
    # from the flat layout shown in the format description. The shape is kept
    # as-is so anything already consuming it keeps working — confirm intent.
    if 'photo' in message:
        content['image'] = {"image": {
            'file': message['photo'],
            'width': message['width'],
            'height': message['height'],
        }}

    if 'file' in message:
        # A missing or null mime type is treated as "not a video"
        mime_type = message.get('mime_type') or ''

        if mime_type.startswith('video/'):
            video = {'file': message['file']}
            # Copy metadata only when present so one incomplete message does
            # not abort the whole conversion with a KeyError.
            for key in ('media_type', 'duration_seconds', 'width', 'height', 'file_name'):
                if key in message:
                    video[key] = message[key]
            content['video'] = {"video": video}
        else:
            # Same guard as the video branch: 'file_name' (and 'mime_type')
            # may be absent on some file messages.
            attachment = {key: message[key]
                          for key in ('file_name', 'file', 'mime_type')
                          if key in message}
            content['file'] = {"file": attachment}

    new_data['messages'].append({"id": message['id'], "content": content})

In [21]:
# Pretty-print the whole converted dataset for a visual check
new_data_json = json.dumps(new_data, ensure_ascii=False, indent=1)
print(new_data_json)

{
 "messages": [
  {
   "id": 25,
   "content": {
    "plain_text": "📱 - Консольное приложение на Go для подготовки к собесу\n\nВключает в себя базовое тестирование по таким темам, как: \nканалы, структуры, мьютексы, WaitGroup. \n\nА также имеет задания, по типу найти ошибку в данном коде\n\n🙂Сделайте что-то подобное для других ЯП, в качестве пет-проекта\n\nПодписаться",
    "md_text": "📱 **- Консольное **[приложение](https://github.com/rusinikita/trainer?tab=readme-ov-file#complex-questions)** на Go для подготовки к собесу**\n\nВключает в себя базовое тестирование по таким **темам**, как: \n```\nканалы, структуры, мьютексы, WaitGroup. \n\nА также имеет задания, по типу найти ошибку в данном коде```\n\n🙂Сделайте что-то подобное для других ЯП, в качестве пет-проекта\n\n> Подписаться",
    "image": {
     "image": {
      "file": "photos/photo_2@19-05-2024_15-42-18.jpg",
      "width": 1280,
      "height": 646
     }
    }
   }
  },
  {
   "id": 26,
   "content": {
    "plain_text": "✔️