In [None]:
import re

import langdetect
import pandas as pd
from pyrootutils import setup_root
from tqdm import tqdm

root = setup_root(".", pythonpath=True)

langdetect.DetectorFactory.seed = 0

In [None]:
# Both MediaEval 2015 splits are read from tab-separated ``tweets.txt`` files
# located under the project root resolved by ``setup_root``.
dev_data = pd.read_csv(
    root / "data/image-verification-corpus-master/mediaeval2015/devset/tweets.txt",
    sep="\t",
)

# %%
test_data = pd.read_csv(
    root / "data/image-verification-corpus-master/mediaeval2015/testset/tweets.txt",
    sep="\t",
)

In [None]:
# Stack the dev and test splits into one frame. ``ignore_index=True`` gives the
# result a fresh 0..n-1 index; without it each split keeps its own row labels,
# so labels repeat and label-based lookups such as ``all_data.tweetId[0]``
# return several rows instead of one.
all_data = pd.concat([dev_data, test_data], axis=0, ignore_index=True)

In [None]:
# Ordered (pattern, replacement) pairs applied by ``clean_text``.
# The order is significant: e.g. curly quotes are normalised before the
# contraction rules run, and URLs are removed before punctuation is stripped.
_CLEAN_TEXT_SUBSTITUTIONS = (
    # Typographic quotes -> ASCII quotes.
    ("\u2019|\u2018", "'"),
    ("\u201c|\u201d", '"'),
    # General punctuation, currency symbols and letter-like symbols -> space.
    ("[\u2000-\u206F]", " "),
    ("[\u20A0-\u20CF]", " "),
    ("[\u2100-\u214F]", " "),
    # Re-join "http: //..." and then drop every URL.
    (r"http:\ ", "http:"),
    (r"http[s]?:[^\ ]+", " "),
    # HTML entities and literal double quotes.
    (r"&gt;", " "),
    (r"&lt;", " "),
    (r"&quot;", " "),
    (r"\"", " "),
    # Re-join "# tag" into "#tag".
    (r"#\ ", "#"),
    # Literal backslash-n sequences, then any remaining backslash.
    (r"\\n", " "),
    (r"\\", " "),
    # Brackets of every kind.
    (r"[\(\)\[\]\{\}]", r" "),
    # Emoji, pictographs and dingbats.
    (
        "[" "\U0001F300-\U0001F64F" "\U0001F680-\U0001F6FF" "\u2600-\u26FF\u2700-\u27BF]+",
        r" ",
    ),
    # Expand common English contractions.
    (r"\'s", " is "),
    (r"\'ve", " have "),
    (r"\'re", " are "),
    (r"\'d", " had "),
    (r"\'ll", " will "),
    (r"\'m", " am"),
    (r"n\'t", " not"),
    # Make sure hashtags and mentions start a new token.
    (r"#", " #"),
    (r"@", " @"),
    # Remaining punctuation -> space.
    (r"[\!\?\.\,\+\-\$\%\^\>\<\=\:\;\*\(\)\{\}\[\]\/\~\&\'\|]", " "),
)


def clean_text(text):
    """Normalise a tweet into lower-case, whitespace-separated tokens.

    The text is lower-cased, run through ``_CLEAN_TEXT_SUBSTITUTIONS`` in
    order (quote normalisation, URL/entity/emoji removal, contraction
    expansion, punctuation stripping) and finally collapsed so that tokens
    are separated by single spaces with no leading/trailing whitespace.

    Args:
        text: The raw tweet. ``str`` is used as is, ``bytes`` is decoded as
            UTF-8, ``None``/NaN (a missing cell) yields ``""`` and any other
            value is converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).

    Raises:
        UnicodeDecodeError: If ``text`` is ``bytes`` that are not valid UTF-8.
    """
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    elif not isinstance(text, str):
        # Missing values reach this function as None or float NaN when it is
        # applied over a DataFrame column; treat them as empty text instead of
        # failing with AttributeError.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = text.lower()
    for pattern, replacement in _CLEAN_TEXT_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)

    # split()/join() both trims the ends and collapses whitespace runs.
    return " ".join(text.split())

In [None]:
def detection_lang(text: str):
    """Return the language code ``langdetect`` assigns to ``text``.

    Any exception raised by the detector is reported through ``tqdm.write``
    (the offending text first, then the error message) and the placeholder
    code ``"unk"`` is returned instead of propagating the error.
    """
    try:
        return langdetect.detect(text)
    except Exception as err:
        # Report through tqdm.write rather than print, as the function is run
        # under a tqdm progress bar.
        tqdm.write(f"text: {text}")
        tqdm.write(str(err))
    return "unk"

In [None]:
# ``tqdm.pandas`` registers ``progress_apply`` with the given bar label, so it
# is called once per stage; previously the cleaning pass was also displayed as
# "Detecting language".
tqdm.pandas(desc="Cleaning text")
all_data["text"] = all_data["tweetText"].progress_apply(clean_text)

tqdm.pandas(desc="Detecting language")
all_data["lang"] = all_data["text"].progress_apply(detection_lang)

In [None]:
# Peek at the tweet id(s) stored under row label 0.
all_data["tweetId"][0]

In [None]:
# Export id + cleaned text of every row not detected as English.
all_data.loc[all_data["lang"] != "en", ["tweetId", "text"]].to_excel(
    "all_data.xlsx",
    index=False,
)

In [None]:
# Preview the first ten rows not detected as English.
all_data.loc[all_data["lang"] != "en"].head(10)

In [None]:
from src.utils.google_trans_new.google_trans_new import google_translator

In [None]:
# Google Translate client (vendored google_trans_new) routed through an HTTPS proxy.
# NOTE(review): `translator` is rebound by later cells (googletrans, then Baidu),
# so this instance is only used if those cells are skipped.
translator = google_translator(timeout=5, proxies=dict(https="172.22.112.1:7890"))

In [None]:
from googletrans import Translator
from httpcore import SyncHTTPProxy

In [None]:
# googletrans client sending plain-HTTP traffic through the same proxy host/port.
# NOTE(review): `translator` is rebound again by the Baidu client further down.
translator = Translator(
    proxies=dict(http=SyncHTTPProxy((b"http", b"172.22.112.1", 7890, b""))),
)

In [None]:
import hashlib
import json
import random
from hashlib import md5

import requests


class BaiDuFanyi:
    """Small client for the Baidu Fanyi translation HTTP endpoint.

    Requests are signed with ``md5(appid + q + salt + secret)`` and always ask
    for automatic source-language detection with English as the target.

    Args:
        appKey: The application id issued by Baidu.
        appSecret: The secret key paired with ``appKey``.
        timeout: Value passed as ``timeout`` to ``requests.post`` so a stalled
            connection cannot block a long translation run forever.
    """

    def __init__(self, appKey, appSecret, timeout=10):
        self.url = "https://fanyi-api.baidu.com/api/trans/vip/translate"
        self.appid = appKey
        self.secretKey = appSecret
        self.fromLang = "auto"
        self.toLang = "en"
        # One salt per client instance; it is part of every request signature.
        self.salt = random.randint(32768, 65536)
        self.header = {"Content-Type": "application/x-www-form-urlencoded"}
        self.timeout = timeout

    def BdTrans(self, text):
        """Translate ``text`` to English and return the translated string.

        Args:
            text: The text to translate. Blank or non-string values are not
                sent to the API.

        Returns:
            The translated text. When the response contains several
            ``trans_result`` entries their ``dst`` values are joined with
            newlines. ``""`` is returned for blank/non-string input or when
            the response carries no usable ``trans_result`` (a warning with
            the API's error fields is emitted in the latter case).

        Raises:
            Whatever ``requests.post`` or ``response.json()`` raise on
            transport failures or non-JSON responses.
        """
        import warnings

        # Nothing to translate: skip the network round trip (and avoid a
        # TypeError when concatenating a non-string into the signature).
        if not isinstance(text, str) or not text.strip():
            return ""

        # MD5 is mandated by the API's signing scheme, not used for security.
        raw_sign = self.appid + text + str(self.salt) + self.secretKey
        sign = hashlib.md5(raw_sign.encode(encoding="utf-8")).hexdigest()
        data = {
            "appid": self.appid,
            "q": text,
            "from": self.fromLang,
            "to": self.toLang,
            "salt": self.salt,
            "sign": sign,
        }
        # Send the POST request.
        response = requests.post(
            self.url,
            params=data,
            headers=self.header,
            timeout=self.timeout,
        )
        # The response body is JSON.
        payload = response.json()
        try:
            return "\n".join(segment["dst"] for segment in payload["trans_result"])
        except (KeyError, IndexError, TypeError):
            # Keep the best-effort contract (empty string) but surface why the
            # translation is missing instead of hiding it.
            if isinstance(payload, dict):
                detail = "error_code={!r}, error_msg={!r}".format(
                    payload.get("error_code"), payload.get("error_msg")
                )
            else:
                detail = "unexpected response {!r}".format(payload)
            warnings.warn("Baidu translation failed: " + detail, RuntimeWarning)
            return ""


if __name__ == "__main__":
    import os

    # Credentials must not live in the notebook: read the APP ID and secret key
    # issued by Baidu from the environment instead.
    appKey = os.environ.get("BAIDU_FANYI_APP_ID", "")  # APP ID obtained when registering
    appSecret = os.environ.get("BAIDU_FANYI_APP_SECRET", "")  # secret key paired with the APP ID
    if not appKey or not appSecret:
        raise RuntimeError(
            "Set the BAIDU_FANYI_APP_ID and BAIDU_FANYI_APP_SECRET environment "
            "variables before running the Baidu translation cells."
        )
    BaiduTranslate_test = BaiDuFanyi(appKey, appSecret)
    Results = BaiduTranslate_test.BdTrans("Hello, World!")  # phrase to translate
    print(Results)

In [None]:
# Rebind `translator` to the Baidu client; the translation cells below use this instance.
translator = BaiDuFanyi(appKey=appKey, appSecret=appSecret)

In [None]:
# Smoke-test the Baidu client with a short Chinese phrase; the cell displays the returned string.
translator.BdTrans("你好，世界！")

In [None]:
# Rows whose detected language is not English. ``.copy()`` makes this an
# independent frame so that adding the ``translated_text`` column later does
# not write into a slice of ``all_data`` (SettingWithCopyWarning).
non_en_data = all_data[all_data.lang != "en"].copy()

In [None]:
# (rows, columns) of the subset whose detected language is not "en".
non_en_data.shape

In [None]:
# Relabel the pandas progress bar for the translation pass.
tqdm.pandas(desc="Translating")

# One ``BdTrans`` call per cleaned text; results land in a new column.
non_en_data["translated_text"] = non_en_data["text"].progress_apply(translator.BdTrans)

In [None]:
# Attach translations to the full table; rows without a match in
# ``non_en_data`` get NaN in ``translated_text`` (left join on tweetId).
all_data = pd.merge(
    left=all_data,
    right=non_en_data[["tweetId", "translated_text"]],
    how="left",
    on="tweetId",
)

In [None]:
# Persist id, cleaned text and translation side by side for inspection.
non_en_data.loc[:, ["tweetId", "text", "translated_text"]].to_csv(
    "translated_text_map.csv",
    index=False,
)

In [None]:
# Map each tweet id (as a string, so it can serve as a JSON key) to its
# translation; if an id occurs more than once the last occurrence wins.
translated_dict = dict(
    zip(
        map(str, non_en_data["tweetId"]),
        non_en_data["translated_text"],
    )
)

In [None]:
# Write the id -> translation map as JSON. The ``with`` block guarantees the
# file is flushed and closed; the previous bare ``open()`` left the handle to
# the garbage collector.
with open("translated_text_map.json", "w", encoding="utf-8") as json_file:
    json.dump(translated_dict, json_file)