In [12]:
import langdetect
import pandas as pd
from pyrootutils import setup_root
from tqdm import tqdm
import re

root = setup_root(".", pythonpath=True)

langdetect.DetectorFactory.seed = 0

In [13]:
dev_data = pd.read_csv(
    root / "data/image-verification-corpus-master/mediaeval2015/devset/tweets.txt",
    delimiter="\t",
)

# %%
test_data = pd.read_csv(
    root / "data/image-verification-corpus-master/mediaeval2015/testset/tweets.txt",
    delimiter="\t",
)

In [14]:
all_data = pd.concat([dev_data, test_data], axis=0)

In [15]:
def clean_text(text):
    try:
        text = text.decode("utf-8").lower()
    except Exception as ex:
        text = text.encode("utf-8").decode("utf-8").lower()
    text = re.sub("\u2019|\u2018", "'", text)
    text = re.sub("\u201c|\u201d", '"', text)
    text = re.sub("[\u2000-\u206F]", " ", text)
    text = re.sub("[\u20A0-\u20CF]", " ", text)
    text = re.sub("[\u2100-\u214F]", " ", text)
    text = re.sub(r"http:\ ", "http:", text)
    text = re.sub(r"http[s]?:[^\ ]+", " ", text)
    text = re.sub(r"&gt;", " ", text)
    text = re.sub(r"&lt;", " ", text)
    text = re.sub(r"&quot;", " ", text)
    text = re.sub(r"\"", " ", text)
    text = re.sub(r"#\ ", "#", text)
    text = re.sub(r"\\n", " ", text)
    text = re.sub(r"\\", " ", text)
    text = re.sub(r"[\(\)\[\]\{\}]", r" ", text)
    text = re.sub(
        "[" "\U0001F300-\U0001F64F" "\U0001F680-\U0001F6FF" "\u2600-\u26FF\u2700-\u27BF]+",
        r" ",
        text,
    )
    text = re.sub(r"\'s", " is ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " had ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"#", " #", text)
    text = re.sub(r"@", " @", text)
    text = re.sub(r"[\!\?\.\,\+\-\$\%\^\>\<\=\:\;\*\(\)\{\}\[\]\/\~\&\'\|]", " ", text)
    text = text.strip()
    text = " ".join(text.split())

    return text

In [16]:
def detection_lang(text: str):
    try:
        lang = langdetect.detect(text)
    except Exception as e:
        tqdm.write(f"text: {text}")
        tqdm.write(str(e))
        lang = "unk"
    return lang

In [17]:
tqdm.pandas(desc="Detecting language")

all_data["text"] = all_data.tweetText.progress_apply(clean_text)
all_data["lang"] = all_data.text.progress_apply(detection_lang)

Detecting language: 100%|██████████| 18032/18032 [00:00<00:00, 28069.04it/s]
Detecting language:  67%|██████▋   | 12014/18032 [01:12<00:36, 166.00it/s]

text: 
No features in text.


Detecting language: 100%|██████████| 18032/18032 [01:42<00:00, 176.68it/s]


In [33]:
all_data.tweetId[0]

0    263046056240115712
0    578854927457349632
Name: tweetId, dtype: int64

In [34]:
all_data[all_data.lang != "en"][["tweetId", "text"]].to_excel("all_data.xlsx", index=False)

In [35]:
all_data[all_data.lang != "en"].head(10)

Unnamed: 0,tweetId,tweetText,userId,imageId(s),username,timestamp,label,lang,text
0,263046056240115712,¿Se acuerdan de la película: “El día después d...,21226711,sandyA_fake_46,iAnnieM,Mon Oct 29 22:34:01 +0000 2012,fake,es,¿se acuerdan de la película el día después de ...
1,262995061304852481,@milenagimon: Miren a Sandy en NY! Tremenda i...,192378571,sandyA_fake_09,CarlosVerareal,Mon Oct 29 19:11:23 +0000 2012,fake,es,@milenagimon miren a sandy en ny tremenda imag...
2,262979898002534400,"Buena la foto del Huracán Sandy, me recuerda a...",132303095,sandyA_fake_09,LucasPalape,Mon Oct 29 18:11:08 +0000 2012,fake,es,buena la foto del huracán sandy me recuerda a ...
13,262990978611286016,Good luck #ny #newyork #usa #hurricane #sandy ...,125724906,sandyA_fake_29,gsevigny,Mon Oct 29 18:55:10 +0000 2012,fake,cy,good luck #ny #newyork #usa #hurricane #sandy
35,263422787513901056,Mans best friend #love #hurricane #sandy #dog ...,174085679,sandyA_fake_21,CafeBustelo711,Tue Oct 30 23:31:01 +0000 2012,fake,da,mans best friend #love #hurricane #sandy #dog
66,263019769895190528,"Que Deus proteja o Soho, a All Saints e a XL !...",83630316,sandyA_fake_09,nilmar,Mon Oct 29 20:49:34 +0000 2012,fake,ca,que deus proteja o soho a all saints e a xl #s...
67,263276356165586944,#sandy #hurricane #fun #usa http://t.co/I61JSFID,86832033,sandyA_fake_47,hakosanart,Tue Oct 30 13:49:09 +0000 2012,fake,es,#sandy #hurricane #fun #usa
149,263142454511951873,Акула на шоссейной магистрали \n#hurricane #sa...,376885703,sandyA_fake_05,policy_by,Tue Oct 30 04:57:04 +0000 2012,fake,bg,акула на шоссейной магистрали #hurricane #sandy
179,263107559597146112,Nunca imaginei imaginar essa cena na vida real...,24748643,sandyA_fake_17,lyviagamerco,Tue Oct 30 02:38:25 +0000 2012,fake,it,nunca imaginei imaginar essa cena na vida real...
222,263009179613134848,Holy frankenstorm! #newyork #frankenstorm #hur...,327202954,sandyA_fake_34,maddieg_rae,Mon Oct 29 20:07:29 +0000 2012,fake,no,holy frankenstorm #newyork #frankenstorm #hurr...


In [36]:
from src.utils.google_trans_new.google_trans_new import google_translator

In [41]:
translator = google_translator(
    proxies={"https": "172.22.112.1:7890"},
    timeout=5,
)

In [1]:
from googletrans import Translator
from httpcore import SyncHTTPProxy

In [3]:
translator = Translator(
    proxies={"http": SyncHTTPProxy((b'http', b'172.22.112.1', 7890, b''))})

In [28]:
import requests
import random
import json
import hashlib
from hashlib import md5

class BaiDuFanyi:
    def __init__(self, appKey, appSecret):
        self.url = 'https://fanyi-api.baidu.com/api/trans/vip/translate'
        self.appid = appKey
        self.secretKey = appSecret
        self.fromLang = 'auto'
        self.toLang = 'en'
        self.salt = random.randint(32768,65536)
        self.header = {'Content-Type': 'application/x-www-form-urlencoded'}
    def BdTrans(self,text):
        sign = self.appid + text + str(self.salt) + self.secretKey
        md = hashlib.md5()
        md.update(sign.encode(encoding='utf-8'))
        sign =md.hexdigest()
        data = {
            "appid": self.appid,
            "q": text,
            "from": self.fromLang,
            "to": self.toLang,
            "salt": self.salt,
            "sign": sign
        }
        response = requests.post(self.url, params=data, headers=self.header)  # 发送post请求
        text = response.json()  # 返回的为json格式用json接收数据
        # print(text)
        try:
            results = text['trans_result'][0]['dst']
        except Exception:
            results = ""
        return results

if __name__=='__main__':
    appKey = '20221012001387816'   #你在第一步申请的APP ID
    appSecret = 'ThXoAK3TTPMmnaOKX0yF' #公钥
    BaiduTranslate_test = BaiDuFanyi(appKey,appSecret)
    Results = BaiduTranslate_test.BdTrans("Hello, World!")#要翻译的词组
    print(Results)


Hello, World!


In [29]:
translator = BaiDuFanyi(appKey, appSecret)

In [27]:
translator.BdTrans("你好，世界！")

'Hello, World!'

In [20]:
non_en_data = all_data[all_data.lang != "en"]

In [22]:
non_en_data.shape

(4477, 9)

In [30]:
tqdm.pandas(desc="Translating")

non_en_data["translated_text"] = non_en_data.text.progress_apply(translator.BdTrans)

Translating: 100%|██████████| 4477/4477 [47:18<00:00,  1.58it/s] 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_en_data["translated_text"] = non_en_data.text.progress_apply(translator.BdTrans)


In [38]:
all_data = all_data.merge(non_en_data[["tweetId", "translated_text"]], on="tweetId", how="left")

In [39]:
non_en_data[["tweetId", "text", "translated_text"]].to_csv("translated_text_map.csv", index=False)

In [45]:
translated_dict ={str(k): v for (k, v) in non_en_data[["tweetId", "translated_text"]].itertuples(index=False, name=None)}

In [47]:
import json

json.dump(translated_dict, open("translated_text_map.json", "w"))