In [None]:
import re

def remove_unwanted_symbols(sent: str):
    """

    remove unwanted symbols

    """
    
    
    symbols = '[\(\)|/#&%*\[\]《》\{\}~<=>`——-]+'
    sent = re.sub(symbols, "", sent)
    return sent.strip()

In [None]:
print(remove_unwanted_symbols("Hel{l}o..."))

In [6]:
!pip install camel-tools
!camel_data full

In [9]:
import camel_tools as camel
from camel_tools.utils.stringutils import force_encoding, force_unicode
from camel_tools.utils.charmap import CharMapper

def ar_clean(mapper, sent: str):
    sent = force_unicode(sent)
    return mapper.map_string(sent)


mapper = CharMapper.builtin_mapper('arclean')
print(ar_clean(mapper, "تُريدُ إخْباري ماذا أنت تَعتقدُ 122؟"))

تريد إخباري ماذا أنت تعتقد 122?


In [10]:
from camel_tools.tokenizers.word import simple_word_tokenize
print(simple_word_tokenize("اريد ان اذهب الى السينما, المتحف"))

['اريد', 'ان', 'اذهب', 'الى', 'السينما', ',', 'المتحف']


In [11]:
# Go to URL
def rmURL(line):
    import  re
    pattern = re.compile(r'http[a-zA-Z0-9.?/&=:]*')
    return pattern.sub('', line.strip())
print(rmURL("mohamed is going to visit http://mabdelrehim.com/about jhjh"))

mohamed is going to visit  jhjh


In [12]:
!pip install demoji



In [13]:
import demoji
demoji.download_codes()
print(demoji.replace_with_desc("'I bet you didn't know that 🙋, 🙋‍♂️, and 🙋‍♀️ are three different emojis."))

Downloading emoji data ...
... OK (Got response in 0.16 seconds)
Writing emoji data to /home/nlp-mt/.demoji/codes.json ...
... OK
'I bet you didn't know that :person raising hand:, :person raising hand:‍♂️, and :woman raising hand: are three different emojis.


In [14]:
import string
print(string.printable)

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	



In [15]:
def lang_detect(src_sent: str, trg_sent: str, src_lang: str, trg_lang: str , fasttext_model):

    """

    Takes a parallel pair of sentences in a parallel corpus and returns whether to remove the pair
    or not based on
        1- A large portion of the source sentence is in a language other than the source language
        2- A large portion of the target sentence is in a language other than the target language

    """

    detected_src, confidence_src = fasttext_model.predict(src_sent, k=1)
    detected_src = detected_src[0].replace("__label__", "")
    confidence_src = confidence_src[0]

    detected_trg, confidence_trg = fasttext_model.predict(trg_sent, k=1)
    detected_trg = detected_trg[0].replace("__label__", "")
    confidence_trg = confidence_trg[0]

    #print(detected_src,"|||||||",detected_src)

    if (not (detected_src == src_lang)) or (not (detected_trg == trg_lang)):
        print(f'src {detected_src} trg {detected_trg}')
        return True
    elif (confidence_src < 0.85) or (confidence_trg < 0.85):
        print(f'confidence src {confidence_src} confidence trg {confidence_trg}')
        return True
    else:
        print(f'src {detected_src} trg {detected_trg}')
        print(f'confidence src {confidence_src} confidence trg {confidence_trg}')
        return False


In [16]:
!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

--2021-02-19 13:09:34--  https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 104.22.74.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 131266198 (125M) [application/octet-stream]
Saving to: 'lid.176.bin.1'


2021-02-19 13:09:44 (13.2 MB/s) - 'lid.176.bin.1' saved [131266198/131266198]



In [18]:
import fasttext
fasttext_model = fasttext.load_model('lid.176.bin')
lang_detect(src_sent="诊所将继续遵循 PHE 制定的 COVID-19 方案，有感染风险者应转诊至控制通道，而不是亲自前往诊所就诊。", trg_sent="ستظل العيادات تتبع بروتوكول هيئة الصحة العامة في إنجلترا لكوفيد-19 فيما يتعلق بالأشخاص المعرضين لخطر الإصابة بالعدوى الذين يجب وضع إشارات تحذيرية لتحديدهم في مسار الاحتواء، عوضًا عن حضورهم جسديًّا.؟", src_lang="zh", trg_lang="ar" , fasttext_model=fasttext_model)



src zh trg ar
confidence src 0.9829135537147522 confidence trg 0.9992139935493469


False

In [19]:
!pip install laserembeddings
!pip install laserembeddings[zh]
!pip install laserembeddings[ja]
!python -m laserembeddings download-models

Downloading models into /anaconda/envs/data-env2/lib/python3.8/site-packages/laserembeddings/data

✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt    

✨ You're all set!


In [29]:
from laserembeddings import Laser
from sklearn.metrics.pairwise import cosine_similarity
laser = Laser()

def laser_filter(laser_model, src: str, trg: str, src_lang: str, trg_lang: str):
    """
    
    The laser model tries to embed sentences that are of the same meaning with similar embeddings
    basic experimentation tells that sentences with cosine similarity score >= 0.8 are of acceptable quality
    
    """
    embeddings_a = laser_model.embed_sentences(
        [src],
        lang=src_lang)
    embeddings_b = laser_model.embed_sentences(
        [trg],
        lang=trg_lang)
    similatiry_score = cosine_similarity(embeddings_a, embeddings_b).flatten()[0]
    if similarity_score < 0.8:
        return True
    else:
        return False
    

In [32]:
get_laser_similarity("There should be no memory", "应该 没有 什么 记忆 了", "en", "zh")

[0.8142539]


In [9]:

import argparse
from pathlib import Path
import pandas as pd
import os
import time
import fasttext
from laserembeddings import Laser
from tqdm import tqdm



def batched_laser_filter(laser_model, sources, targets, src_lang: str, trg_lang: str, batch_size=4096):
    """
    
    The laser model tries to embed sentences that are of the same meaning with similar embeddings
    basic experimentation tells that sentences with cosine similarity score >= 0.8 are of acceptable quality
    
    """
    start = time.time()
    i = 0
    embeddings_a = []
    with tqdm(total=len(sources) // batch_size) as pbar:
        while i < len(sources):
            embeddings_a.extend(laser_model.embed_sentences(
            sources[i:i+batch_size],
            lang=src_lang))
            i = i + batch_size
            pbar.update(1)
    end = time.time()
    elapsed = end - start
    print(f"---- Calculated laser embeddings for {len(sources)} sentences from source language in {elapsed // 60},  mins , {elapsed} % 60,  secs")

    start = time.time()
    i = 0
    embeddings_b = []
    with tqdm(total=len(targets) // batch_size) as pbar:
        while i < len(targets):
            embeddings_b.extend(laser_model.embed_sentences(
            targets[i:i+batch_size],
            lang=trg_lang))
            i = i + batch_size
            pbar.update(1)
    end = time.time()
    elapsed = end - start
    print(f"---- Calculated laser embeddings for {len(targets)} sentences from target language in {elapsed // 60},  mins , {elapsed} % 60,  secs")

    similarities = []
    start = time.time()
    for a, b in tqdm(zip(embeddings_a, embeddings_b)):
        similarities.append(cosine_similarity([a], [b]))
    end = time.time()
    elapsed = end - start
    print(f"---- Calculated cosine for {len(targets)} pairs in {elapsed // 60},  mins , {elapsed} % 60,  secs")


laser = Laser()
data_file_src = open(os.path.join("../open-subtitles-2018", "data" + "." + "zh"), "r")
data_file_trg = open(os.path.join("../open-subtitles-2018", "data" + "." + "en"), "r")

src_sents = data_file_src.readlines()
trg_sents = data_file_trg.readlines()

df = pd.DataFrame(
    {
        "zh": src_sents,
        "en": trg_sents
    }
)
batched_laser_filter(laser, src_sents, trg_sents, "zh", "en")
print(f'---- Number of samples in before cleaning: ', len(df))

  1%|          | 34/2735 [02:09<2:51:33,  3.81s/it]


KeyboardInterrupt: 

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity([[1, 1, 1]], [[1, 10, 100]]))

[[0.63764677]]
