## 1. Select english sentences from the dataset

In [1]:
import pandas as pd
from tqdm import tqdm
import copy
from spacy.language import Language
from spacy_langdetect import LanguageDetector
import spacy
from langdetect import detect, DetectorFactory, detect_langs
from polyglot.detect import Detector
from nltk import tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thoma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### 1.1 Delete duplicates

In [5]:
data = pd.read_csv("./data/dataset_cleand-1.csv", names=['id', 'native', 'language', 'text'], header=0)

In [6]:
data['text'] = data['text'].apply(lambda x : ' '.join(x.splitlines()).strip())

In [7]:
data.head(5)

Unnamed: 0,id,native,language,text
0,0,French,"[{'language': 'Spanish', 'level': 45, 'profici...","Je suis Monique 58 ans, de Metz dans le Nord-E..."
1,1,Japanese,"[{'language': 'English', 'level': 45, 'profici...",Hi there:) I live in Tokyo. I have been learni...
2,2,Spanish<br/>Catalan,"[{'language': 'English', 'level': 45, 'profici...",Hi everyone! I'm looking for a language excha...
3,3,Spanish,"[{'language': 'German', 'level': 19.2857142857...",Wuieres aprender español? Yo te puedo ayudar
4,4,Marathi<br/>Hindi,"[{'language': 'English', 'level': 45, 'profici...",I am second year undergraduate student from Mu...


In [9]:
# removing duplicates
print(f'Orginal Length: {len(data)}')
data = data.drop_duplicates(['native','language','text'])
print(f'New Length: \t{len(data)}')

Orginal Length: 82645
New Length: 	28936


### 1.2 use langdetect to extract english sentences

In [10]:
# disable warning
pd.options.mode.chained_assignment = None 

In [11]:
new_data = []
for i in tqdm(range(len(data))):
    entry = data.iloc[i]
    try:
        detected_langs = detect_langs(entry['text'])
        for lang in detected_langs:
            if lang.lang == 'en' and lang.prob > 0.5:
                new_data.append(entry)
                '''
                print(entry['text'])
                print(detected_langs)
                '''
                new_data[len(new_data)- 1]["probabilities"] = detected_langs 
                if lang.prob >= 0.99:
                    new_data[len(new_data)- 1]['english_only'] = True
                else:
                    new_data[len(new_data) - 1]['english_only'] = False
    except:
        print("weird entry")
        print(entry)

 14%|██████████▋                                                                 | 4055/28936 [00:29<02:46, 149.27it/s]

weird entry
id                                                       7324
native                                                 Arabic
language    [{'language': 'English', 'level': 45, 'profici...
text        I love Japan♡♡♡♡♡♡♡ ♡♡♡♡♡♡♡♡♡♡♡♡♡♡♡♡♡♡♡♡♡♡♡♡♡♡♡♡♡
Name: 7324, dtype: object


 69%|███████████████████████████████████████████████████▉                       | 20056/28936 [02:36<01:17, 114.97it/s]

weird entry
id                                                      58860
native                                                Persian
language    [{'language': 'English', 'level': 45, 'profici...
text        ᴄᴏғғᴇᴇ ᴀᴅᴅɪᴄᴛ ◑ᴸᴼᴼᴷᴵᴺᴳ ᶠᴼᴿ ᴬ ᴳᴼᴼᴰ ᴮᴼᴼᴷ, ᵐᵒᵛⁱᵉ&...
Name: 58860, dtype: object


100%|███████████████████████████████████████████████████████████████████████████| 28936/28936 [03:46<00:00, 127.54it/s]


In [12]:
new_data_copy = copy.deepcopy(new_data)

In [13]:
new_data_df = pd.DataFrame(new_data_copy)

In [14]:
new_data_df.to_csv("./data/english_only.csv")

### 1.3 use polyglot to further filter for english sentences

In [15]:
nlp = spacy.load("en_core_web_sm")

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
def get_lang_detector(nlp, name):
    return LanguageDetector()


Language.factory("language_detector", func=get_lang_detector)
nlp.add_pipe('language_detector', last=True)

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x245959c1d50>

In [17]:
DetectorFactory.seed = 42

In [18]:
csv_file = pd.read_csv("./data/english_only.csv", encoding='utf-8')
dataset = csv_file[['id','native','language','text']]
original_datset = copy.deepcopy(dataset)

c = 0
filtered_dataset = pd.DataFrame()

for i in tqdm(range(len(dataset['text']))):
    splitted_text = tokenize.sent_tokenize(dataset['text'][i])
    english_sent = []
    for sent in splitted_text:
        try:
            langs = detect_langs(sent)
            has_key = False
            lang_index = 0
            for l in range(len(langs)):
                if langs[l].lang == "en":
                    has_key = True
                    lang_index = l
                    break
            if has_key and langs[lang_index].prob > 0.9:
                english_sent.append(sent)
        except:
            pass
    if len(english_sent) != 0:
        dataset['text'][i] = " ".join(english_sent)
        # print(dataset['text'][i] ==  " ".join(english_sent))
        filtered_dataset = pd.concat([filtered_dataset, pd.DataFrame(dataset.iloc[[i]])])
        c += 1  

print(c)

100%|████████████████████████████████████████████████████████████████████████████| 25706/25706 [11:13<00:00, 38.15it/s]

25013





In [19]:
filtered_dataset.to_csv('./data/english_only_refined.csv')

In [21]:
# Test detector
text_content = "Hello. I'm looking for language partners for practice my English. I can help you with Italian of course. I wish to know new cultures. Ciao. Sto cercando dei partners linguistici per parlare inglese. Io posso aiutare a migliorare l'italiano. Desidero conoscere nuove culture."
#print(detect_langs(text_content))
#print(Detector(text_content))

Prediction is reliable: True
Language 1: name: Italian     code: it       confidence:  55.0 read bytes:   593
Language 2: name: English     code: en       confidence:  44.0 read bytes:  1067
Language 3: name: un          code: un       confidence:   0.0 read bytes:     0


In [22]:
filtered_dataset_polyg = pd.DataFrame()
c = 0
for i in range(len(dataset['text'])):
    try: 
        langss = Detector(dataset['text'][i])
        langs = langss.languages
        if dataset['text'][i] == text_content:
            print(langss)
        has_key = False
        lang_index = 0
        for l in range(len(langs)):
            if langs[l].code == "en":
                has_key = True
                lang_index = l
                break
        if has_key and langs[lang_index].confidence > 0.99:
            filtered_dataset_polyg = pd.concat([filtered_dataset_polyg, pd.DataFrame(dataset.iloc[[i]])])
            c += 1
    except:
        pass
print(c)

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

25688


In [18]:
print(filtered_dataset_polyg)

         id                native  \
1         2   Spanish<br/>Catalan   
4         6               Italian   
9        23   Spanish<br/>Catalan   
11       27               Spanish   
13       34               Spanish   
...     ...                   ...   
3225  32988  Spanish<br/>Galician   
3227  32990  Spanish<br/>Galician   
3228  32991  Spanish<br/>Galician   
3231  32995  Spanish<br/>Galician   
3233  32997  Spanish<br/>Galician   

                                               language  \
1     [{'language': 'English', 'level': 45, 'profici...   
4     [{'language': 'English', 'level': 45, 'profici...   
9     [{'language': 'Danish', 'level': 6.42857142857...   
11    [{'language': 'English', 'level': 45, 'profici...   
13    [{'language': 'English', 'level': 45, 'profici...   
...                                                 ...   
3225  [{'language': 'English', 'level': 45, 'profici...   
3227  [{'language': 'English', 'level': 45, 'profici...   
3228  [{'language': 'Eng

In [30]:
filtered_dataset_polyg.to_csv('./data/only_english_polyg.csv')