## 1. Select English sentences from the dataset

In [1]:
import copy

import nltk
import pandas as pd
import spacy
from langdetect import detect, DetectorFactory, detect_langs
from langdetect.lang_detect_exception import LangDetectException
from nltk import tokenize
from polyglot.detect import Detector
from spacy.language import Language
from spacy_langdetect import LanguageDetector
from tqdm import tqdm

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thoma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### 1.1 Delete duplicates

In [6]:
# Load the raw export; header=0 replaces the file's own header row with `names`.
data = pd.read_csv(
    "./data/dataset_cleand-1.csv",
    header=0,
    names=['id', 'native', 'language', 'text'],
)

In [7]:
def _one_line(raw):
    """Join the lines of `raw` with single spaces and trim both ends."""
    return ' '.join(raw.splitlines()).strip()


# Collapse every multi-line text into a single line.
data['text'] = data['text'].apply(_one_line)

In [8]:
# Preview the first five rows (head() defaults to n=5).
data.head()

Unnamed: 0,id,native,language,text
0,0,French,"[{'language': 'Spanish', 'level': 45, 'profici...","Je suis Monique 58 ans, de Metz dans le Nord-E..."
1,1,Japanese,"[{'language': 'English', 'level': 45, 'profici...",Hi there:) I live in Tokyo. I have been learni...
2,2,Spanish<br/>Catalan,"[{'language': 'English', 'level': 45, 'profici...",Hi everyone! I'm looking for a language excha...
3,3,Spanish,"[{'language': 'German', 'level': 19.2857142857...",Wuieres aprender español? Yo te puedo ayudar
4,4,Marathi<br/>Hindi,"[{'language': 'English', 'level': 45, 'profici...",I am second year undergraduate student from Mu...


In [9]:
# Drop rows that repeat the same (native, language, text) combination,
# reporting the row count before and after.
rows_before = len(data)
data = data.drop_duplicates(subset=['native', 'language', 'text'])
print(f'Orginal Length: {rows_before}')
print(f'New Length: \t{len(data)}')

Orginal Length: 82645
New Length: 	28936


### 1.2 Use langdetect to extract English sentences

In [10]:
# Turn off pandas' chained-assignment warning for the rest of the session.
pd.set_option('mode.chained_assignment', None)

In [11]:
# Load the spaCy pipeline named "en_core_web_sm"; the next cell appends the
# `language_detector` component to it.
nlp = spacy.load("en_core_web_sm")

In [12]:
# Register the langdetect-backed component with spaCy and attach it to `nlp`.
@Language.factory("language_detector")
def get_lang_detector(nlp, name):
    """Factory for the ``language_detector`` pipe.

    Both arguments are accepted to match the factory signature but are not
    used; a fresh ``LanguageDetector`` is returned on every call.
    """
    return LanguageDetector()


# Append the detector as the last component of the pipeline.
nlp.add_pipe('language_detector', last=True)

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x21d6c00fb50>

In [13]:
# Pin langdetect's seed before any detect_langs() call below; per langdetect's
# README this is how detection results are made repeatable across runs.
DetectorFactory.seed = 42

In [14]:
# Keep, for every row, only the sentences langdetect classifies as English with
# sufficient probability; rows with no such sentence are left out of
# `filtered_dataset`.
#
# NOTE(review): this cell reads ./data/english_only.csv, the same path that the
# `filtered_dataset.to_csv(...)` cell further down writes, and it does not use
# the de-duplicated `data` frame built above — confirm the intended input file.
csv_file = pd.read_csv("./data/english_only.csv", encoding='utf-8')
# Explicit copy: `dataset` is modified below, so it must own its data rather
# than be a slice of `csv_file` (chained assignment on a slice is not
# guaranteed to write through).
dataset = csv_file[['id','native','language','text']].copy()
original_datset = copy.deepcopy(dataset)

# Minimum langdetect probability for a sentence to count as English.
MIN_EN_PROB = 0.9

text_col = dataset.columns.get_loc('text')
kept_rows = []  # positions of rows that contain at least one English sentence

for i in tqdm(range(len(dataset))):
    english_sent = []
    for sent in tokenize.sent_tokenize(dataset.iat[i, text_col]):
        try:
            langs = detect_langs(sent)
        except LangDetectException:
            # langdetect could not score this sentence; skip it.
            continue
        # Probability of the "en" candidate, or None when English is not listed.
        en_prob = next((cand.prob for cand in langs if cand.lang == "en"), None)
        if en_prob is not None and en_prob > MIN_EN_PROB:
            english_sent.append(sent)
    if english_sent:
        # Replace the row's text with its English sentences only.
        dataset.iat[i, text_col] = " ".join(english_sent)
        kept_rows.append(i)

# Select all kept rows at once instead of concatenating one-row frames inside
# the loop, which re-copies the growing frame on every iteration.
filtered_dataset = dataset.iloc[kept_rows]
c = len(kept_rows)

print(c)

100%|████████████████████████████████████████████████████████████████████████████| 25706/25706 [10:58<00:00, 39.04it/s]

25013





In [16]:
# Persist the langdetect-filtered rows. With to_csv's defaults the DataFrame
# index is written as an extra, unnamed first column.
# NOTE: this is the same path the langdetect cell above reads its input from,
# so running this cell overwrites that input.
filtered_dataset.to_csv('./data/english_only.csv')

### 1.3 Use polyglot to further filter for English sentences

In [27]:
# Smoke-test polyglot on a sample that mixes English and Italian sentences.
text_content = (
    "Hello. I'm looking for language partners for practice my English. "
    "I can help you with Italian of course. I wish to know new cultures. "
    "Ciao. Sto cercando dei partners linguistici per parlare inglese. "
    "Io posso aiutare a migliorare l'italiano. Desidero conoscere nuove culture."
)
#print(detect_langs(text_content))
print(Detector(text_content))

Prediction is reliable: True
Language 1: name: Italian     code: it       confidence:  55.0 read bytes:   593
Language 2: name: English     code: en       confidence:  44.0 read bytes:  1067
Language 3: name: un          code: un       confidence:   0.0 read bytes:     0


In [17]:
# Raise polyglot's detector logger to ERROR so lower-level messages are not
# printed for every text while looping over the whole dataset below.
from polyglot.detect.base import logger as polyglot_logger
polyglot_logger.setLevel("ERROR")

In [29]:
# Keep the rows of `dataset` whose text polyglot reports as English with a
# confidence above the threshold below.
#
# NOTE(review): the detector test output above shows polyglot confidences on a
# 0-100 scale (55.0 / 44.0), so 0.99 admits almost any text in which English is
# detected at all; a 99 % cut-off would be 99.0. Value kept as-is — confirm.
# NOTE(review): this iterates `dataset`, not `filtered_dataset`, so rows the
# langdetect step left out are considered again here — confirm this is intended.
POLYGLOT_MIN_EN_CONFIDENCE = 0.99

polyglot_rows = []  # positions of the rows that pass the polyglot check

for i, text in enumerate(tqdm(dataset['text'])):
    try:
        langs = Detector(text).languages
    except Exception:
        # Best effort: skip any text polyglot cannot analyse. Unlike a bare
        # `except:`, this still lets KeyboardInterrupt stop the run.
        continue
    # Confidence of the "en" entry, or None when English is not listed.
    en_confidence = next((cand.confidence for cand in langs if cand.code == "en"), None)
    if en_confidence is not None and en_confidence > POLYGLOT_MIN_EN_CONFIDENCE:
        polyglot_rows.append(i)

# Select every kept row in one go instead of concatenating one-row frames in
# the loop.
filtered_dataset_polyg = dataset.iloc[polyglot_rows]
c = len(polyglot_rows)
print(c)

100%|███████████████████████████████████████████████████████████████████████████| 25706/25706 [00:30<00:00, 851.03it/s]

25689





In [31]:
# Preview the first five polyglot-filtered rows (head() defaults to n=5).
filtered_dataset_polyg.head()

Unnamed: 0,id,native,language,text
0,1,Japanese,"[{'language': 'English', 'level': 45, 'profici...",Hi there:) I live in Tokyo. I have been learni...
1,2,Spanish<br/>Catalan,"[{'language': 'English', 'level': 45, 'profici...",I'm looking for a language exchange to improve...
2,4,Marathi<br/>Hindi,"[{'language': 'English', 'level': 45, 'profici...",I am second year undergraduate student from Mu...
3,6,Italian,"[{'language': 'English', 'level': 45, 'profici...",i am here for learn english and i help you wit...
4,7,Chinese (Mandarin),"[{'language': 'English', 'level': 45, 'profici...","Hello, I like to make friends around the world..."


In [33]:
# Persist the polyglot-filtered rows. With to_csv's defaults the DataFrame
# index is written as an extra, unnamed first column.
filtered_dataset_polyg.to_csv('./data/english_only_refined.csv')