In [191]:
import pandas
import numpy
import gcld3
from pycountry import languages
import transformers
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import translators as ts
from autocorrect import Speller
import re
import fasttext

In [130]:
# returns text's language
# speed or FastText would be better

def detect_language(text):
    detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0, 
                                        max_num_bytes=100000)
    result = detector.FindLanguage(text=text)
    if languages.get(alpha_2=result.language) == None:
        return None
    return languages.get(alpha_2=result.language).name

In [131]:
## translates to English

def translate(text):
    return ts.google(text, if_use_cn_host=True)

In [132]:
## removes urls

def remove_url(text):
    text = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0–9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0–9@:%_\+.~#?&//=]*)', '', 
                   text, flags=re.MULTILINE)  # to remove links that start with HTTP/HTTPS in the tweet
    text = re.sub(r'[-a-zA-Z0–9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0–9@:%_\+.~#?&//=]*)', '', 
                   text, flags=re.MULTILINE) # to remove other url links
    return text

In [133]:
## removes emoji

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags 
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [134]:
## removes numbers and punctuations

def remove_nums_punctuations(text): 
    punctuations = '''!()-![]{};:+'"\,<>./?@#$%^&*_~'''
    nums = '0123456789'
    return ''.join([i for i in text if i not in punctuations and i not in nums])

In [135]:
## converting apostrophe/short words in the better form

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [136]:
## removes extra white spaces

def remove_extra_whitespaces(text):
    return re.sub(' {2,}', ' ', text)

In [137]:
## removes specific stop words

def remove_stop_words(text):
    stop_words=['in','of','at','a','the, i, them, you, their, myself, our, ours, me, he, him, it, ours']
    tokens = text.split(' ')
    return ' '.join([word for word in tokens if word not in stop_words])

In [178]:
## spell checker

def spell_check(text):
    spell = Speller('en')
    s = ' '.join([spell(w) for w in text.split(' ')])
    return s

In [177]:
## preprocessing

def preprocess(text):
    text = text.lower()
    text = remove_extra_whitespaces(text)
    text = remove_url(text)
    text = remove_emoji(text)
    text = remove_nums_punctuations(text)
    text = spell_check(text)
    text = decontracted(text)
    text = remove_stop_words(text)
    return text

In [None]:
TRAIN_PATH = '/Users/test/Downloads/archive/train.ft.txt'
TEST_PATH = '/Users/test/Downloads/archive/test.ft.txt'
NEW_TRAIN_PATH = '/Users/test/Downloads/archive/new_train.ft.txt'
NEW_TEST_PATH = '/Users/test/Downloads/archive/new_test.ft.txt'

In [194]:
train_txt = open(TRAIN_PATH, "r")
new_train_txt = open(NEW_TRAIN_PATH, "w")
p = 0.0
cnt = 0.0
for line in train_txt:
    if detect_language(line[10:]) == 'English':
        preprocessed = line[:10] + preprocess(line[10:])
        new_train_txt.write(preprocessed)
    if cnt == 36000:
        break
    if cnt == 360*p:
        print(str(p) + '%')
        p+=1
    cnt+=1
train_txt.close()
new_train_txt.close()

In [None]:
test_txt = open(TEST_PATH, "r")
new_test_txt = open(NEW_TEST_PATH, "w")
for line in test_txt:
    review = line[10:]
    language = detect_language(review)
    if language == None:
        continue
    if language != 'English':
        review = translate(review)
    preprocessed = line[:10] + preprocessed(review)
    new_test_txt.write(preprocessed)
test_txt.close()
new_test_txt.close()

In [192]:
model = fasttext.train_supervised(NEW_TRAIN_PATH, wordNgrams=2)

In [193]:
def print_results(sample_size, precision, recall):
    precision   = round(precision, 2)
    recall      = round(recall, 2)
    print(f'{sample_size=}')
    print(f'{precision=}')
    print(f'{recall=}')

print_results(*model.test(NEW_TEST_PATH))

sample_size=400000
precision=0.54
recall=0.54


In [None]:
def get_prediction(tp):
    if tp[0][0] == '__label__2':
        return 'Positive'
    else:
        return 'Negative'