In [1]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import re
import string
import nltk
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr, kendalltau
import gensim.downloader as api



In [2]:
# NLTK's English stop-word list, held as a set for fast membership tests.
stop = set(stopwords.words('english'))
# Every character of string.punctuation (the ASCII punctuation marks) as a set.
exclude = set(string.punctuation)
# WordNet lemmatizer instance available to the rest of the notebook.
lemma = WordNetLemmatizer()

# Data Importing and Sorting

In [3]:
# Each language-pair folder under 'corpus' is read through its 'scores.csv'.
# Keeping only the entries that actually contain that file skips stray items
# such as '.DS_Store' or 'scores_ru-en.csv' without failing when they are
# absent (list.remove raises ValueError for a missing item).
files = [
    entry for entry in os.listdir('corpus')
    if os.path.isfile(os.path.join('corpus', entry, 'scores.csv'))
]
scaler = MinMaxScaler()
for file_ in files:
    # The first two '-'-separated parts of the folder name are glued together
    # to form the notebook-level variable name (e.g. 'cs-en' -> csen).
    source_lang, target_lang = file_.split('-')[:2]
    name = source_lang + target_lang
    scores = pd.read_csv(os.path.join('corpus', file_, 'scores.csv'))
    scores = scores.drop(columns = ['source', 'annotators', 'z-score'])
    # Normalize the scores to values between 0 and 1; the scaler is refit for every file.
    scores['avg-score'] = scaler.fit_transform(scores['avg-score'].values.reshape(-1, 1)).ravel()
    # Later cells refer to the frames through these dynamically created names.
    vars()[name] = scores

In [4]:
# Stack the four into-English frames into one. DataFrame.append was deprecated
# in pandas 1.4 and removed in 2.0, so pd.concat is used instead; like the
# chained appends it returns a new frame and keeps each input's row labels.
english = pd.concat([csen, deen, ruen, zhen])

In [5]:
# Working copies of the enfi / enzh frames (presumably the English-Finnish and
# English-Chinese pairs), so later changes do not touch the loaded originals.
finnish = enfi.copy()
chinese = enzh.copy()

In [6]:
# Replace the row labels carried over from the individual frames with a fresh
# 0..n-1 index; drop=True discards the old labels instead of keeping them as a column.
english.reset_index(drop = True, inplace = True)

# Cleaning the corpus (updated cleaning function)

In [7]:
def clean(text_list,
          command_dict = None,
          english = True
          ):
    """
    Preprocess a sequence of strings according to the switches in command_dict.

    For every text the enabled steps run in this fixed order: strip markup
    tags, lowercase, replace numbers, replace '?'/'!', drop non-letters,
    remove stop words, lemmatize, stem.

    :param text_list: Iterable of strings (e.g. a list or a pandas Series).
    :param command_dict: Dict of boolean switches. Recognized keys:
        'lower' (lowercase the text), 'keep_numbers' (replace each run of
        digits with the token NUMBER), 'keep_expression' (replace each '?' or
        '!' with the token EXPRESSION), 'remove_char' (replace every
        non-letter with a space), 'remove_tag' (strip markup tags),
        'remove_stop' (drop stop words), 'lemmatize', 'stemmer'.
        A missing key, or None for the whole dict, leaves the step disabled.
    :param english: If True the English stop words and stemmer are used,
        otherwise the Finnish ones. Lemmatization is only applied for English.
    :return: List with one cleaned string per input text, in input order.
    """
    options = command_dict or {}

    def enabled(step):
        # A missing key means "step off" instead of a KeyError.
        return bool(options.get(step, False))

    lang = 'english' if english else 'finnish'

    # Build the NLTK helpers once, and only for the steps that are switched on.
    stop = set(stopwords.words(lang)) if enabled('remove_stop') else set()
    stem = SnowballStemmer(lang) if enabled('stemmer') else None
    # Only the English lemmatizer is wired up; for Finnish the step does nothing.
    lemma = WordNetLemmatizer() if enabled('lemmatize') and english else None

    # Finnish words contain å/ä/ö, which a plain a-z class would erase.
    non_letters = "[^a-zA-Z]" if english else "[^a-zA-ZåäöÅÄÖ]"

    updates = []
    # Iterate over the values themselves rather than text_list[j]: on a pandas
    # Series the latter is a label lookup and breaks when the index is not 0..n-1.
    for text in tqdm(text_list):

        #REMOVE TAGS
        # Must run first: the substitutions below rewrite '<', '>' and '!',
        # after which the markup can no longer be recognized.
        if enabled('remove_tag'):
            # Naming the parser keeps the result independent of which optional
            # parsers happen to be installed and avoids bs4's parser warning.
            text = BeautifulSoup(text, 'html.parser').get_text()

        #LOWERCASE TEXT
        if enabled('lower'):
            text = text.lower()

        #KEEP NUMBERS AS TOKENS
        if enabled('keep_numbers'):
            # One token per run of digits ("[\d+]" was a character class that
            # replaced every digit, and every literal '+', separately).
            text = re.sub(r"\d+", 'NUMBER', text)

        #KEEP '?' and '!' AS TOKENS
        if enabled('keep_expression'):
            # "[\?|\!]" also matched the '|' character.
            text = re.sub(r"[?!]", 'EXPRESSION', text)

        #REMOVE EVERYTHING THAT IS NOT A LETTER
        if enabled('remove_char'):
            text = re.sub(non_letters, ' ', text)

        #REMOVE STOP WORDS
        if enabled('remove_stop'):
            text = ' '.join([word for word in text.split(' ') if word not in stop])

        #LEMMATIZATION
        if lemma is not None:
            text = " ".join(lemma.lemmatize(word) for word in text.split())

        #STEMMER
        if stem is not None:
            text = " ".join(stem.stem(word) for word in text.split())

        updates.append(text)

    return updates

def clean_zh_stopwords(text_list, stopwords_set='merged'):
    """
    Remove Chinese stop words from space-separated texts.

    :param text_list: Iterable of strings whose tokens are separated by single spaces.
    :param stopwords_set: 'merged' removes the words of both stop-word files,
        'fst' only those of the first file, 'snd' only those of the second.
    :return: List with one filtered string per input text, in input order.
    :raises ValueError: If stopwords_set is not 'merged', 'fst' or 'snd'.
    """
    first_file = 'chinese_stopwords/chinese_stopwords1.txt'
    second_file = 'chinese_stopwords/chinese_stopwords2.txt'
    files_by_option = {
        'merged': (first_file, second_file),
        'fst': (first_file,),
        'snd': (second_file,),
    }

    # Reject an unknown option up front; previously it only surfaced later as
    # an unbound 'stop' variable inside the loop.
    if stopwords_set not in files_by_option:
        raise ValueError(
            "stopwords_set must be 'merged', 'fst' or 'snd', got %r" % (stopwords_set,)
        )

    # A set gives constant-time membership tests; the with-block closes each file.
    stop = set()
    for path in files_by_option[stopwords_set]:
        with open(path, 'r', encoding='utf-8') as handle:
            stop.update(line.strip() for line in handle)

    updates = []
    # Iterate over the values directly so a pandas Series with any index works.
    for text in tqdm(text_list):
        updates.append(' '.join([word for word in text.split(' ') if word not in stop]))

    return updates
        
    
def update_df(dataframe, list_updated):
    """
    Overwrite the "Text" column of dataframe in place with list_updated.

    The new values are wrapped in a one-column frame and applied through
    DataFrame.update; nothing is returned.
    """
    replacement = pd.DataFrame({"Text": list_updated})
    dataframe.update(replacement)

In [8]:
# Preprocessing switches handed to clean(), one per supported step.
cleaning_dict = {
    'lower': False,
    'keep_numbers': True,
    'keep_expression': False,
    'remove_char': True,
    'remove_stop': True,
    'remove_tag': False,
    'lemmatize': False,
    'stemmer': True,
}
# Start from the score column, then add the cleaned text columns one by one.
english_cleaned = english[['avg-score']].copy()
for column in ('reference', 'translation'):
    english_cleaned[column] = clean(english[column], cleaning_dict)

  0%|          | 0/77688 [00:00<?, ?it/s]

  0%|          | 0/77688 [00:00<?, ?it/s]

# Train, Dev & Test Split

In [9]:
# Hold out 20% of the rows first, then halve that held-out part into dev and test.
split_options = dict(shuffle = True, random_state = 7)
en_train_cleaned, en_dev_cleaned = train_test_split(english_cleaned, test_size = 0.2, **split_options)
en_dev_cleaned, en_test_cleaned = train_test_split(en_dev_cleaned, test_size = 0.5, **split_options)

# TODO: implement the Finnish and Chinese splits (drafts below)
# fin_train, fin_dev = train_test_split(finnish, shuffle = True, test_size = 0.2, random_state = 7)
# fin_dev, fin_test = train_test_split(fin_dev, shuffle = True, test_size = 0.5, random_state = 7)

# ch_train, ch_dev = train_test_split(chinese, shuffle = True, test_size = 0.2, random_state = 7)
# ch_dev, ch_test = train_test_split(ch_dev, shuffle = True, test_size = 0.5, random_state = 7)

# Encoding with pretrained model

In [10]:
# Pretrained 'word2vec-google-news-300' vectors, obtained through gensim's downloader API.
model = api.load('word2vec-google-news-300')

In [11]:
names = ['en_train_cleaned', 'en_dev_cleaned', 'en_test_cleaned']
splits = [en_train_cleaned, en_dev_cleaned, en_test_cleaned]
# key_to_index is the model's vocabulary dict, so each membership test is a
# hash lookup; testing against the index_to_key list scans the whole
# vocabulary for every single token.
vocabulary = model.key_to_index
# Wrapping the list (not the enumerate object) lets tqdm show the total.
for j, df in enumerate(tqdm(splits, desc = 'Overall encoding process')):
    name = 'encoded_' + names[j]
    df.reset_index(drop = True, inplace = True)
    encoded = pd.DataFrame()
    for column in ['reference', 'translation']:
        # Keep only the tokens the embedding model knows. Building real lists
        # here means the "Filtering" bar measures the actual filtering work.
        filtered_doc = [
            [word for word in sent.split(' ') if word in vocabulary]
            for sent in tqdm(df[column], desc = f'Filtering {names[j]} {column}')
        ]
        # One list of word vectors per sentence.
        encoded[column] = [
            [model.get_vector(word) for word in doc]
            for doc in tqdm(filtered_doc, desc = f'Encoding {names[j]} {column}')
        ]

    # Expose the result under its dynamic name (encoded_en_train_cleaned, ...).
    vars()[name] = encoded
    # NOTE(review): to_csv stores every list of vectors as its text
    # representation; confirm downstream readers can parse that back, otherwise
    # a binary format such as DataFrame.to_pickle would be safer.
    encoded.to_csv(name)

Overall encoding process: 0it [00:00, ?it/s]

Filtering en_train_cleaned reference:   0%|          | 0/62150 [00:00<?, ?it/s]

Encoding en_train_cleaned reference:   0%|          | 0/62150 [00:00<?, ?it/s]

Filtering en_train_cleaned translation:   0%|          | 0/62150 [00:00<?, ?it/s]

Encoding en_train_cleaned translation:   0%|          | 0/62150 [00:00<?, ?it/s]

Filtering en_dev_cleaned reference:   0%|          | 0/7769 [00:00<?, ?it/s]

Encoding en_dev_cleaned reference:   0%|          | 0/7769 [00:00<?, ?it/s]

Filtering en_dev_cleaned translation:   0%|          | 0/7769 [00:00<?, ?it/s]

Encoding en_dev_cleaned translation:   0%|          | 0/7769 [00:00<?, ?it/s]

Filtering en_test_cleaned reference:   0%|          | 0/7769 [00:00<?, ?it/s]

Encoding en_test_cleaned reference:   0%|          | 0/7769 [00:00<?, ?it/s]

Filtering en_test_cleaned translation:   0%|          | 0/7769 [00:00<?, ?it/s]

Encoding en_test_cleaned translation:   0%|          | 0/7769 [00:00<?, ?it/s]

# Regression

# Classification