In [35]:
import numpy as np

class LexMTurkDataProvider():
    """
    Loads a LexMTurk-style tab-separated file and converts every line into a
    record of the form ``[sentence, complex_word, position, substitution_dict]``.

    Each line is expected to hold the sentence in the first field, the complex
    word in the second field and one substitution candidate in every
    following field.
    """

    # Default location of the data file; pass ``filename`` to the constructor
    # to read from somewhere else.
    _filename = "/home/luis/RCI/multilingual-lexical-simplification/data/lexmturk/lexmturk.txt"

    def __init__(self, filename=None):
        """
        Args:
            filename: Optional path of the data file. When omitted, the
                class-level default ``_filename`` is used.
        """
        if filename is not None:
            self._filename = filename

    def get_position(self, word, sentence):
        """
        Returns the position of the word in the sentence. (Taken from germaneval_data_provider)

        The sentence is split on whitespace. The index of the first token that
        equals ``word`` is returned; if there is no exact match, the index of
        the first token that contains ``word`` as a substring is returned
        (e.g. the word with punctuation attached). ``None`` is returned when
        no token matches at all.
        """
        words = sentence.split()
        try:
            return words.index(word)
        except ValueError:
            # list.index signals "not found" with ValueError only; anything
            # else is a real error and must not be hidden.
            return next((i for i, s in enumerate(words) if word in s), None)

    def process_line(self, line):
        """
        Converts one raw line of the data file into a record.

        Args:
            line: A tab-separated line: sentence, complex word, then any
                number of substitution candidates (repetitions allowed).

        Returns:
            ``[sentence, complex_word, position, substitution_dict]`` where
            ``substitution_dict`` maps a rank (1 = most frequent) to the list
            of candidates sharing that frequency, in order of first
            appearance on the line.

        Raises:
            IndexError: If the line has fewer than two tab-separated fields.
        """
        parts = line.strip().split("\t")

        sentence = parts[0]
        complex_word = parts[1]
        position = self.get_position(complex_word, sentence)

        # How often each candidate occurs on the line.
        count_dict = {}
        for item in parts[2:]:
            count_dict[item] = count_dict.get(item, 0) + 1

        # Candidates that occur equally often share one rank.
        grouped_count_dict = {}
        for item, count in count_dict.items():
            grouped_count_dict.setdefault(count, []).append(item)

        # Highest count first; ranks are numbered from 1.
        ordered_groups = sorted(grouped_count_dict.items(), key=lambda x: x[0], reverse=True)
        substitution_dict = {rank: items for rank, (_, items) in enumerate(ordered_groups, start=1)}

        return [sentence, complex_word, position, substitution_dict]

    def provide_data_as_numpy_array(self):
        """
        Reads the data file and returns all records as a NumPy object array.

        Whitespace-only lines (for instance a trailing empty line) are
        skipped instead of raising ``IndexError`` in ``process_line``.
        """
        with open(self._filename, "r", encoding="utf-8") as file:
            processed_lines = [self.process_line(line) for line in file if line.strip()]

        return np.array(processed_lines, dtype=object)

In [36]:
# Build a provider with its default data path and display the full dataset
# (one row per line of the file) as this cell's output.
provider = LexMTurkDataProvider()
provider.provide_data_as_numpy_array()

array([['"In March 1992 , Linux version 0.95 was the first to be capable of running X. This large version number jump was due to a feeling that a version 1.0 with no major missing pieces was imminent ."',
        'pieces', 34,
        {1: ['parts'], 2: ['bits'], 3: ['components'], 4: ['component', 'sections', 'elements', 'part', 'information', 'items']}],
       ['Much of the water carried by these streams is diverted .',
        'diverted', 9,
        {1: ['redirected'], 2: ['rerouted'], 3: ['changed', 'moved'], 4: ['drawn away', 'turned', 'separated', 'switched', 'split', 'altered'], 5: ['led away', 'sent away', 'veered', 'channeled', 'deflected']}],
       ['"Harry also becomes the worthy possessor of the remaining Deathly Hallows : the Invisibility Cloak and the Resurrection Stone , hence becoming the true Master of Death ."',
        'possessor', 5,
        {1: ['owner'], 2: ['holder'], 3: ['keeper'], 4: ['buyer', 'master', 'teacher']}],
       ...,
       ['"Some features , howev

In [10]:
# Read the raw data file into memory: one string per line, newline included.
_filename = "/home/luis/RCI/multilingual-lexical-simplification/data/lexmturk/lexmturk.txt"
with open(_filename, mode="r", encoding="utf-8") as file:
    lines = list(file)

In [29]:
def get_position(word, sentence):
    """
    Returns the position of the word in the sentence.

    The sentence is split on whitespace. The index of the first token equal
    to ``word`` is returned. If no token matches exactly, the index of the
    first token containing ``word`` as a substring is returned instead (for
    example when punctuation is attached to the word). Returns ``None`` when
    nothing matches.
    """
    words = sentence.split()
    try:
        return words.index(word)
    except ValueError:
        # Only "not found" triggers the substring fallback; any other
        # exception is a genuine error and is allowed to propagate.
        return next((i for i, s in enumerate(words) if word in s), None)

In [32]:
# Step-by-step walkthrough of the record processing on the second line of the file.
parts = lines[1].strip().split("\t")
sentence = parts[0]
word = parts[1]
position = get_position(word, sentence)

print(sentence)
print(word)
print(position)

# Count how often each substitution candidate occurs on the line
count_dict = {}
for item in parts[2:]:
    count_dict[item] = count_dict.get(item, 0) + 1

# Group the candidates that share the same count
grouped_count_dict = {}
for item, count in count_dict.items():
    grouped_count_dict.setdefault(count, []).append(item)

# Sort by counts in descending order and create the final ranked dictionary (rank 1 = most frequent)
ranked_dict = {
    rank: items
    for rank, (_, items) in enumerate(
        sorted(grouped_count_dict.items(), key=lambda x: x[0], reverse=True),
        start=1,
    )
}

ranked_dict

Much of the water carried by these streams is diverted .
diverted
9


{1: ['redirected'],
 2: ['rerouted'],
 3: ['changed', 'moved'],
 4: ['drawn away', 'turned', 'separated', 'switched', 'split', 'altered'],
 5: ['led away', 'sent away', 'veered', 'channeled', 'deflected']}