# Merging each sentence with its judgements from all the assessors

In [1]:
import pandas as pd

# List of file names
files = [
    'assignments_from_pool_37939772__27-02-2023.tsv',
    'assignments_from_pool_37806285__16-03-2023.tsv'
]

# Read and concatenate all files into a single DataFrame
all_data = pd.concat([pd.read_csv(file, sep='\t') for file in files])



In [2]:
all_data.head(1)

Unnamed: 0,INPUT:text,INPUT:text1,INPUT:text2,OUTPUT:category,GOLDEN:category,HINT:text,HINT:default_language,ASSIGNMENT:link,ASSIGNMENT:task_id,ASSIGNMENT:assignment_id,ASSIGNMENT:worker_id,ASSIGNMENT:status,ASSIGNMENT:started
0,"На суде он свою вину отрицал , факты у обвинен...",,,[3],[7],,,https://toloka.yandex.ru/task/37939772/000242e...,000242ea3c--63fc6ce52666752d794a07b6,000242ea3c--63fc72235628fb35225214d1,3f8c1eda2e28803022f75dfd249567e8,APPROVED,2023-02-27T09:04:35.867


In [3]:
all_data.shape

(23140, 13)

In [4]:
all_data.dropna(inplace = True, axis=1)
all_data.columns = 'text cat link task_id assignment_id worker_id status started'.split()
all_data['judgement'] = all_data.cat.apply(lambda x: int(x[1]))
all_data = all_data.drop(all_data.columns[1], axis=1)
all_data.head(1)

Unnamed: 0,text,link,task_id,assignment_id,worker_id,status,started,judgement
0,"На суде он свою вину отрицал , факты у обвинен...",https://toloka.yandex.ru/task/37939772/000242e...,000242ea3c--63fc6ce52666752d794a07b6,000242ea3c--63fc72235628fb35225214d1,3f8c1eda2e28803022f75dfd249567e8,APPROVED,2023-02-27T09:04:35.867,3


In [5]:

# Group the data by 'INPUT:text' and aggregate the 'OUTPUT:category' values into a list
grouped_data = all_data.groupby('text')['judgement'].apply(list).reset_index()


In [6]:

# Find the length of the longest list in the 'judgement' column
max_length = grouped_data['judgement'].apply(len).max()



In [7]:
# Divide the 'judgement' column into multiple columns and fill NaNs with an empty string
score_columns = pd.DataFrame(grouped_data['judgement'].to_list(), columns=[f'judgement_{i}' for i in range(1, max_length + 1)])
score_columns.fillna(0, inplace=True)

In [8]:
# Concatenate the original 'INPUT:text' column with the new score columns
final_data = pd.concat([grouped_data['text'], score_columns], axis=1)


In [9]:
final_data.head()

Unnamed: 0,text,judgement_1,judgement_2,judgement_3,judgement_4,judgement_5,judgement_6,judgement_7,judgement_8,judgement_9,...,judgement_300,judgement_301,judgement_302,judgement_303,judgement_304,judgement_305,judgement_306,judgement_307,judgement_308,judgement_309
0,Автомобили появятся в салонах официальных диле...,2,3,1,1,2,2,2,5,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Автор сам бывший шахтер и очень хорошо описыва...,3,5,6,7,5,6,4,4,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Агент МИ6 работал паспортистом в посольстве Ве...,3,2,2,7,2,3,2,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Администрация города обратилась в прокуратуру ...,1,2,4,1,5,1,2,4,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Академик Аганбегян считает : в первую очередь ...,1,3,2,6,5,4,6,4,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
ru_ds = final_data.loc[:, 'text' : 'judgement_10']

In [21]:
ru_ds.head()

Unnamed: 0,text,judgement_1,judgement_2,judgement_3,judgement_4,judgement_5,judgement_6,judgement_7,judgement_8,judgement_9,judgement_10
0,Автомобили появятся в салонах официальных диле...,2,3,1,1,2,2,2,5,2,4
1,Автор сам бывший шахтер и очень хорошо описыва...,3,5,6,7,5,6,4,4,5,5
2,Агент МИ6 работал паспортистом в посольстве Ве...,3,2,2,7,2,3,2,3,3,1
3,Администрация города обратилась в прокуратуру ...,1,2,4,1,5,1,2,4,2,2
4,Академик Аганбегян считает : в первую очередь ...,1,3,2,6,5,4,6,4,3,4


In [22]:
ru_ds.shape

(1200, 11)

In [24]:
ru_ds.to_csv('complexity_ds_ru.csv', index = False)

# Merging [Russian, English, and Italian] datasets

In [25]:
from nltk.tokenize import word_tokenize
import pandas as pd


In [26]:
%%capture
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [27]:

# Function to calculate the number of tokens in a sentence
def count_tokens(sentence):
    return len(sentence.split())

# Function to calculate the total frequency of words in a sentence
def total_frequency(sentence, frequency_list):
    return sum(frequency_list.get(word, 0) for word in sentence.split())

# Function to calculate the average token length in a sentence
def average_token_length(sentence):
    words = sentence.split()
    return sum(len(word) for word in words) / len(words)




In [45]:
# Read the files into DataFrames
english_data = pd.read_csv("complexity_ds_en.csv")
italian_data = pd.read_csv("complexity_ds_it.csv")
russian_data = pd.read_csv("complexity_ds_ru.csv")


In [46]:
russian_data.head(1)

Unnamed: 0,text,judgement_1,judgement_2,judgement_3,judgement_4,judgement_5,judgement_6,judgement_7,judgement_8,judgement_9,judgement_10
0,Автомобили появятся в салонах официальных диле...,2,3,1,1,2,2,2,5,2,4


In [47]:
russian_data.describe()

Unnamed: 0,judgement_1,judgement_2,judgement_3,judgement_4,judgement_5,judgement_6,judgement_7,judgement_8,judgement_9,judgement_10
count,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0
mean,3.815,3.766667,3.7675,3.85,3.8475,3.839167,3.899167,3.8475,3.849167,3.826667
std,1.596736,1.512987,1.561669,1.564568,1.574688,1.600135,1.618492,1.554432,1.571668,1.583892
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
50%,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
75%,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
max,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0


In [30]:

# Calculate the average judgements for each DataFrame
english_data["avg_judgement"] = english_data.loc[:, "judgement1":"judgement20"].mean(axis=1)
italian_data["avg_judgement"] = italian_data.loc[:, "judgement1":"judgement20"].mean(axis=1)
russian_data["avg_judgement"] = russian_data.loc[:, "judgement_1":"judgement_10"].mean(axis=1)



In [31]:
# Add a column for the language
english_data["language"] = "English"
italian_data["language"] = "Italian"
russian_data["language"] = "Russian"



In [32]:
italian_data = italian_data.rename(columns={'SENTENCE': 'sentence'})
english_data = english_data.rename(columns={'SENTENCE': 'sentence'})
russian_data = russian_data.rename(columns={'text': 'sentence'})

In [33]:
english_data = english_data[['sentence', 'avg_judgement', 'language']]
italian_data = italian_data[['sentence', 'avg_judgement', 'language']]
russian_data = russian_data[['sentence', 'avg_judgement', 'language']]

In [34]:
english_data.shape

(1200, 3)

In [35]:
# Compute the total lemma frequencies for each sentence in each dataset
def total_lemma_frequencies(text, freq_list):
    tokens = word_tokenize(text)
    return sum(freq_list.get(token, 0) for token in tokens)

# Load the Russian frequency list
freq_df = pd.read_csv('freqrnc2011.csv', sep='\t')
russian_freq_list = freq_df.set_index('Lemma')['Freq(ipm)'].to_dict()

# Load the English frequency list
english_freq_df = pd.read_excel("SUBTLEX-US_frequency_list.xlsx") 
english_freq_list = dict(zip(english_freq_df['Word'], english_freq_df['Lg10WF']))

# Load the Italian frequency list
italian_freq_list = {}

with open("lemma-WITHOUTnumberssymbols-frequencies-paisa.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

for line in lines:
    if not line.startswith("#") and ',' in line:
        # Split only at the first comma
        lemma, freq = line.strip().split(',', 1)
        italian_freq_list[lemma] = int(freq)

russian_data['total_lemma_freq'] = russian_data['sentence'].apply(lambda x: total_lemma_frequencies(x, russian_freq_list))
english_data['total_lemma_freq'] = english_data['sentence'].apply(lambda x: total_lemma_frequencies(x, english_freq_list))
italian_data['total_lemma_freq'] = italian_data['sentence'].apply(lambda x: total_lemma_frequencies(x, italian_freq_list))


In [36]:
# Merge the DataFrames
merged_data = pd.concat([english_data, italian_data, russian_data], ignore_index=True)


In [37]:
merged_data.head()

Unnamed: 0,sentence,avg_judgement,language,total_lemma_freq
0,Amcast Industrial Corp. said it plans to repur...,2.6,English,63.017062
1,GDP is the total value of a nation's output of...,1.55,English,52.255728
2,"Town & Country Ford in Charlotte, N.C., still ...",1.95,English,41.779319
3,"A couple in Rockford, Ill., raised $ 12,591 ea...",2.9,English,75.976735
4,Yesterday the company said it had filed a requ...,3.3,English,89.013683


In [38]:
merged_data.shape

(3522, 4)

In [39]:
# Calculate the number of tokens, total frequency, and average token length
merged_data["num_tokens"] = merged_data["sentence"].apply(count_tokens)
merged_data["avg_token_length"] = merged_data["sentence"].apply(average_token_length)



In [40]:
merged_data.head(1)

Unnamed: 0,sentence,avg_judgement,language,total_lemma_freq,num_tokens,avg_token_length
0,Amcast Industrial Corp. said it plans to repur...,2.6,English,63.017062,22,5.227273


In [41]:
merged_data[merged_data['language'] == 'Russian']

Unnamed: 0,sentence,avg_judgement,language,total_lemma_freq,num_tokens,avg_token_length
2322,Автомобили появятся в салонах официальных диле...,2.4,Russian,62748.4,10,5.600000
2323,Автор сам бывший шахтер и очень хорошо описыва...,5.0,Russian,125001.2,30,4.633333
2324,Агент МИ6 работал паспортистом в посольстве Ве...,2.8,Russian,62748.4,10,6.100000
2325,Администрация города обратилась в прокуратуру ...,2.4,Russian,74447.2,15,5.600000
2326,Академик Аганбегян считает : в первую очередь ...,3.8,Russian,99152.8,20,5.950000
...,...,...,...,...,...,...
3517,"Я тут же отозвалась и написала : "" Дорогой Сен...",3.7,Russian,124585.3,25,3.560000
3518,"Я уверена , что когда эта барышня вырастет , т...",3.6,Russian,172802.2,35,4.171429
3519,"Якобы это он написал Алексею Каплеру в "" Киноп...",4.3,Russian,128075.6,25,3.840000
3520,Японский центр в Нижнем Новгороде открылся в п...,1.5,Russian,63014.3,10,5.200000


In [42]:
merged_data.to_csv('merged_datasets.csv', index = False)