In [2]:
import sys
import pandas as pd

In [3]:
# Show full cell contents when rendering DataFrames: a max column width of
# None disables truncation, so long sentences are not cut off with "...".
pd.options.display.max_colwidth = None

In [4]:
# Make the parent directory importable from this notebook by adding it to
# the module search path (position 1, i.e. right after the first entry).
parent_dir = '../'
sys.path.insert(1, parent_dir)

In [6]:
# Load the raw tab-separated dataset, using its first column as the index,
# and discard the 'lenght_diff' column, which no later cell uses.
# NOTE(review): the misspelling 'lenght_diff' presumably matches the header
# in the TSV file -- do not "fix" it here without checking the data.
raw_path = '../data/raw/filtered.tsv'
df = pd.read_csv(raw_path, sep='\t', index_col=0).drop(columns='lenght_diff')
df.head(30)

Unnamed: 0,reference,translation,similarity,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, that explains the high level of neurotransmitters.","if Alkar floods her with her mental waste, it would explain the high levels of neurotransmitter.",0.785171,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.009402,0.999348
5,I'm not gonna have a child... ...with the same genetic disorder as me who's gonna die. L...,I'm not going to breed kids with a genetic disorder that makes them die.,0.703185,0.950956,0.035846
6,"They're all laughing at us, so we'll kick your ass.",they're laughing at us. We'll show you.,0.618866,0.999492,0.000131
7,Maine was very short on black people back then.,there wasn't much black in Maine then.,0.720482,0.96368,0.14871
8,"Briggs, what the hell's happening?","Briggs, what the hell is going on?",0.920373,0.159096,0.841071
9,"Another one simply had no clue what to do, so whenever he met my brother he'd beat the crap out of him, and then say:","another simply didn't know what to do, so whenever he met my brother, he nearly beat the shit out of him.",0.87754,0.055371,0.930472


In [7]:
# Summary statistics of the numeric columns of the freshly loaded frame.
df.describe()

Unnamed: 0,similarity,ref_tox,trn_tox
count,577777.0,577777.0,577777.0
mean,0.758469,0.541372,0.43449
std,0.092695,0.457571,0.458904
min,0.600001,3.3e-05,3.3e-05
25%,0.681105,0.012171,0.000707
50%,0.754439,0.806795,0.085133
75%,0.831244,0.990469,0.973739
max,0.95,0.999724,0.99973


In [8]:
# Orient every pair so that `reference` is the more toxic side.
# Rows where ref_tox < trn_tox have the two sentences (and their toxicity
# scores) the wrong way round and need to be swapped.
needs_swap = df['ref_tox'] < df['trn_tox']

# Work on a copy: a plain `df_proper = df` would only alias the raw frame,
# so the swap would silently rewrite `df` and make the outputs of the
# earlier cells stale.
df_proper = df.copy()

# Swap explicitly. `.to_numpy()` drops the labels so that values are
# assigned by position (pandas would otherwise re-align on column names and
# undo the swap). Text and score columns are swapped separately so the
# score columns keep their float dtype.
df_proper.loc[needs_swap, ['reference', 'translation']] = (
    df.loc[needs_swap, ['translation', 'reference']].to_numpy()
)
df_proper.loc[needs_swap, ['ref_tox', 'trn_tox']] = (
    df.loc[needs_swap, ['trn_tox', 'ref_tox']].to_numpy()
)

# Kept for compatibility with the original cell: the rows that were swapped,
# already in their corrected orientation.
df_mixed_up = df_proper.loc[needs_swap]

In [9]:
# Sanity check: number of rows where the reference is the more toxic side.
# Uses the vectorized Series.sum() instead of the builtin sum(), which would
# iterate over every element in Python.
(df_proper['ref_tox'] > df_proper['trn_tox']).sum()

577777

In [10]:
# Add whitespace-delimited word counts for both sides of each pair.
def _count_words(text):
    """Return the number of whitespace-separated chunks in `text`."""
    return len(text.split())


for text_col, length_col in (
    ('reference', 'reference_length'),
    ('translation', 'translation_length'),
):
    df_proper[length_col] = df_proper[text_col].apply(_count_words)

In [11]:
# Summary statistics after the swap, now including the two word-count columns.
df_proper.describe()

Unnamed: 0,similarity,ref_tox,trn_tox,reference_length,translation_length
count,577777.0,577777.0,577777.0,577777.0,577777.0
mean,0.758469,0.94026,0.035601,10.112026,9.851858
std,0.092695,0.100831,0.079399,6.988392,6.990752
min,0.600001,0.500139,3.3e-05,1.0,1.0
25%,0.681105,0.940145,0.000164,5.0,5.0
50%,0.754439,0.983842,0.003456,8.0,8.0
75%,0.831244,0.997519,0.027242,13.0,13.0
max,0.95,0.99973,0.499494,253.0,179.0


In [12]:
# Keep only pairs with a very toxic reference (> 0.99), a very low toxicity
# translation (< 0.01), similarity above 0.7, and at most 60 words per side.
toxic_reference = df_proper['ref_tox'] > 0.99
neutral_translation = df_proper['trn_tox'] < 0.01
close_meaning = df_proper['similarity'] > 0.7
short_enough = (
    (df_proper['reference_length'] <= 60)
    & (df_proper['translation_length'] <= 60)
)

df_similar = df_proper[
    toxic_reference & neutral_translation & close_meaning & short_enough
]
df_similar.head(10)

Unnamed: 0,reference,translation,similarity,ref_tox,trn_tox,reference_length,translation_length
4,I have orders to kill her.,I've got orders to put her down.,0.726639,0.999348,0.009402,6,7
22,"Real life starts the first time you fuck, kid.","boy, real life starts up first.",0.866697,0.998222,0.000114,9,6
25,"Shit, this one I can't even pronounce.","gosh, I can't even pronounce this.",0.777253,0.997452,0.00012,7,6
29,"Hey, leave the poor bastard alone!",leave the poor man alone!,0.857554,0.999382,0.000578,6,5
41,It told you this was a waste of my fucking time.,I told you this was a waste of my time.,0.904062,0.995877,0.000479,11,10
42,she was a killer.,It was from the killer.,0.836845,0.996774,0.001205,4,5
43,"I swear to God, the best thing I ever did in my life was save that little son of a bitch","I swear to God, the best thing I've ever done in my life was to save this little Mutt,",0.932305,0.999071,0.0009,21,19
46,"'Shut up, you two, 'said Granny.","'Be quiet, you two,' said Granny.",0.746109,0.999243,0.001151,6,6
71,I don't have to do shit.,I don't have to do anything.,0.806763,0.995474,4.7e-05,6,6
72,"God damn, this is gonna be a long night.","dude, it's gonna be a long night.",0.747039,0.998138,6.8e-05,9,7


In [13]:
# Summary statistics of the filtered subset.
df_similar.describe()

Unnamed: 0,similarity,ref_tox,trn_tox,reference_length,translation_length
count,106986.0,106986.0,106986.0,106986.0,106986.0
mean,0.795648,0.997194,0.001422,9.176042,8.481063
std,0.064305,0.0026,0.002305,6.027564,5.8707
min,0.700001,0.990001,3.4e-05,1.0,1.0
25%,0.741094,0.995753,6.8e-05,5.0,4.0
50%,0.786845,0.998248,0.000228,8.0,7.0
75%,0.84244,0.999274,0.001688,12.0,11.0
max,0.95,0.999724,0.01,60.0,60.0


In [34]:
# Continue with the filtered subset under the name `df` (this rebinds the
# name that previously held the raw frame).
# Take an independent copy: `df_similar` is a boolean-mask selection of
# `df_proper`, and the following cells add columns to `df`. Without the copy
# those writes raise SettingWithCopyWarning and also modify `df_similar`.
df = df_similar.copy()

In [35]:
# Quick look at the first rows of the working frame.
df.head()

Unnamed: 0,reference,translation,similarity,ref_tox,trn_tox,reference_length,translation_length
4,I have orders to kill her.,I've got orders to put her down.,0.726639,0.999348,0.009402,6,7
22,"Real life starts the first time you fuck, kid.","boy, real life starts up first.",0.866697,0.998222,0.000114,9,6
25,"Shit, this one I can't even pronounce.","gosh, I can't even pronounce this.",0.777253,0.997452,0.00012,7,6
29,"Hey, leave the poor bastard alone!",leave the poor man alone!,0.857554,0.999382,0.000578,6,5
41,It told you this was a waste of my fucking time.,I told you this was a waste of my time.,0.904062,0.995877,0.000479,11,10


In [36]:
from transformers import RobertaTokenizer

# Load the tokenizer that ships with the RoBERTa toxicity classifier
# checkpoint; it is used below to turn sentences into token ids.
# Downloaded files are cached in the local cache directory given here.
TOXICITY_MODEL_NAME = 'SkolkovoInstitute/roberta_toxicity_classifier'
TOKENIZER_CACHE_DIR = "../.cache/tokenizers/roberta_toxicity_classifier"

tokenizer = RobertaTokenizer.from_pretrained(
    TOXICITY_MODEL_NAME,
    cache_dir=TOKENIZER_CACHE_DIR,
)

In [38]:
# Encode both sides of every pair into token ids (special tokens included).
def _encode(text):
    """Return the token ids of `text`, including the special tokens."""
    return tokenizer.encode(text, add_special_tokens=True)


for text_col, token_col in (
    ('reference', 'tokenized_reference'),
    ('translation', 'tokenized_translation'),
):
    df.loc[:, token_col] = df.loc[:, text_col].apply(_encode)

In [40]:
# Store the number of token ids produced for each side of the pair.
for token_col, length_col in (
    ('tokenized_reference', 'tokenized_ref_len'),
    ('tokenized_translation', 'tokenized_trn_len'),
):
    df.loc[:, length_col] = df[token_col].apply(len)

In [41]:
# Summary statistics, now including the tokenized-length columns.
df.describe()

Unnamed: 0,similarity,ref_tox,trn_tox,reference_length,translation_length,tokenized_ref_len,tokenized_trn_len
count,106986.0,106986.0,106986.0,106986.0,106986.0,106986.0,106986.0
mean,0.795648,0.997194,0.001422,9.176042,8.481063,14.177247,13.271204
std,0.064305,0.0026,0.002305,6.027564,5.8707,7.250894,7.084342
min,0.700001,0.990001,3.4e-05,1.0,1.0,4.0,3.0
25%,0.741094,0.995753,6.8e-05,5.0,4.0,9.0,8.0
50%,0.786845,0.998248,0.000228,8.0,7.0,12.0,11.0
75%,0.84244,0.999274,0.001688,12.0,11.0,17.0,16.0
max,0.95,0.999724,0.01,60.0,60.0,78.0,202.0


In [42]:
# Inspect the rows whose tokenized translation is longer than 75 tokens.
df[df['tokenized_trn_len'] > 75]

Unnamed: 0,reference,translation,similarity,ref_tox,trn_tox,reference_length,translation_length,tokenized_reference,tokenized_translation,tokenized_ref_len,tokenized_trn_len
357856,"others were, though, noting that I don't have any money - I vowed not to make me appear so exposed; the gorillas took to their heads that they would force me, and they started to fuck me, my Ulysse Méroua, a man made for the image of God!","It was a different matter when, noticing I did not indulge in these frolics—I had sworn that nothing would induce me to make such an exhibition of myself—the gorillas took it into their heads to compel me by force and to belabor me with their pikes—me, Ulysse Merou, a man created in the image of God!",0.705638,0.99326,0.002857,48,56,"[0, 7443, 29, 58, 6, 600, 6, 5196, 14, 38, 218, 75, 33, 143, 418, 111, 38, 7588, 45, 7, 146, 162, 2082, 98, 4924, 131, 5, 40365, 19485, 362, 7, 49, 3885, 14, 51, 74, 1370, 162, 6, 8, 51, 554, 7, 26536, 162, 6, 127, 121, 32142, 1090, 26617, 8508, 102, 6, 10, 313, 156, 13, 5, 2274, 9, 1840, 328, 2]","[0, 243, 21, 10, 430, 948, 77, 6, 27515, 38, 222, 45, 27707, 11, 209, 21016, 5895, 29, 578, 100, 56, 11370, 14, 1085, 74, 28944, 162, 7, 146, 215, 41, 6318, 9, 2185, 578, 627, 40365, 19485, 362, 24, 88, 49, 3885, 7, 28319, 162, 30, 1370, 8, 7, 12138, 15313, 162, 19, 49, 181, 13349, 578, 1794, 6, 121, 32142, 1090, 4213, 1438, 6, 10, 313, 1412, 11, 5, 2274, 9, 1840, 328, 2]",64,76
506927,"Well, then you must be as blind as Anne Frank. 'Cause what's the point in having an Internet connection if you're not using it to look at weird, fucked-up pictures of dirty sex you'll never have yourself?","well, you gotta be blind like Anne Frank, 'cause what's a net for, not looking for pictures of a freak-out-a-a-pop-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-",0.714508,0.998825,0.004467,37,20,"[0, 8346, 6, 172, 47, 531, 28, 25, 7709, 25, 7896, 3848, 4, 128, 43326, 99, 18, 5, 477, 11, 519, 41, 3742, 2748, 114, 47, 214, 45, 634, 24, 7, 356, 23, 7735, 6, 42647, 12, 658, 3493, 9, 11216, 2099, 47, 581, 393, 33, 2512, 116, 2]","[0, 3056, 6, 47, 16112, 28, 7709, 101, 7896, 3848, 6, 128, 27037, 99, 18, 10, 1161, 13, 6, 45, 546, 13, 3493, 9, 10, 21905, 12, 995, 12, 102, 12, 102, 12, 15076, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, 12, 102, ...]",49,202
541183,"an engine in front, an ass-drive and a big laughing piece of meat in the middle.","前置引擎 后轮驱动 Engine at the front, drive to the rear 中间坐着一坨笑得非常开心的肥肉 and a big, smiling piece of meat in the middle.",0.79376,0.99338,0.000727,16,21,"[0, 260, 3819, 11, 760, 6, 41, 8446, 12, 19306, 8, 10, 380, 11339, 2125, 9, 4884, 11, 5, 1692, 4, 2]","[0, 49075, 8384, 36714, 10809, 2840, 48558, 15722, 37127, 9085, 12736, 47111, 16948, 12736, 36484, 10809, 2840, 41907, 15375, 15389, 47645, 11423, 20008, 23, 5, 760, 6, 1305, 7, 5, 5081, 1437, 47643, 49117, 20024, 42393, 46, 16948, 36714, 46, 7471, 48105, 42393, 46, 11423, 36714, 11582, 3602, 48412, 6800, 41907, 46, 17772, 47878, 18537, 48558, 7471, 48765, 862, 44574, 36484, 9264, 8210, 36484, 9264, 23171, 8, 10, 380, 6, 12382, 2125, 9, 4884, 11, 5, 1692, 4, 2]",22,79


In [52]:
# Inspect the rows whose tokenized reference is longer than 75 tokens.
# NOTE(review): the next cell caps only the tokenized translation length at 75;
# no cell applies a fixed cap to tokenized_ref_len -- confirm this is intended.
df[df['tokenized_ref_len'] > 75]

Unnamed: 0,reference,translation,similarity,ref_tox,trn_tox,reference_length,translation_length,tokenized_reference,tokenized_translation,tokenized_ref_len,tokenized_trn_len
130611,"The poor bastards fryin' on the electric fence, the proximity mines poppin' under 'em, the microwave sentinels openin' up with the remote-control machine-gun nests, and the fire-control system swiveling the guns and flamethrowers around as long as anything was quiverin' within a mile of the place.","they were flitted in electrical barricades, unlined, with electronically controlled mines under their feet, the short-range signals opened fire to remote-controlled machine-gun nests, and the counteroffensive system rotated guns and flamethrowers after all the two kilns were a long way off.",0.741572,0.995635,6.4e-05,46,41,"[0, 133, 2129, 25753, 5954, 25950, 179, 108, 15, 5, 3459, 8146, 6, 5, 15854, 12321, 4202, 3807, 179, 108, 223, 128, 991, 6, 5, 28562, 1051, 179, 2507, 490, 179, 108, 62, 19, 5, 6063, 12, 17665, 3563, 12, 8215, 37537, 6, 8, 5, 668, 12, 17665, 467, 3514, 2088, 1527, 5, 5013, 8, 2342, 424, 4774, 4610, 268, 198, 25, 251, 25, 932, 21, 2677, 8538, 179, 108, 624, 10, 7245, 9, 5, 317, 4, 2]","[0, 10010, 58, 2342, 16430, 11, 8980, 19007, 4216, 6, 35237, 6158, 6, 19, 30319, 4875, 12321, 223, 49, 1730, 6, 5, 765, 12, 9435, 8724, 1357, 668, 7, 6063, 12, 9947, 3563, 12, 8215, 37537, 6, 8, 5, 3231, 34361, 467, 39187, 5013, 8, 2342, 424, 4774, 4610, 268, 71, 70, 5, 80, 12868, 6852, 58, 10, 251, 169, 160, 4, 2]",78,63
233439,"As well you should, shit bag, 'cause right now I'm heading down to Tulsa to see a pal of his, and you, next time you're someplace where our federal friends are listening, mention that Givens came to see you and that and you believe he's in bed with a guy from in Harlan... a guy named Boyd Crowder.","you should, you bum, 'cause right now I'm going to Tulsa for his friend, and you, when the Feds next time listen to you, you mention that Givens came to see you and you think he got tangled up with a guy in Harlan... with a guy named Boyd Crowder.",0.899491,0.993228,0.005255,58,50,"[0, 1620, 157, 47, 197, 6, 15328, 3298, 6, 128, 27037, 235, 122, 38, 437, 3393, 159, 7, 18532, 7, 192, 10, 8750, 9, 39, 6, 8, 47, 6, 220, 86, 47, 214, 103, 6406, 147, 84, 752, 964, 32, 6288, 6, 4521, 14, 272, 1879, 1290, 376, 7, 192, 47, 8, 14, 8, 47, 679, 37, 18, 11, 3267, 19, 10, 2173, 31, 11, 2482, 6847, 734, 10, 2173, 1440, 14449, 14088, 3624, 4, 2]","[0, 6968, 197, 6, 47, 29673, 6, 128, 27037, 235, 122, 38, 437, 164, 7, 18532, 13, 39, 1441, 6, 8, 47, 6, 77, 5, 274, 12080, 220, 86, 4161, 7, 47, 6, 47, 4521, 14, 272, 1879, 1290, 376, 7, 192, 47, 8, 47, 206, 37, 300, 31659, 62, 19, 10, 2173, 11, 2482, 6847, 734, 19, 10, 2173, 1440, 14449, 14088, 3624, 4, 2]",76,66
258009,"As well you should, shit bag, 'cause right now I'm heading down to Tulsa to see a pal of his, and you, next time you're someplace where our federal friends are listening, mention that Givens came to see you and that and you believe he's in bed with a guy from in Harlan... a guy named Boyd Crowder.","you should, you bum, 'cause right now I'm going to Tulsa for his friend, and you, when the Feds next time listen to you, you mention that Givens came to see you, and you think he got tangled up with a guy in Harlan...",0.854594,0.993228,0.004042,58,44,"[0, 1620, 157, 47, 197, 6, 15328, 3298, 6, 128, 27037, 235, 122, 38, 437, 3393, 159, 7, 18532, 7, 192, 10, 8750, 9, 39, 6, 8, 47, 6, 220, 86, 47, 214, 103, 6406, 147, 84, 752, 964, 32, 6288, 6, 4521, 14, 272, 1879, 1290, 376, 7, 192, 47, 8, 14, 8, 47, 679, 37, 18, 11, 3267, 19, 10, 2173, 31, 11, 2482, 6847, 734, 10, 2173, 1440, 14449, 14088, 3624, 4, 2]","[0, 6968, 197, 6, 47, 29673, 6, 128, 27037, 235, 122, 38, 437, 164, 7, 18532, 13, 39, 1441, 6, 8, 47, 6, 77, 5, 274, 12080, 220, 86, 4161, 7, 47, 6, 47, 4521, 14, 272, 1879, 1290, 376, 7, 192, 47, 6, 8, 47, 206, 37, 300, 31659, 62, 19, 10, 2173, 11, 2482, 6847, 734, 2]",76,59


In [43]:
# Drop samples whose tokenized translation is longer than the allowed maximum.
MAX_TOKENIZED_TRN_LEN = 75
df = df.loc[df['tokenized_trn_len'].le(MAX_TOKENIZED_TRN_LEN)]

In [44]:
# Summary statistics after removing rows with over-long tokenized translations.
df.describe()

Unnamed: 0,similarity,ref_tox,trn_tox,reference_length,translation_length,tokenized_ref_len,tokenized_trn_len
count,106983.0,106983.0,106983.0,106983.0,106983.0,106983.0,106983.0
mean,0.795649,0.997194,0.001422,9.175355,8.480394,14.176383,13.268239
std,0.064304,0.0026,0.002305,6.025843,5.868753,7.248574,7.055437
min,0.700001,0.990001,3.4e-05,1.0,1.0,4.0,3.0
25%,0.741096,0.995753,6.8e-05,5.0,4.0,9.0,8.0
50%,0.786846,0.998248,0.000228,8.0,7.0,12.0,11.0
75%,0.842447,0.999274,0.001688,12.0,11.0,17.0,16.0
max,0.95,0.999724,0.01,60.0,60.0,78.0,73.0


In [73]:
# Drop samples where a text of more than 20 words tokenizes into more than
# 1.2x as many tokens as it has words. The rule is applied to the reference
# first and then, on the remaining rows, to the translation.
def _drop_token_heavy(frame, token_len_col, word_len_col):
    """Return `frame` without rows that are both long and token-heavy."""
    token_heavy = frame[token_len_col] > (frame[word_len_col] * 1.2)
    long_text = frame[word_len_col] > 20
    return frame[~(token_heavy & long_text)]


df = _drop_token_heavy(df, 'tokenized_ref_len', 'reference_length')
df = _drop_token_heavy(df, 'tokenized_trn_len', 'translation_length')

In [74]:
# Summary statistics after the token/word ratio filter.
df.describe()

Unnamed: 0,similarity,ref_tox,trn_tox,reference_length,translation_length,tokenized_ref_len,tokenized_trn_len
count,100731.0,100731.0,100731.0,100731.0,100731.0,100731.0,100731.0
mean,0.796296,0.997246,0.001402,8.179706,7.517577,12.977554,12.112815
std,0.064687,0.002585,0.002291,4.391377,4.275248,5.233505,5.104643
min,0.700001,0.990001,3.4e-05,1.0,1.0,4.0,3.0
25%,0.74136,0.995869,6.7e-05,5.0,4.0,9.0,8.0
50%,0.787434,0.998313,0.000219,7.0,6.0,12.0,11.0
75%,0.843634,0.999295,0.00164,11.0,10.0,16.0,15.0
max,0.95,0.999724,0.01,55.0,59.0,66.0,70.0


In [77]:
# Remove the helper columns that were only needed for filtering.
helper_columns = [
    'reference_length',
    'translation_length',
    'tokenized_ref_len',
    'tokenized_trn_len',
    'tokenized_reference',
    'tokenized_translation',
]
df = df.drop(columns=helper_columns)

In [78]:
# Write the preprocessed dataset (index included) to the internal data folder.
output_path = '../data/internal/preprocessed.csv'
df.to_csv(output_path)