In [1]:
import json
import os
from copy import deepcopy
from itertools import chain

from parlai.tasks.md_gender import convai2
import parlai.tasks.md_gender.utils as md_utils

In [4]:
# Root directory holding the ParlAI datasets. Set the PARLAI_DATAPATH
# environment variable to point somewhere else without editing the notebook;
# the fallback is the path this notebook was originally run with.
data_path = os.environ.get(
    'PARLAI_DATAPATH',
    '/home/kuina/anaconda3/envs/parlai/lib/python3.10/site-packages/data/',
)

# Options handed to the teacher. "help:" notes quote the option help texts.
opt = {
    'datapath': data_path,  # where the data lives
    'datatype': 'valid',
    'labels_to_use': 'all',
    # help: 'Add unknown classes as neutral'
    # NOTE(review): presumably decides whether the "about" class is included
    # or not -- the original author was unsure too; TODO confirm.
    'add_unknown_classes': False,
    'convai2_use_probably': True,
    'balance': False,  # help: 'Whether to balance the data between classes during training'
    'task': 'md_gender:convai2',
    'balance_valid': False,  # help: 'Whether to balance the validation data'
    'unknown_temp': 1.0,  # help: 'Rate at which to sample examples from the unknown class'
}

In [5]:
# Build the ConvAI2 gender teacher from the options configured above.
convai = convai2.Convai2Teacher(opt)

# Report label statistics over the teacher's examples, reading each example's
# 'labels' field (lst=True: presumably because that field is a list -- TODO confirm).
md_utils.get_data_stats(convai.data,key='labels',lst=True)

[ Building data ... ]
09:39:19 | loading fbdialog data: /home/kuina/anaconda3/envs/parlai/lib/python3.10/site-packages/data/ConvAI2/valid_both_original.txt
Missing cnt: 198 / 31204
Totals for self:
	unknown: 5485 (0.7031149852583002)
	female: 1364 (0.17484937828483527)
	male: 952 (0.12203563645686451)
Totals for partner:
	unknown: 4808 (0.6163312395846686)
	female: 1725 (0.2211254967311883)
	male: 1268 (0.16254326368414307)
Total dataset counts:
PARTNER:female: 1725
PARTNER:male: 1268
PARTNER:unknown: 4808
SELF:female: 1364
SELF:male: 952
SELF:unknown: 5485


In [6]:
# Peek at the first loaded example to see which fields each item carries.
convai.data[0]

{'text': 'hello what are doing today ?',
 'episode_done': True,
 'your_persona': "horror movies are my favorites.\ni'm a stay at home dad.\nmy father used to work for home depot.\ni spent a decade working in the human services field.\ni have a son who is in junior high school.",
 'partner_persona': "i read twenty books a year.\ni'm a stunt double as my second job.\ni only eat kosher.\ni was raised in a single parent household.",
 'id': 'ConvAI2 Gender',
 'labels': ['SELF:male'],
 'class_type': 'self'}

In [7]:
# Group the labels by utterance text and then by class type:
#   data[text][class_type] -> list with the first label of every matching example.
# Identical utterances share a single entry, so their labels accumulate.
data = {}
for example in convai.data:
    utterance, label, kind = example['text'], example['labels'][0], example['class_type']
    # setdefault creates the missing nesting level on first sight of a text
    # or class type, then the label is appended to the (possibly new) list.
    data.setdefault(utterance, {}).setdefault(kind, []).append(label)

In [8]:
# Inspect the grouped mapping: utterance text -> class type -> list of labels.
data

{'hello what are doing today ?': {'self': ['SELF:male'],
  'partner': ['PARTNER:unknown']},
 'i just got done watching a horror movie': {'self': ['SELF:male'],
  'partner': ['PARTNER:unknown']},
 'wow ! i do love a good horror movie . loving this cooler weather': {'self': ['SELF:male'],
  'partner': ['PARTNER:unknown']},
 'yes ! my son is in junior high and i just started letting him watch them too': {'self': ['SELF:male'],
  'partner': ['PARTNER:unknown']},
 'neat ! ! i used to work in the human services field': {'self': ['SELF:male'],
  'partner': ['PARTNER:unknown']},
 'yes i bet you can get hurt . my wife works and i stay at home': {'self': ['SELF:male'],
  'partner': ['PARTNER:unknown']},
 'i bet she appreciates that very much .': {'self': ['SELF:male'],
  'partner': ['PARTNER:unknown']},
 'my dad was always busy working at home depot': {'self': ['SELF:male'],
  'partner': ['PARTNER:unknown']},
 'hi ! how are you doing tonight ?': {'self': ['SELF:male'],
  'partner': ['PARTNER:unk

In [7]:
# Option A: keep "unknown" as its own class and store, for every utterance and
# class type, the relative frequency of each label.
A_data = deepcopy(data)  # work on a copy so `data` stays usable by other cells
for class_labels in A_data.values():
    for class_type in ('partner', 'self'):
        observed = class_labels[class_type]
        # dict.fromkeys de-duplicates in first-seen order. Iterating a set of
        # strings instead yields an order that can change from one interpreter
        # run to the next, which made the dumped JSON non-reproducible.
        class_labels[class_type] = {
            label: observed.count(label) / len(observed)
            for label in dict.fromkeys(observed)
        }

In [8]:
# Option B: replace every "unknown" label by one "male" plus one "female"
# label, then store the relative frequency of each remaining label.
B_data = deepcopy(data)  # work on a copy so `data` stays usable by other cells

# (class type, label to expand, labels that replace it)
_UNKNOWN_EXPANSIONS = (
    ('partner', 'PARTNER:unknown', ['PARTNER:male', 'PARTNER:female']),
    ('self', 'SELF:unknown', ['SELF:male', 'SELF:female']),
)

for class_labels in B_data.values():
    for class_type, unknown_label, replacements in _UNKNOWN_EXPANSIONS:
        expanded = list(chain.from_iterable(
            replacements if label == unknown_label else [label]
            for label in class_labels[class_type]
        ))
        # dict.fromkeys de-duplicates in first-seen order. Iterating a set of
        # strings instead yields an order that can change from one interpreter
        # run to the next, which made the dumped JSON non-reproducible.
        class_labels[class_type] = {
            label: expanded.count(label) / len(expanded)
            for label in dict.fromkeys(expanded)
        }



In [9]:
# Option C: store no probabilities -- keep only the distinct labels of each
# class type, in the order they were first seen.
C_data = deepcopy(data)
for class_labels in C_data.values():
    for class_type in ('partner', 'self'):
        distinct = dict.fromkeys(class_labels[class_type])
        class_labels[class_type] = list(distinct)
    

In [10]:
# Inspect Option C's result: each class type now holds its distinct labels.
C_data

{'hello what are doing today ?': {'self': ['SELF:male'],
  'partner': ['PARTNER:unknown']},
 'i just got done watching a horror movie': {'self': ['SELF:male'],
  'partner': ['PARTNER:unknown']},
 'wow ! i do love a good horror movie . loving this cooler weather': {'self': ['SELF:male'],
  'partner': ['PARTNER:unknown']},
 'yes ! my son is in junior high and i just started letting him watch them too': {'self': ['SELF:male'],
  'partner': ['PARTNER:unknown']},
 'neat ! ! i used to work in the human services field': {'self': ['SELF:male'],
  'partner': ['PARTNER:unknown']},
 'yes i bet you can get hurt . my wife works and i stay at home': {'self': ['SELF:male'],
  'partner': ['PARTNER:unknown']},
 'i bet she appreciates that very much .': {'self': ['SELF:male'],
  'partner': ['PARTNER:unknown']},
 'my dad was always busy working at home depot': {'self': ['SELF:male'],
  'partner': ['PARTNER:unknown']},
 'hi ! how are you doing tonight ?': {'self': ['SELF:male'],
  'partner': ['PARTNER:unk

In [11]:
# Persist the Option C mapping (distinct labels, "unknown" kept) as JSON.
with open('convai_dataset_with_unk_VAL.json', mode='w') as out_file:
    json.dump(C_data, out_file)

In [9]:
# Persist the two probability variants as JSON: Option A keeps "unknown",
# Option B has it replaced by male + female.
for out_name, payload in (
    ('convai_two_unk.json', A_data),
    ('convai_two_NO_unk.json', B_data),
):
    with open(out_name, mode='w') as out_file:
        json.dump(payload, out_file)