In [1]:
from parlai.tasks.md_gender import convai2
import parlai.tasks.md_gender.utils as md_utils

from copy import deepcopy

import json

In [2]:
data_path = '/home/kuina/anaconda3/envs/parlai/lib/python3.10/site-packages/data/'

opt = {'datapath': data_path, #Donde están los datos
    'datatype':'train',
    'labels_to_use':'all',
    'add_unknown_classes': False, # help='Add unknown classes as neutral' # Esto es para incluir o no el about (???)
    'convai2_use_probably':True,
    'balance':False, # help='Whether to balance the data between classes during training',
    'task':'md_gender:convai2',
    #,'balance_valid':False, #help='Whether to balance the validation data'
    'unknown_temp':1.0 #help='Rate at which to sample examples from the unknown class',
}

In [3]:
convai = convai2.Convai2Teacher(opt)

md_utils.get_data_stats(convai.data,key='labels',lst=True)

[ Building data ... ]
16:53:19 | loading fbdialog data: /home/kuina/anaconda3/envs/parlai/lib/python3.10/site-packages/data/ConvAI2/train_both_original.txt
Missing cnt: 52 / 525768
Totals for self:
	unknown: 87069 (0.6624138403250103)
	female: 22036 (0.16764808813012583)
	male: 22337 (0.1699380715448639)
Totals for partner:
	unknown: 86422 (0.6574915171710717)
	female: 22730 (0.17292798344516974)
	male: 22290 (0.1695804993837586)
Total dataset counts:
PARTNER:female: 22730
PARTNER:male: 22290
PARTNER:unknown: 86422
SELF:female: 22036
SELF:male: 22337
SELF:unknown: 87069


In [4]:
convai.data[0]

{'text': "the times . i'm a carnivore , how about you ?",
 'episode_done': True,
 'your_persona': 'my father used to be a butcher.\nmy only friend is a dog.\ni work at a newspaper.\ni am a carnivore.\ni am highly educated.',
 'partner_persona': 'i can beat anyone in chess.\ni have high speed internet.\ni got hired as a security guard.\ni saw a solar eclipse when i was 8 years old.',
 'id': 'ConvAI2 Gender',
 'labels': ['PARTNER:male'],
 'class_type': 'partner'}

In [5]:
data = {}
for item in convai.data:
    text = item['text']
    labels = item['labels'][0]
    class_type = item['class_type']

    if text not in data:

        data[text] = {class_type : [labels]}
    elif text in data and class_type not in data[text]:
        data[text][class_type] = [labels] 
    else:
        data[text][class_type].append(labels)

In [6]:
data

{"the times . i'm a carnivore , how about you ?": {'partner': ['PARTNER:male',
   'PARTNER:male'],
  'self': ['SELF:male', 'SELF:male']},
 'sounds fancy . any cool perks ?': {'self': ['SELF:female'],
  'partner': ['PARTNER:unknown']},
 'that is not embarrassing in this economy': {'self': ['SELF:unknown',
   'SELF:unknown'],
  'partner': ['PARTNER:unknown', 'PARTNER:unknown']},
 'no . what is that ? ?': {'self': ['SELF:female'],
  'partner': ['PARTNER:female']},
 'person i am sorry . do you do anything active or are you too tired fo that ?': {'self': ['SELF:unknown'],
  'partner': ['PARTNER:unknown']},
 'i see i enjoy reading american literature what do you enjoy ?': {'partner': ['PARTNER:unknown',
   'PARTNER:unknown'],
  'self': ['SELF:unknown', 'SELF:unknown']},
 'cool . cities are great , i prefer the beach .': {'partner': ['PARTNER:unknown',
   'PARTNER:unknown'],
  'self': ['SELF:unknown', 'SELF:unknown']},
 'as a man i am sure you are pretty . we all have issues . like my deafnes

In [7]:
#Opción A: Dejo unknown con su probabilidad
A_data = deepcopy(data)
for item in A_data:
    partner = A_data[item]['partner']
    your = A_data[item]['self']
    partner_labels = {label : partner.count(label)/len(partner) for label in set(partner)}
    your_labels = {label : your.count(label)/len(your) for label in set(your)}

    A_data[item]['partner'] = partner_labels
    A_data[item]['self'] = your_labels

In [8]:
#Opción B: Reemplazo unknown por male y female y calculo probabilidades

from itertools import chain


B_data = deepcopy(data)
for item in B_data:
    partner = list(chain.from_iterable(['PARTNER:male','PARTNER:female'] if x == 'PARTNER:unknown' else [x] for x in B_data[item]['partner']) )
    your = list(chain.from_iterable(['SELF:male','SELF:female'] if x == 'SELF:unknown' else [x] for x in B_data[item]['self']) )

    partner_labels = {label : partner.count(label)/len(partner) for label in set(partner)}
    your_labels = {label : your.count(label)/len(your) for label in set(your)}

    B_data[item]['partner'] = partner_labels
    B_data[item]['self'] = your_labels



In [9]:
with open('convai_two_unk.json','w') as f:
    json.dump(A_data,f)

with open('convai_two_NO_unk.json','w') as f:
    json.dump(B_data,f)