In [1]:
from parlai.tasks.md_gender import convai2
import parlai.tasks.md_gender.utils as md_utils

from copy import deepcopy

import json

In [2]:
data_path = '/home/kuina/anaconda3/envs/parlai/lib/python3.10/site-packages/data/'

opt = {'datapath': data_path, #Donde están los datos
    'datatype':'train',
    'labels_to_use':'all',
    'add_unknown_classes': False, # help='Add unknown classes as neutral' # Esto es para incluir o no el about (???)
    'convai2_use_probably':True,
    'balance':False, # help='Whether to balance the data between classes during training',
    'task':'md_gender:convai2',
    #,'balance_valid':False, #help='Whether to balance the validation data'
    'unknown_temp':1.0 #help='Rate at which to sample examples from the unknown class',
}

In [3]:
convai = convai2.Convai2Teacher(opt)

md_utils.get_data_stats(convai.data,key='labels',lst=True)

[ Building data ... ]
15:57:52 | loading fbdialog data: /home/kuina/anaconda3/envs/parlai/lib/python3.10/site-packages/data/ConvAI2/train_both_original.txt
Missing cnt: 66 / 525772
Totals for self:
	unknown: 86890 (0.6610469937539466)
	female: 22439 (0.17071278044475552)
	male: 22114 (0.1682402258012979)
Totals for partner:
	unknown: 87456 (0.6653530427637835)
	female: 22138 (0.1684228144518917)
	male: 21849 (0.16622414278432476)
Total dataset counts:
PARTNER:female: 22138
PARTNER:male: 21849
PARTNER:unknown: 87456
SELF:female: 22439
SELF:male: 22114
SELF:unknown: 86890


In [4]:
convai.data[0]

{'text': "good choice . i haven't figured out what i want to do yet .",
 'episode_done': True,
 'your_persona': "i still live at home.\ni listen to death metal.\ni like cartoons.\ni'm in college.",
 'partner_persona': 'both my parents are creative.\nmy dad works in the automotive industry.\ni m a student living at home while pursuing my music industry degree.\ni dream of playing music for a living.',
 'id': 'ConvAI2 Gender',
 'labels': ['PARTNER:unknown'],
 'class_type': 'partner'}

In [5]:
data = {}
for item in convai.data:
    text = item['text']
    labels = item['labels'][0]
    class_type = item['class_type']

    if text not in data:

        data[text] = {class_type : [labels]}
    elif text in data and class_type not in data[text]:
        data[text][class_type] = [labels] 
    else:
        data[text][class_type].append(labels)

In [6]:
data

{"good choice . i haven't figured out what i want to do yet .": {'partner': ['PARTNER:unknown',
   'PARTNER:unknown'],
  'self': ['SELF:unknown', 'SELF:unknown']},
 "that's ok , i also love to watch horror shows ; do you ?": {'partner': ['PARTNER:male'],
  'self': ['SELF:unknown']},
 'i like science as well': {'self': ['SELF:unknown', 'SELF:unknown'],
  'partner': ['PARTNER:unknown', 'PARTNER:unknown']},
 'i just crunched up halloween oreos into some ice cream . pretty good .': {'self': ['SELF:unknown',
   'SELF:unknown',
   'SELF:unknown'],
  'partner': ['PARTNER:unknown', 'PARTNER:unknown', 'PARTNER:unknown']},
 "sometimes i do . while i'm asleep": {'partner': ['PARTNER:unknown',
   'PARTNER:unknown'],
  'self': ['SELF:male', 'SELF:male']},
 'haha ! just do not let her hear you say that !': {'self': ['SELF:male'],
  'partner': ['PARTNER:male']},
 'ouch , stay safe ! hows the traffic there ?': {'self': ['SELF:unknown',
   'SELF:unknown',
   'SELF:unknown'],
  'partner': ['PARTNER:unkn

In [7]:
#Opción A: Dejo unknown con su probabilidad
A_data = deepcopy(data)
for item in A_data:
    partner = A_data[item]['partner']
    your = A_data[item]['self']
    partner_labels = {label : partner.count(label)/len(partner) for label in set(partner)}
    your_labels = {label : your.count(label)/len(your) for label in set(your)}

    A_data[item]['partner'] = partner_labels
    A_data[item]['self'] = your_labels

In [8]:
#Opción B: Reemplazo unknown por male y female y calculo probabilidades

from itertools import chain


B_data = deepcopy(data)
for item in B_data:
    partner = list(chain.from_iterable(['PARTNER:male','PARTNER:female'] if x == 'PARTNER:unknown' else [x] for x in B_data[item]['partner']) )
    your = list(chain.from_iterable(['SELF:male','SELF:female'] if x == 'SELF:unknown' else [x] for x in B_data[item]['self']) )

    partner_labels = {label : partner.count(label)/len(partner) for label in set(partner)}
    your_labels = {label : your.count(label)/len(your) for label in set(your)}

    B_data[item]['partner'] = partner_labels
    B_data[item]['self'] = your_labels



In [10]:
# Opción C: No almaceno probabilidades

C_data = deepcopy(data)
for item in C_data:
    C_data[item]['partner'] = list(dict.fromkeys(C_data[item]['partner']))
    C_data[item]['self'] = list(dict.fromkeys(C_data[item]['self']))
    

In [11]:
C_data

{"good choice . i haven't figured out what i want to do yet .": {'partner': ['PARTNER:unknown'],
  'self': ['SELF:unknown']},
 "that's ok , i also love to watch horror shows ; do you ?": {'partner': ['PARTNER:male'],
  'self': ['SELF:unknown']},
 'i like science as well': {'self': ['SELF:unknown'],
  'partner': ['PARTNER:unknown']},
 'i just crunched up halloween oreos into some ice cream . pretty good .': {'self': ['SELF:unknown'],
  'partner': ['PARTNER:unknown']},
 "sometimes i do . while i'm asleep": {'partner': ['PARTNER:unknown'],
  'self': ['SELF:male']},
 'haha ! just do not let her hear you say that !': {'self': ['SELF:male'],
  'partner': ['PARTNER:male']},
 'ouch , stay safe ! hows the traffic there ?': {'self': ['SELF:unknown'],
  'partner': ['PARTNER:unknown']},
 'well , i do work at a party store ! love to party !': {'self': ['SELF:unknown'],
  'partner': ['PARTNER:unknown']},
 "ya they don't like that i believe in any a .": {'self': ['SELF:unknown'],
  'partner': ['PARTN

In [12]:
with open('convai_dataset_with_unk.json','w') as f:
    json.dump(C_data,f)

In [9]:
with open('convai_two_unk.json','w') as f:
    json.dump(A_data,f)

with open('convai_two_NO_unk.json','w') as f:
    json.dump(B_data,f)