# Preparing the data for Roberta Finetuning
- exporting two datasets:
    - fully corrected
    - raw prompts wihtout the ones in german and italian
- both contain original punctuation

In [30]:
import sqlite3
import pandas as pd

conn  = sqlite3.connect('../../giicg.db')
expanded_prompts = pd.read_sql("Select * from expanded_prompts", conn).drop_duplicates(subset=['conversational'])
filtered_prompts = pd.read_sql(
    """
    SELECT
        fp.*,
        ep.language
    FROM
        filtered_prompts fp
    JOIN
        expanded_prompts ep
    ON
        fp.message_id = ep.message_id
    """,
    conn
).drop_duplicates(subset=['conversational'])

expanded_prompts


Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...
748,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code. I want to: Get nodes and edge...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92,en
749,1845,37,user,\n nun möchte ich judgement balancing m...,Now I want to bring judgement balancing into t...,,,Woman (cisgender),29,de
750,1847,37,user,\n ich sehe keine veränderung im Plot. Was ...,I do not see any change in the plot.,,,Woman (cisgender),29,de
751,1849,2,user,\n I am working on the problem of reconstru...,\n I am working on the problem of reconstru...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8,en


## Filter and Clean

In [31]:
from helpers.normalization import remove_newlines

expanded_prompts = expanded_prompts[expanded_prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])].reset_index()
expanded_prompts['conversational']  = expanded_prompts['conversational'].apply(remove_newlines)

filtered_prompts = filtered_prompts[filtered_prompts['language'] == 'en'].reset_index(drop=True)
filtered_prompts = filtered_prompts[filtered_prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])].reset_index()
filtered_prompts['conversational']  = filtered_prompts['conversational'].apply(remove_newlines)

expanded_prompts


Unnamed: 0,index,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en
1,1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en
3,3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en
4,4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...,...
741,748,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code. I want to: Get nodes and edge...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92,en
742,749,1845,37,user,\n nun möchte ich judgement balancing m...,Now I want to bring judgement balancing into t...,,,Woman (cisgender),29,de
743,750,1847,37,user,\n ich sehe keine veränderung im Plot. Was ...,I do not see any change in the plot.,,,Woman (cisgender),29,de
744,751,1849,2,user,\n I am working on the problem of reconstru...,I am working on the problem of reconstruc...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8,en


## Data stats and subsampling of long conversations
- subsampled 50 prompts from user 73, who had over 200

In [32]:
users_per_gender = expanded_prompts.groupby('gender')['user_id'].nunique().reset_index(name='num_users')
users_per_gender

Unnamed: 0,gender,num_users
0,Man (cisgender),15
1,Woman (cisgender),13


In [33]:
messages_per_user = expanded_prompts.groupby('user_id')['message_id'].nunique().reset_index(name='num_messages')
messages_per_user

Unnamed: 0,user_id,num_messages
0,6,9
1,8,2
2,11,11
3,15,3
4,16,25
5,25,4
6,28,22
7,29,2
8,31,5
9,34,66


In [40]:
messages_per_gender = expanded_prompts.groupby('gender')['message_id'].nunique().reset_index(name='num_messages')
messages_per_gender

Unnamed: 0,gender,num_messages
0,Man (cisgender),282
1,Woman (cisgender),254


In [34]:
other_users = expanded_prompts[expanded_prompts['user_id'] != 73]
messages_per_other_users = other_users.groupby('user_id')['message_id'].nunique().reset_index(name='num_messages')
print("Mean message count: ", messages_per_other_users['num_messages'].mean())

Mean message count:  19.14814814814815


In [35]:
def subsample(df, id):
    user_73 = df[df['user_id'] == id]
    other_users = df[df['user_id'] != id]

    user_73_sampled = user_73.sample(n=19, random_state=42)

    prompts = pd.concat([other_users, user_73_sampled], ignore_index=True)

    subsampled_messages_per_user = prompts.groupby('user_id')['message_id'].nunique().reset_index(name='num_messages')
    subsampled_with_gender = subsampled_messages_per_user.merge(
        prompts[['user_id', 'gender']].drop_duplicates(),
        on='user_id',
        how='left'
    )
    print(subsampled_with_gender.groupby(['gender']).sum())

    users_per_gender = prompts.groupby('gender')['user_id'].nunique().reset_index(name='num_users')
    print(users_per_gender)

    return prompts

expanded_prompts = subsample(expanded_prompts, 73)
filtered_prompts = subsample(filtered_prompts, 73)


                   user_id  num_messages
gender                                  
Man (cisgender)        780           282
Woman (cisgender)      706           254
              gender  num_users
0    Man (cisgender)         15
1  Woman (cisgender)         13
                   user_id  num_messages
gender                                  
Man (cisgender)        652           189
Woman (cisgender)      612           186
              gender  num_users
0    Man (cisgender)         13
1  Woman (cisgender)         11


## Create label mapping

In [36]:
import json
def create_label_mapping(df):
    labels = df['gender'].astype('category')
    df['label'] = labels.cat.codes
    label2id = dict(enumerate(labels.cat.categories))
    with open("finetune/label2id.json", "w") as f:
        json.dump(label2id, f)
    return df

expanded_prompts = create_label_mapping(expanded_prompts)
filtered_prompts = create_label_mapping(filtered_prompts)


In [37]:
expanded_prompts

Unnamed: 0,index,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language,label
0,0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en,0
1,1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en,0
2,2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en,0
3,3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en,0
4,4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en,0
...,...,...,...,...,...,...,...,...,...,...,...,...
531,501,1674,87,user,Accuracy: 1.0\n Count: 2\nMetrics for neptune...,these are the results. i to calculate a statis...,,Accuracy: 1.0\n Count: 2\nMetrics for neptune...,Woman (cisgender),73,en,1
532,416,1290,65,user,how are we currently processing non numerical ...,how are we currently processing non numerical ...,"def perform_optics_clustering(file_path, outpu...",,Woman (cisgender),73,en,1
533,425,1314,65,user,what is the reachability score,what is the reachability score,,,Woman (cisgender),73,en,1
534,309,372,21,user,"my features are saved in ""train_features.npy"" ...","my features are saved in ""train_features.npy"" ...",,,Woman (cisgender),73,en,1


In [38]:
filtered_prompts

Unnamed: 0,index,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language,label
0,0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en,0
1,1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en,0
2,2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en,0
3,3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en,0
4,4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en,0
...,...,...,...,...,...,...,...,...,...,...,...,...
370,241,374,21,user,Pass `sample_weight` to AutoGluon.how,Pass `sample_weight` to AutoGluon.how,Pass `sample_weight` to AutoGluon.how,,Woman (cisgender),73,en,1
371,256,444,21,user,i dont understand the label part,i dont understand the label part,,,Woman (cisgender),73,en,1
372,271,500,21,user,"ok, but what kind of information can i use fro...","ok, but what kind of information can i use fro...",,,Woman (cisgender),73,en,1
373,387,1398,65,user,data/docx-2017-04/diufgzadsgf.x;1\n data/do...,this is how the output should be formatted:,import pandas as pd\n\ndef create_clustered_ou...,data/docx-2017-04/diufgzadsgf.x;1\n data/do...,Woman (cisgender),73,en,1


## Save to database

In [39]:
expanded_prompts.to_sql('expanded_roberta_prompts', conn, if_exists='replace', index=False)
#filtered_prompts.to_sql('filtered_roberta_prompts', conn, if_exists='replace', index=False)

conn.close()