# Preparing the data for Roberta Finetuning
- exporting two datasets:
    - fully corrected
    - raw prompts wihtout the ones in german and italian
- both contain original punctuation

In [33]:
import sqlite3
import pandas as pd

conn  = sqlite3.connect('../../giicg.db')
expanded_prompts = pd.read_sql("Select * from expanded_prompts", conn).drop_duplicates(subset=['conversational'])
filtered_prompts = pd.read_sql(
    """
    SELECT
        fp.*,
        ep.language
    FROM
        filtered_prompts fp
    JOIN
        expanded_prompts ep
    ON
        fp.message_id = ep.message_id
    """,
    conn
).drop_duplicates(subset=['conversational'])


## Filter and Clean

In [34]:
from helpers.normalization import remove_newlines

expanded_prompts = expanded_prompts[expanded_prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])].reset_index()
expanded_prompts['conversational']  = expanded_prompts['conversational'].apply(remove_newlines)

filtered_prompts = filtered_prompts[filtered_prompts['language'] == 'en'].reset_index(drop=True)
filtered_prompts = filtered_prompts[filtered_prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])].reset_index()
filtered_prompts['conversational']  = filtered_prompts['conversational'].apply(remove_newlines)


## Data stats and subsampling of long conversations
- subsampled 50 prompts from user 73, who had over 200

In [35]:
users_per_gender = all_prompts.groupby('gender')['user_id'].nunique().reset_index(name='num_users')
users_per_gender

Unnamed: 0,gender,num_users
0,Man (cisgender),15
1,Woman (cisgender),12


In [36]:
messages_per_user = all_prompts.groupby('user_id')['message_id'].nunique().reset_index(name='num_messages')
messages_per_user

Unnamed: 0,user_id,num_messages
0,6,9
1,8,2
2,11,11
3,15,3
4,16,25
5,25,4
6,28,22
7,31,5
8,34,66
9,46,5


In [37]:
def subsample(df, id):
    user_73 = df[df['user_id'] == id]
    other_users = df[df['user_id'] != id]

    user_73_sampled = user_73.sample(n=50, random_state=42)

    prompts = pd.concat([other_users, user_73_sampled], ignore_index=True)

    subsampled_messages_per_user = prompts.groupby('user_id')['message_id'].nunique().reset_index(name='num_messages')
    subsampled_with_gender = subsampled_messages_per_user.merge(
        prompts[['user_id', 'gender']].drop_duplicates(),
        on='user_id',
        how='left'
    )
    print(subsampled_with_gender.groupby(['gender']).sum())

    users_per_gender = prompts.groupby('gender')['user_id'].nunique().reset_index(name='num_users')
    print(users_per_gender)

    return prompts

expanded_prompts = subsample(expanded_prompts, 73)
filtered_prompts = subsample(filtered_prompts, 73)


                   user_id  num_messages
gender                                  
Man (cisgender)        780           282
Woman (cisgender)      677           285
              gender  num_users
0    Man (cisgender)         15
1  Woman (cisgender)         12
                   user_id  num_messages
gender                                  
Man (cisgender)        652           189
Woman (cisgender)      612           217
              gender  num_users
0    Man (cisgender)         13
1  Woman (cisgender)         11


## Create label mapping

In [38]:
import json
def create_label_mapping(df):
    labels = df['gender'].astype('category')
    df['label'] = labels.cat.codes
    label2id = dict(enumerate(labels.cat.categories))
    with open("finetune/label2id.json", "w") as f:
        json.dump(label2id, f)
    return df

expanded_prompts = create_label_mapping(expanded_prompts)
filtered_prompts = create_label_mapping(filtered_prompts)


In [39]:
expanded_prompts

Unnamed: 0,index,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language,label
0,0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en,0
1,1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en,0
2,2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en,0
3,3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en,0
4,4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en,0
...,...,...,...,...,...,...,...,...,...,...,...,...
562,391,1234,65,user,can we add peid for when pefile fails?,can we add peid for when pefile fails?,,,Woman (cisgender),73,en,1
563,429,1322,65,user,"param_grid = {\n 'min_samples': [5, 10, 20]...",provide more steps,"param_grid = {\n 'min_samples': [5, 10, 20]...",,Woman (cisgender),73,en,1
564,334,484,21,user,i think i onlz want to think about the imbalan...,i think i only want to think about the imbalan...,,,Woman (cisgender),73,en,1
565,444,1364,65,user,from sklearn.cluster import OPTICS\nfrom sklea...,this worked. but i do not have visualizations ...,from sklearn.cluster import OPTICS\nfrom sklea...,,Woman (cisgender),73,en,1


In [40]:
filtered_prompts

Unnamed: 0,index,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language,label
0,0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en,0
1,1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en,0
2,2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en,0
3,3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en,0
4,4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en,0
...,...,...,...,...,...,...,...,...,...,...,...,...
401,374,1364,65,user,from sklearn.cluster import OPTICS\nfrom sklea...,this worked. but i dont have visualisaitons an...,from sklearn.cluster import OPTICS\nfrom sklea...,,Woman (cisgender),73,en,1
402,278,514,21,user,this doesnt do anything,this doesnt do anything,,this doesnt do anything,Woman (cisgender),73,en,1
403,266,486,21,user,3\. **Class Weighting**this looks promising.\n...,this looks promising. so my unmanipluated clas...,,3. **Class Weighting**,Woman (cisgender),73,en,1
404,335,1268,65,user,import pandas as pd\nfrom sklearn.manifold imp...,please use different color for every label no ...,import pandas as pd\nfrom sklearn.manifold imp...,,Woman (cisgender),73,en,1


## Save to database

In [41]:
expanded_prompts.to_sql('expanded_roberta_prompts', conn, if_exists='replace', index=False)
filtered_prompts.to_sql('filtered_roberta_prompts', conn, if_exists='replace', index=False)

conn.close()