Loading and creating the mme_dataset

In [2]:
import pandas as pd

# create a dataframe with only the ExtendedSessionID column from the 71M SharedResponses.csv file

reader2 = pd.read_csv('SharedResponses.csv', usecols=['ResponseID'])

In [3]:
# code that will check if there are two of them in the dataset

# Count how many times each ResponseID appears
response_counts = reader2['ResponseID'].value_counts()

# Get the ResponseIDs that appear exactly twice
ids_to_keep = response_counts[response_counts == 2].index

# Filter the rows where ResponseID is in the list of ids_to_keep
complete_responseid = reader2[reader2['ResponseID'].isin(ids_to_keep)]


In [4]:
complete_responseid.value_counts().min()

2

In [5]:
complete_responseid.shape

(67907006, 1)

In [6]:
# Drop duplicates based on 'ResponseID' and keep the first occurrence
complete_responseid = complete_responseid.drop_duplicates(subset=['ResponseID'], keep='first')

In [7]:
reader_subset = complete_responseid.sample(n=3500000) # want 7M rows (with accounting for NAn's and deleting rows with 'random' (around 10%)), so 7M / 2 = 3.5M ResponseID's necessary

In [8]:
# checking if they are all unique

reader_subset.nunique()

ResponseID    3500000
dtype: int64

In [9]:
# transform to a list to feed to the for loop that will extract all the corresponding columns from the SharedResponses.csv file

reader_sub_list = reader_subset['ResponseID'].tolist()

In [10]:
# check if it went okay
print(len(reader_sub_list))     # should be 8500000

3500000


In [11]:
# empty dataframe to append the rows of the sessions with 24 rows to

subset = pd.DataFrame()

In [12]:
# reading SharedResponses to extract the rows with ResponseID's in reader_sub_list
# this will result in a dataframe 'subset' that contains around 7 million rows (3,5 * 2)

chunk_size = 500_000
reader = pd.read_csv('SharedResponses.csv', chunksize=chunk_size, dtype=str, low_memory=False)

for i, chunk in enumerate(reader):
    
    print(f"Processing chunk {i+1}")

    # Filter rows where ResponseID is in reader_subset
    subset_chunk = chunk[chunk['ResponseID'].isin(reader_sub_list)]

    # Append filtered chunk to empty df
    subset = pd.concat([subset, subset_chunk], ignore_index=True)

    print(f"Finished processing chunk {i+1}")

print("All chunks have been processed and combined.")

Processing chunk 1
Finished processing chunk 1
Processing chunk 2
Finished processing chunk 2
Processing chunk 3
Finished processing chunk 3
Processing chunk 4
Finished processing chunk 4
Processing chunk 5
Finished processing chunk 5
Processing chunk 6
Finished processing chunk 6
Processing chunk 7
Finished processing chunk 7
Processing chunk 8
Finished processing chunk 8
Processing chunk 9
Finished processing chunk 9
Processing chunk 10
Finished processing chunk 10
Processing chunk 11
Finished processing chunk 11
Processing chunk 12
Finished processing chunk 12
Processing chunk 13
Finished processing chunk 13
Processing chunk 14
Finished processing chunk 14
Processing chunk 15
Finished processing chunk 15
Processing chunk 16
Finished processing chunk 16
Processing chunk 17
Finished processing chunk 17
Processing chunk 18
Finished processing chunk 18
Processing chunk 19
Finished processing chunk 19
Processing chunk 20
Finished processing chunk 20
Processing chunk 21
Finished processin

In [13]:
subset.shape

(7000000, 41)

In [14]:
subset_csv = subset.to_csv('subset.csv', index=False)

In [1]:
import pandas as pd
df_mme = pd.read_csv('subset.csv')

  df_mme = pd.read_csv('subset.csv')


In [2]:
df_mme.shape

(7000000, 41)

In [3]:
df_mme.head()

Unnamed: 0,ResponseID,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,...,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat
0,2222sJk4DcoqXXi98,1043988516_3525281295.0,3525281000.0,2,0,0,0,0,Rand,Random,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2223jMWDEGNeszivb,-1683127088_785070916172117.0,785070900000000.0,8,0,1,0,2,More,Utilitarian,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,2225gNWJcAeE92LXd,2069688900_9887644874714294.0,9887645000000000.0,3,0,0,1,0,More,Utilitarian,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
3,222BhQ87fGnAC4FWY,-2134337540_4570487844678215.0,4570488000000000.0,6,0,0,1,0,Male,Gender,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,222Bih22xMQR5brhF,-841718081_3084184331213722.0,3084184000000000.0,11,0,0,0,2,Pets,Species,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0


In [4]:
df_mme['ScenarioTypeStrict'].value_counts()

ScenarioTypeStrict
Utilitarian      1256212
Species          1235954
Age              1234418
Fitness          1233222
Gender           1232558
Random            615844
Social Status     191792
Name: count, dtype: int64

In [5]:
# removing all rows with 'random' in the ScenarioTypeStrict column, as LLM's don't have a random scenario

df_mme = df_mme[df_mme['ScenarioTypeStrict'] != 'Random']

# should be 7000000 - 615844 = 6384156

In [6]:
df_mme.shape

(6384156, 41)

In [7]:
# removing all rows with 'Rand' in the AttributeLevel column, as LLM's don't have a random scenario
df_mme['AttributeLevel'].value_counts()

AttributeLevel
More       628106
Less       628106
Pets       617977
Hoomans    617977
Male       616279
Female     616279
Young      596653
Old        596653
Fat        556350
Fit        556350
Rand       203250
Low         75088
High        75088
Name: count, dtype: int64

In [8]:
df_mme = df_mme[df_mme['AttributeLevel'] != 'Rand']

In [9]:
df_mme.shape # should be 6384156 - 203250 = 6180906

(6180906, 41)

In [10]:
df_mme.isna().sum()

ResponseID                      0
ExtendedSessionID               0
UserID                        756
ScenarioOrder                   0
Intervention                    0
PedPed                          0
Barrier                         0
CrossingSignal                  0
AttributeLevel                  0
ScenarioTypeStrict              0
ScenarioType                    0
DefaultChoice                   0
NonDefaultChoice                0
DefaultChoiceIsOmission         0
NumberOfCharacters              0
DiffNumberOFCharacters          0
Saved                           0
Template                   804942
DescriptionShown           804942
LeftHand                   804942
UserCountry3                58372
Man                             0
Woman                           0
Pregnant                        0
Stroller                        0
OldMan                          0
OldWoman                        0
Boy                             0
Girl                            0
Homeless      

In [11]:
# deleting rows with NaN's in the UserID column

df_mme = df_mme.dropna(subset=['UserID'])

In [12]:
df_mme.shape # should be 6180906 - 756 = 6180150

(6180150, 41)

In [13]:
# the total dataset has to be 5M rows.
# 2% of that is LLM's, so 100.000 rows
# the other 98% will be humans, so 4.900.000 rows

# need to subset 4900000 rows from the 6180150 rows
# need to delete 6180150 - 4900000 = 1280150 rows

In [14]:
# randomly delete 1280150 / 2 = 640075 unique ResponseID's (is 1280150 rows), to ensure 2% of the dataset is LLMs and 98% humans

# Getting unique UserIDs
Response_unique = df_mme['ResponseID'].unique()
print(len(Response_unique))  # should be 3090075

# Selecting 640075 UserIDs from the unique set
Response_delete = pd.Series(Response_unique).sample(n=640075, random_state=42)


3090075


In [15]:
df_mme_98 = df_mme[~df_mme['ResponseID'].isin(Response_delete)]

In [16]:
# checking shape of df_filtered

df_mme_98.shape   # should be 6180150 - 1280150 = 4900000

(4900000, 41)

In [17]:
df_mme_98.head()

Unnamed: 0,ResponseID,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,...,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat
1,2223jMWDEGNeszivb,-1683127088_785070916172117.0,785070900000000.0,8,0,1,0,2,More,Utilitarian,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,222BhQ87fGnAC4FWY,-2134337540_4570487844678215.0,4570488000000000.0,6,0,0,1,0,Male,Gender,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,222Bih22xMQR5brhF,-841718081_3084184331213722.0,3084184000000000.0,11,0,0,0,2,Pets,Species,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0
5,222KuWty7pNeiv77a,1654911454_3639764894860440.0,3639765000000000.0,8,0,1,0,0,Low,Social Status,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,222PL8SYdZnhNnSGp,-2091429432_6630077581817493.0,6630078000000000.0,1,0,1,0,1,Hoomans,Species,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# check what the distribution of RepsonseID's is (kinda)
# check if the indices of a responseID here are the same after transforming the ResponseID column
# if the indices are the same means ResponseID column is succesfully transformed

indices = df_mme_98[df_mme_98["ResponseID"] == '2223jMWDEGNeszivb'].index
print(indices)

Index([1, 882974], dtype='int64')


In [19]:
# Changing the responseID

# Define the starting point for the new ResponseID
starting_id = 146784

# Step 1: Get the unique ResponseIDs
unique_response_ids = df_mme_98['ResponseID'].unique()

# Step 2: Create a mapping from old ResponseID to new 'res_' formatted ID
response_id_mapping = {old_id: f'res_{i:08d}' for i, old_id in enumerate(unique_response_ids, starting_id)}

# Step 3: Replace the original ResponseID with the new mapped IDs
df_mme_98['ResponseID'] = df_mme_98['ResponseID'].map(response_id_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mme_98['ResponseID'] = df_mme_98['ResponseID'].map(response_id_mapping)


In [20]:
# ResponseID starts with res_00146784
df_mme_98.head()

Unnamed: 0,ResponseID,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,...,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat
1,res_00146784,-1683127088_785070916172117.0,785070900000000.0,8,0,1,0,2,More,Utilitarian,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,res_00146785,-2134337540_4570487844678215.0,4570488000000000.0,6,0,0,1,0,Male,Gender,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,res_00146786,-841718081_3084184331213722.0,3084184000000000.0,11,0,0,0,2,Pets,Species,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0
5,res_00146787,1654911454_3639764894860440.0,3639765000000000.0,8,0,1,0,0,Low,Social Status,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,res_00146788,-2091429432_6630077581817493.0,6630078000000000.0,1,0,1,0,1,Hoomans,Species,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
indices = df_mme_98[df_mme_98["ResponseID"] == 'res_00146784'].index
print(indices)

# checking if the indices are the same after transforming the ResponseID column
# compared to the indices of the ResponseID before transforming the column - they are!
# both are [2, 882974]

Index([1, 882974], dtype='int64')


In [22]:
df_mme_98.tail()

Unnamed: 0,ResponseID,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,...,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat
6999990,res_02596779,-1536551642_3428762455.0,3428762000.0,8,1,1,0,0,Less,Utilitarian,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6999991,res_02596780,-1409806035_678401095,678401100.0,6,1,0,1,0,Male,Gender,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6999992,res_02596781,-1222087996_908819923,908819900.0,13,1,1,0,0,Male,Gender,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6999995,res_02596782,59083021_6350949440772371.0,6350949000000000.0,1,1,1,0,0,Fit,Fitness,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0
6999997,res_02596783,-1845235964_3077453412676796.0,3077453000000000.0,4,1,1,0,0,More,Utilitarian,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0


In [23]:
# resetting index

df_mme_98.reset_index(drop=True, inplace=True)

In [24]:
df_mme_98.columns

Index(['ResponseID', 'ExtendedSessionID', 'UserID', 'ScenarioOrder',
       'Intervention', 'PedPed', 'Barrier', 'CrossingSignal', 'AttributeLevel',
       'ScenarioTypeStrict', 'ScenarioType', 'DefaultChoice',
       'NonDefaultChoice', 'DefaultChoiceIsOmission', 'NumberOfCharacters',
       'DiffNumberOFCharacters', 'Saved', 'Template', 'DescriptionShown',
       'LeftHand', 'UserCountry3', 'Man', 'Woman', 'Pregnant', 'Stroller',
       'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless', 'LargeWoman',
       'LargeMan', 'Criminal', 'MaleExecutive', 'FemaleExecutive',
       'FemaleAthlete', 'MaleAthlete', 'FemaleDoctor', 'MaleDoctor', 'Dog',
       'Cat'],
      dtype='object')

### EDA Countries

In [25]:
# some EDA on the country column to assess the distribution of the user ethnicity/culture
# Can't do this later as then the column will be removed because of identifyable information - the LLM dataset does not have country information

# checking which countries are in the dataset, and what their frequency is
# need this for the comparison with the non_western_dataset

print(df_mme_98['UserCountry3'].value_counts())

UserCountry3
USA    1244280
DEU     321040
BRA     279090
FRA     275218
GBR     258982
        ...   
NFK          2
LSO          2
MHL          2
GNQ          2
CAF          2
Name: count, Length: 226, dtype: int64


In [26]:
# List of countries considered 'Western' from the UN

countries = ["AND",  "AUS", "AUT", "BEL", "CAN", "CHE", "DNK", "DEU", "ESP", "FIN",  "FRA",  "GBR",  "GRC",  "IRL", 
            "ISR", "ISL", "ITA", "LIE", "LUX", "MLT", "MCO", "NLD", "NOR", "NZL", "PRT", "SMR", "SWE", "TUR", "USA" ]

# Andorra, Australia, Austria, Belgium, Canada, Switzerland, Denmark, Germany, Spain, Finland, France, United Kingdom, Greece, Ireland, 
# Israel, Iceland, Italy, Liechtenstein, Luxembourg, Malta, Monaco, Netherlands, Norway, New Zealand, Portugal, San Marino, Sweden, Turkey, USA

In [27]:
# what number of rows that is from western countries?

total_western = df_mme_98['UserCountry3'].isin(countries).sum()
print(f"Number of rows Western: {total_western}")

# this is % of the dataset that is from Western countries
print(f"Rounded % Western: {round(total_western /len(df_mme_98)*100)}")     # rounded percentage
print(f"Unrounded % Western: {total_western /len(df_mme_98)*100}")          # exact percentage

Number of rows Western: 3377172
Rounded % Western: 69
Unrounded % Western: 68.9218775510204


EDA of countries is now over

In [28]:
# already deleting some columns to ensure it fits in memory
# deleting the columns that are not necessary for the modelling: ExtendedSessionID, DefaultChoice, NonDefaultChoice, DefaultChoiceIsOmission, Template

df_mme_clean = df_mme_98.drop(columns=['ExtendedSessionID', 'DefaultChoice', 'NonDefaultChoice', 'DefaultChoiceIsOmission', 'Template', 'ScenarioType', 'ScenarioOrder', 'DescriptionShown', 'LeftHand', 'UserCountry3'], axis=1)

In [29]:
df_mme_clean.shape # should be 4900000 rows and 31 columns

(4900000, 31)

In [30]:
# binarizing UserID - making all into 0
df_mme_clean['UserID'] = 0

In [31]:
# saving this pre-preprocessed mme dataset to a csv file

df_mme_clean.to_csv('mme_dataset.csv', index=False)