In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import helpers.cleaning as data_cleaning_utils
import scipy.stats as stats
import helpers.utils as utils
from helpers.constants import *

In [2]:
# Load the raw Qualtrics export and keep it under its own name so the
# unfiltered frame stays available for inspection.
raw_users_df = data_cleaning_utils.load_qualtrics_csv('data/qualtrics.csv')

# Apply the participant exclusion filters selected by the keyword flags below.
users_df = data_cleaning_utils.clean_users_df(
    raw_users_df,
    keep_only_prolific_for_india=True,
    keep_only_prolific_for_us=True,
    remove_born_outside=True,
    remove_pilot=True,
)

# Placeholder column; presumably populated later when the event logs are
# processed -- TODO confirm against helpers.utils.
users_df['group'] = None

# Number of participants remaining after cleaning.
print(users_df.shape[0])

Removing users who were not born in the same country as they currently reside
118


In [None]:
# Unpack the three analysis frames returned by the helper.
# NOTE(review): this cell is saved without an execution count, yet later cells
# read `tasks_df` and a populated `group` column -- re-run the notebook top to
# bottom and confirm this call is what provides them.
events_df, tasks_df, suggestions_df = utils.construct_dfs_for_analysis(
    users_df,
    EVENTS_DIR,
    TREATMENT_LABEL,
    CONTROL_LABEL,
)

Short Schwartz's Value Survey (SSVS) analysis

In [4]:
# Names of all columns whose label starts with 'ssvs'.
is_ssvs_col = users_df.columns.str.startswith('ssvs')
ssvs_cols = users_df.columns[is_ssvs_col].tolist()

# Keep only participants with a non-missing 'ssvs_achievement' answer, and only
# the country plus the SSVS columns, before handing the frame to the scorer.
has_ssvs_answer = users_df['ssvs_achievement'].notna()
df_ssvs = utils.compute_ssvs_scores(
    users_df.loc[has_ssvs_answer, ['country'] + ssvs_cols]
)

In [5]:
# Shapiro-Wilk normality check, run separately for each country.
# NOTE(review): the saved output lists only the ten ssvs_* columns although
# 'conservation' and 'transcendence' are requested too -- confirm whether the
# output is stale or the helper drops them.
normality_cols = ssvs_cols + ['conservation', 'transcendence']
utils.perform_normality_test(
    df_ssvs,
    normality_cols,
    filter_col='country',
    filter_vals=['IND', 'US'],
)

Unnamed: 0,col,shapiro_IND,shapiro_US
0,ssvs_power,Not normal,Not normal
1,ssvs_achievement,Not normal,Not normal
2,ssvs_hedonism,Not normal,Not normal
3,ssvs_stimulation,Not normal,Not normal
4,ssvs_self-direction,Not normal,Not normal
5,ssvs_universalism,Not normal,Not normal
6,ssvs_benevolence,Not normal,Not normal
7,ssvs_tradition,Not normal,Not normal
8,ssvs_conformity,Not normal,Not normal
9,ssvs_security,Not normal,Not normal


In [6]:
# Mann-Whitney U test comparing participants from India and the US, one test
# per column.
# NOTE(review): no multiple-comparison correction is visible in this cell --
# confirm whether the helper adjusts the p-values before they are reported.
comparison_cols = ssvs_cols + ['conservation', 'transcendence']
df_stats = utils.perform_statistical_test(
    df_ssvs,
    comparison_cols,
    filter_col='country',
    filter_vals=['IND', 'US'],
    test_name='mannwhitney',
)
df_stats

Unnamed: 0,col,u_stat,p_value,significant
0,ssvs_power,2399.0,0.000319,True
1,ssvs_achievement,2216.0,0.008788,True
2,ssvs_hedonism,2157.5,0.022291,True
3,ssvs_stimulation,2205.0,0.010984,True
4,ssvs_self-direction,1879.0,0.422224,False
5,ssvs_universalism,2267.5,0.003075,True
6,ssvs_benevolence,2045.0,0.081727,False
7,ssvs_tradition,2347.5,0.00088,True
8,ssvs_conformity,2529.5,1.4e-05,True
9,ssvs_security,2665.0,0.0,True


Stats for the paper

In [7]:
# Participant counts per (country, group) combination.
# groupby() leaves out rows whose grouping key is missing (pandas default
# dropna=True), so any participant without a group would not be counted here.
users_df.groupby(['country', 'group']).size()

country  group
IND      AI       36
         No AI    24
US       AI       29
         No AI    29
dtype: int64

In [10]:
# Per-country summary figures for the user demographics table.
for country in users_df['country'].unique():
    dft = users_df[users_df['country'] == country]
    print(f'Country: {country}')
    print(f'Num participants: {len(dft)}')

    # Cast the age column to int once and reuse it for both statistics.
    # Series.std() uses ddof=1 by default (sample standard deviation).
    ages = dft['age'].astype(int)
    age_mean, age_std = ages.mean(), ages.std()
    print(f'Age: {age_mean:.2f} ± {age_std:.2f}')

    # normalize=True yields each category's share rather than its raw count.
    genders = dft['gender'].value_counts(normalize=True).to_dict()
    print("Gender", genders)

    education = dft['education'].value_counts(normalize=True).to_dict()
    print("Education", education)

    # Free-text answers are printed as-is; per the labels they are
    # de-duplicated outside the notebook.
    languages = dft['languages'].tolist()
    print("Languages (dedup using ChatGPT)", languages)

    occupations = dft['occupation'].tolist()
    print("Occupations (dedup using ChatGPT)", occupations)

    print()

Country: IND
Num participants: 60
Age: 33.38 ± 11.85
Gender {'Male': 0.7166666666666667, 'Female': 0.26666666666666666, 'Prefer not to say': 0.016666666666666666}
Education {'Graduation': 0.43333333333333335, 'Post-graduation': 0.4166666666666667, 'Upto grade 12 (Inter)': 0.15}
Languages (dedup using ChatGPT) ['Konkani, English and Hindi', 'English, Malayalam, Hindi, Tamil, French', 'Malayalam, English', 'English, Hindi', 'tamil', 'English, Hindi, Gujarati', 'English, Hindi, Punjabi', 'Malayalam, English, Hindi, Tamil', 'English, Hindi, Marathi', 'English, Urdu, Kashmiri', 'Telugu, English, Java, Python', 'English, Kannada, Hindi, Marathi', 'English, Hindi, Malayalam, Tamil ', 'English, Telugu, Hindi ', 'English, Hindi, Punjabi, Gujarati, Urdu and basic German', 'Hindi, English, Marathi', 'English, malayalam, hindi, tamil', 'English, Hindi', 'English', 'English, Hindi', 'English, Hindi, French', 'Hindi, English, German, Sanskrit', 'Marathi English Hindi', 'Hindi, Marathi, English and F

In [15]:
# Report users whose mean `ai_reliance` across their tasks is exactly 0.
# Open question from the original author: should these users be removed, on
# the presumption that they did not engage meaningfully with the study? The
# removal below is left commented out, so this cell only reports them.
ai_reliance_per_user = tasks_df.groupby('user_id')['ai_reliance'].mean()

no_reliance_mask = ai_reliance_per_user.eq(0)
users_with_no_ai_use = ai_reliance_per_user.loc[no_reliance_mask].index.tolist()

# Look up the country of each flagged user; this indexes users_df by user id.
no_ai_use_by_country = users_df.loc[users_with_no_ai_use, 'country'].value_counts()
print(f"{len(users_with_no_ai_use)} users with no AI use:\n{no_ai_use_by_country}")

# users_df = users_df[~users_df.index.isin(users_with_no_ai_use)]
# tasks_df = tasks_df[tasks_df['user_id'].isin(users_df.index)]

1 users with no AI use:
US    1
Name: country, dtype: int64
