In [3]:
import pandas as pd
import os  # We'll use os for reliable file path handling

# Path to the data directory (relative to the repository root)
data_path = 'Term-paper/data'  # Since we're already in the Term-paper folder

# Get list of CSV files using os.listdir (more reliable across systems)
try:
    # Get all files in the data directory
    all_files = os.listdir(data_path)
    # Filter for CSV files
    csv_files = [f for f in all_files if f.lower().endswith('.csv')]
    
    if not csv_files:
        print('No CSV files found in', data_path)
        merged_df = pd.DataFrame()
    else:
        dfs = []  # list to store (filename, dataframe) pairs
        for fname in csv_files:
            full_path = os.path.join(data_path, fname)  # Use os.path.join for reliable path construction
            try:
                df = pd.read_csv(full_path, sep=';')
                dfs.append((fname, df))
                print('Loaded', fname, '->', df.shape)
            except Exception as e:
                print('Failed to read', full_path, ':', e)

        # If files were successfully read, merge them
        if dfs:
            # Start with first DataFrame
            merged_df = dfs[0][1]
            for name, df in dfs[1:]:
                # find common columns to merge on
                common = [c for c in merged_df.columns if c in df.columns]
                if common:
                    # merge on all common columns (outer join to keep data)
                    merged_df = pd.merge(merged_df, df, how='outer', on=common)
                else:
                    # no common columns: concatenate side-by-side
                    merged_df = pd.concat([merged_df, df], axis=1)
            
            print('Final merged shape:', merged_df.shape)
        else:
            merged_df = pd.DataFrame()
            print('No files were successfully read')

except Exception as e:
    print('Error accessing data directory:', e)
    merged_df = pd.DataFrame()

Loaded SCE-Sep-2013.csv -> (1529, 29)
Loaded SCE-Nov-2022.csv -> (1184, 29)
Loaded SCE-Jan-2021.csv -> (1259, 29)
Loaded CPI.csv -> (944, 1)
Loaded SCE-Jan-2020.csv -> (1317, 29)
Loaded SCE-Nov-2023.csv -> (1098, 29)
Loaded SCE-Nov-2021.csv -> (1281, 29)
Loaded SCE-Aug-2018.csv -> (1331, 29)
Loaded SCE-Aug-2024.csv -> (1108, 29)
Loaded SCE-Jul-2013.csv -> (1197, 29)
Loaded SCE-Jan-2022.csv -> (1235, 29)
Loaded SCE-Dec-2019.csv -> (1262, 29)
Loaded SCE-Dec-2018.csv -> (1268, 29)
Loaded SCE-Dec-2024.csv -> (976, 29)
Loaded SCE-Jan-2023.csv -> (1178, 29)
Loaded SCE-Aug-2019.csv -> (1290, 29)
Loaded SCE-Nov-2020.csv -> (1233, 29)
Loaded SCE-Sep-2015.csv -> (1262, 29)
Loaded SCE-Mar-2017.csv -> (1365, 29)
Loaded SCE-Nov-2024.csv -> (1037, 29)
Loaded SCE-Nov-2018.csv -> (1323, 29)
Loaded SCE-Aug-2021.csv -> (1265, 29)
Loaded SCE-Jul-2016.csv -> (1305, 29)
Loaded SCE-Apr-2015.csv -> (1283, 29)
Loaded SCE-May-2014.csv -> (1280, 29)
Loaded SCE-Dec-2020.csv -> (1337, 29)
Loaded SCE-Dec-2021.csv 

In [4]:
merged_df

Unnamed: 0,userid,wid,date,weight,female,educ,age,hispanic,black,couple,...,num_lit_q3_correct,num_lit_q5,num_lit_q5_correct,num_lit_q6,num_lit_q6_correct,num_lit_q8,num_lit_q8_correct,num_lit_q9,num_lit_q9_correct,"DATE,CPI"
0,70000220,201306,2013-06-04,16.3,1.0,3.0,28.0,0.0,1.0,0.0,...,0.0,100.0,1.0,5.0,1.0,,,,,
1,70000224,201306,2013-06-03,0.2,0.0,4.0,65.0,0.0,0.0,1.0,...,1.0,100.0,1.0,5.0,1.0,,,,,
2,70000234,201306,2013-06-17,4.1,1.0,3.0,41.0,0.0,0.0,1.0,...,1.0,100.0,1.0,5.0,1.0,,,,,
3,70000238,201306,2013-06-13,3.0,0.0,3.0,74.0,0.0,0.0,1.0,...,1.0,1.0,0.0,5.0,1.0,,,,,
4,70000238,201307,2013-07-10,1.9,0.0,3.0,74.0,0.0,0.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176096,75025299,202412,2024-12-19,0.6,1.0,3.0,33.0,0.0,0.0,1.0,...,1.0,100.0,1.0,5.0,1.0,2.0,0.0,2.0,1.0,
176097,75025320,202412,2024-12-05,0.8,1.0,4.0,56.0,1.0,0.0,0.0,...,1.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0,
176098,75025337,202412,2024-12-21,1.0,1.0,3.0,68.0,0.0,0.0,1.0,...,1.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0,
176099,75025373,202412,2024-12-09,2.4,1.0,2.0,58.0,0.0,0.0,0.0,...,1.0,100.0,1.0,1.0,0.0,3.0,1.0,2.0,1.0,


In [5]:
merged_df = merged_df.drop('DATE,CPI', axis=1)

In [7]:
num_nunique_id = merged_df['userid'].nunique()
print('Number of unique user IDs:', num_nunique_id)

Number of unique user IDs: 23369


In [30]:
merged_df.head()

Unnamed: 0,userid,wid,date,weight,female,educ,age,hispanic,black,couple,...,num_lit_q3,num_lit_q3_correct,num_lit_q5,num_lit_q5_correct,num_lit_q6,num_lit_q6_correct,num_lit_q8,num_lit_q8_correct,num_lit_q9,num_lit_q9_correct
0,70000220,201306,2013-06-04,16.3,1.0,3.0,28.0,0.0,1.0,0.0,...,100.0,0.0,100.0,1.0,5.0,1.0,,,,
1,70000224,201306,2013-06-03,0.2,0.0,4.0,65.0,0.0,0.0,1.0,...,10.0,1.0,100.0,1.0,5.0,1.0,,,,
2,70000234,201306,2013-06-17,4.1,1.0,3.0,41.0,0.0,0.0,1.0,...,10.0,1.0,100.0,1.0,5.0,1.0,,,,
3,70000238,201306,2013-06-13,3.0,0.0,3.0,74.0,0.0,0.0,1.0,...,10.0,1.0,1.0,0.0,5.0,1.0,,,,
4,70000238,201307,2013-07-10,1.9,0.0,3.0,74.0,0.0,0.0,,...,10.0,1.0,1.0,0.0,5.0,1.0,,,,


In [12]:
print(f'{len(merged_df['wid'].unique())} is the number of survey waves')

139 is the number of survey waves


In [17]:
merged_df['date'] = pd.to_datetime(merged_df['date'], format='%Y-%m-%d')

In [19]:
merged_df['date'].dtype

dtype('<M8[ns]')

In [21]:
first_date = merged_df['date'].min()
last_date = merged_df['date'].max()

print("First date in dataset:", first_date)
print("Last date in dataset:", last_date)

First date in dataset: 2013-06-01 00:00:00
Last date in dataset: 2024-12-31 00:00:00


# Part 2

In [27]:


# Identify all numeracy-related columns automatically
num_cols = [col for col in merged_df.columns if col.startswith('num_lit_')]

# Fill missing numeracy values per individual
merged_df[num_cols] = merged_df.groupby('userid')[num_cols].transform(lambda x: x.ffill().bfill())


In [28]:
merged_df[num_cols].isna().sum()


num_lit_q1              217
num_lit_q1_correct      217
num_lit_q2              438
num_lit_q2_correct      438
num_lit_q3              643
num_lit_q3_correct      643
num_lit_q5              434
num_lit_q5_correct      434
num_lit_q6             1053
num_lit_q6_correct     1053
num_lit_q8            36396
num_lit_q8_correct    36396
num_lit_q9            36718
num_lit_q9_correct    36718
dtype: int64

In [29]:
merged_df.groupby('userid')[num_cols].first().isna().sum()


num_lit_q1              66
num_lit_q1_correct      66
num_lit_q2              97
num_lit_q2_correct      97
num_lit_q3             150
num_lit_q3_correct     150
num_lit_q5             126
num_lit_q5_correct     126
num_lit_q6             218
num_lit_q6_correct     218
num_lit_q8            5384
num_lit_q8_correct    5384
num_lit_q9            5435
num_lit_q9_correct    5435
dtype: int64

In [31]:
merged_df = merged_df.dropna(subset=['female','age','educ'])

In [33]:
merged_df[['female', 'age', 'educ']].isna().sum()

female    0
age       0
educ      0
dtype: int64

In [35]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 175233 entries, 0 to 176100
Data columns (total 29 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   userid              175233 non-null  int64         
 1   wid                 175233 non-null  int64         
 2   date                175233 non-null  datetime64[ns]
 3   weight              175211 non-null  float64       
 4   female              175233 non-null  float64       
 5   educ                175233 non-null  float64       
 6   age                 175233 non-null  float64       
 7   hispanic            175146 non-null  float64       
 8   black               175233 non-null  float64       
 9   couple              162489 non-null  float64       
 10  num_kids            175205 non-null  float64       
 11  owner               23204 non-null   float64       
 12  inflation           174572 non-null  float64       
 13  house_price_change  174939 non-nul

In [36]:
before = len(merged_df)
print("Rows before drop:", before)

merged_df = merged_df.dropna(subset=['inflation','house_price_change','prob_stocks_up'])
after = len(merged_df)
print("Rows after drop:", after)
print("Rows dropped due to missing key variables:", before - after)

Rows before drop: 175233
Rows after drop: 173550
Rows dropped due to missing key variables: 1683


In [37]:
merged_df[num_cols]

Unnamed: 0,num_lit_q1,num_lit_q1_correct,num_lit_q2,num_lit_q2_correct,num_lit_q3,num_lit_q3_correct,num_lit_q5,num_lit_q5_correct,num_lit_q6,num_lit_q6_correct,num_lit_q8,num_lit_q8_correct,num_lit_q9,num_lit_q9_correct
0,550.0,0.0,20.0,0.0,100.0,0.0,100.0,1.0,5.0,1.0,,,,
1,550.0,0.0,20.0,0.0,10.0,1.0,100.0,1.0,5.0,1.0,,,,
2,550.0,0.0,20.0,0.0,10.0,1.0,100.0,1.0,5.0,1.0,,,,
3,65.0,0.0,20.0,0.0,10.0,1.0,1.0,0.0,5.0,1.0,,,,
4,65.0,0.0,20.0,0.0,10.0,1.0,1.0,0.0,5.0,1.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176096,550.0,0.0,242.0,1.0,10.0,1.0,100.0,1.0,5.0,1.0,2.0,0.0,2.0,1.0
176097,150.0,1.0,40.0,0.0,10.0,1.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0
176098,150.0,1.0,242.0,1.0,10.0,1.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0
176099,150.0,1.0,224.0,0.0,10.0,1.0,100.0,1.0,1.0,0.0,3.0,1.0,2.0,1.0


In [39]:
before = len(merged_df)
print("Rows before drop:", before)

merged_df = merged_df.dropna(subset=num_cols)

after = len(merged_df)
print("Rows after drop:", after)
print("Rows dropped due to missing numeracy data:", before - after)

Rows before drop: 173550
Rows after drop: 137576
Rows dropped due to missing numeracy data: 35974


In [47]:
threshhold_min = merged_df['inflation'].quantile(0.001)
threshhold_max = merged_df['inflation'].quantile(0.999)
print(threshhold_min)
print(threshhold_max)

-75.0
100.0


In [48]:
before = len(merged_df)
print("Rows before drop:", before)

merged_df = merged_df[merged_df['inflation'].between(threshhold_min, threshhold_max)]

after = len(merged_df)
print("Rows after drop:", after)
print("Rows dropped due to outliers in inflation:", before - after)

Rows before drop: 137576
Rows after drop: 137457
Rows dropped due to outliers in inflation: 119


In [51]:
merged_df['educ']

35333     4.0
35334     4.0
35335     4.0
35336     4.0
35337     4.0
         ... 
176096    3.0
176097    4.0
176098    3.0
176099    2.0
176100    4.0
Name: educ, Length: 137457, dtype: float64

In [53]:
iscollege = (merged_df['educ']>= 3)
merged_df.loc[:, 'college'] = (merged_df['educ'] >= 3).astype(int)

In [58]:
merged_df[['educ','college']]

Unnamed: 0,educ,college
35333,4.0,1
35334,4.0,1
35335,4.0,1
35336,4.0,1
35337,4.0,1
...,...,...
176096,3.0,1
176097,4.0,1
176098,3.0,1
176099,2.0,0


In [60]:
merged_df[num_cols]

Unnamed: 0,num_lit_q1,num_lit_q1_correct,num_lit_q2,num_lit_q2_correct,num_lit_q3,num_lit_q3_correct,num_lit_q5,num_lit_q5_correct,num_lit_q6,num_lit_q6_correct,num_lit_q8,num_lit_q8_correct,num_lit_q9,num_lit_q9_correct
35333,150.0,1.0,242.0,1.0,10.0,1.0,10.0,0.0,5.0,1.0,3.0,1.0,2.0,1.0
35334,150.0,1.0,242.0,1.0,10.0,1.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0
35335,150.0,1.0,242.0,1.0,10.0,1.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0
35336,150.0,1.0,242.0,1.0,10.0,1.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0
35337,150.0,1.0,242.0,1.0,10.0,1.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176096,550.0,0.0,242.0,1.0,10.0,1.0,100.0,1.0,5.0,1.0,2.0,0.0,2.0,1.0
176097,150.0,1.0,40.0,0.0,10.0,1.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0
176098,150.0,1.0,242.0,1.0,10.0,1.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0
176099,150.0,1.0,224.0,0.0,10.0,1.0,100.0,1.0,1.0,0.0,3.0,1.0,2.0,1.0


In [85]:
# Columns ending with _correct
corrects = [col for col in num_cols if col.endswith('_correct')]

# Group by user and sum the correct answers
merged_df['total_correct'] = merged_df.groupby('userid')[corrects].transform('sum').sum(axis=1)



In [82]:
merged_df = merged_df.drop(columns=['total_correct'])

In [86]:
merged_df

Unnamed: 0,userid,wid,date,weight,female,educ,age,hispanic,black,couple,...,num_lit_q5,num_lit_q5_correct,num_lit_q6,num_lit_q6_correct,num_lit_q8,num_lit_q8_correct,num_lit_q9,num_lit_q9_correct,college,total_correct
35333,70057317,201504,2015-04-10,0.5,0.0,4.0,70.0,0.0,0.0,1.0,...,10.0,0.0,5.0,1.0,3.0,1.0,2.0,1.0,1,6.0
35334,70057321,201504,2015-04-05,0.5,1.0,4.0,72.0,0.0,0.0,0.0,...,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,77.0
35335,70057321,201505,2015-05-11,0.5,1.0,4.0,72.0,0.0,0.0,0.0,...,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,77.0
35336,70057321,201506,2015-06-20,0.4,1.0,4.0,72.0,0.0,0.0,0.0,...,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,77.0
35337,70057321,201507,2015-07-22,0.4,1.0,4.0,72.0,0.0,0.0,0.0,...,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,77.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176096,75025299,202412,2024-12-19,0.6,1.0,3.0,33.0,0.0,0.0,1.0,...,100.0,1.0,5.0,1.0,2.0,0.0,2.0,1.0,1,5.0
176097,75025320,202412,2024-12-05,0.8,1.0,4.0,56.0,1.0,0.0,0.0,...,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,6.0
176098,75025337,202412,2024-12-21,1.0,1.0,3.0,68.0,0.0,0.0,1.0,...,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,7.0
176099,75025373,202412,2024-12-09,2.4,1.0,2.0,58.0,0.0,0.0,0.0,...,100.0,1.0,1.0,0.0,3.0,1.0,2.0,1.0,0,5.0


In [72]:
merged_df.loc[:,'total_correct'] = merged_df['userid'].map(
    merged_df.groupby('userid')[corrects].sum().sum(axis=1)
)

In [76]:
correct_counts_per_user

Unnamed: 0_level_0,num_lit_q1_correct,num_lit_q2_correct,num_lit_q3_correct,num_lit_q5_correct,num_lit_q6_correct,num_lit_q8_correct,num_lit_q9_correct
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
70057317,1.0,1.0,1.0,0.0,1.0,1.0,1.0
70057321,11.0,11.0,11.0,11.0,11.0,11.0,11.0
70057352,9.0,9.0,9.0,9.0,9.0,9.0,9.0
70057367,7.0,7.0,7.0,7.0,7.0,7.0,7.0
70057373,11.0,11.0,11.0,11.0,11.0,11.0,11.0
...,...,...,...,...,...,...,...
75025299,0.0,1.0,1.0,1.0,1.0,0.0,1.0
75025320,1.0,0.0,1.0,1.0,1.0,1.0,1.0
75025337,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75025373,1.0,0.0,1.0,1.0,0.0,1.0,1.0


In [80]:
merged_df['num_lit_q1_correct'].min()

0.0

In [87]:
corrects = [col for col in merged_df.columns if col.endswith('_correct')]

# Ensure values are 0 or 1
merged_df[corrects] = merged_df[corrects].clip(upper=1)


In [88]:
merged_df['total_correct'] = merged_df[corrects].sum(axis=1)


In [89]:
merged_df

Unnamed: 0,userid,wid,date,weight,female,educ,age,hispanic,black,couple,...,num_lit_q5,num_lit_q5_correct,num_lit_q6,num_lit_q6_correct,num_lit_q8,num_lit_q8_correct,num_lit_q9,num_lit_q9_correct,college,total_correct
35333,70057317,201504,2015-04-10,0.5,0.0,4.0,70.0,0.0,0.0,1.0,...,10.0,0.0,5.0,1.0,3.0,1.0,2.0,1.0,1,7.0
35334,70057321,201504,2015-04-05,0.5,1.0,4.0,72.0,0.0,0.0,0.0,...,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,8.0
35335,70057321,201505,2015-05-11,0.5,1.0,4.0,72.0,0.0,0.0,0.0,...,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,8.0
35336,70057321,201506,2015-06-20,0.4,1.0,4.0,72.0,0.0,0.0,0.0,...,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,8.0
35337,70057321,201507,2015-07-22,0.4,1.0,4.0,72.0,0.0,0.0,0.0,...,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176096,75025299,202412,2024-12-19,0.6,1.0,3.0,33.0,0.0,0.0,1.0,...,100.0,1.0,5.0,1.0,2.0,0.0,2.0,1.0,1,6.0
176097,75025320,202412,2024-12-05,0.8,1.0,4.0,56.0,1.0,0.0,0.0,...,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,7.0
176098,75025337,202412,2024-12-21,1.0,1.0,3.0,68.0,0.0,0.0,1.0,...,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,8.0
176099,75025373,202412,2024-12-09,2.4,1.0,2.0,58.0,0.0,0.0,0.0,...,100.0,1.0,1.0,0.0,3.0,1.0,2.0,1.0,0,6.0


In [95]:
# Group by user and pick the first total_correct per user (if multiple rows per user)
user_totals = merged_df.groupby('userid')['total_correct'].first()
print(user_totals.head())


userid
70057317    7.0
70057321    8.0
70057352    8.0
70057367    8.0
70057373    8.0
Name: total_correct, dtype: float64


In [97]:
counts = user_totals.value_counts().sort_index()

In [98]:
percentiles = counts / counts.sum() * 100
print(percentiles)

total_correct
0.0     0.141163
2.0     0.728402
3.0     2.405421
4.0     5.714286
5.0    10.519481
6.0    16.933936
7.0    27.402597
8.0    36.154715
Name: count, dtype: float64


In [102]:
mean = merged_df['total_correct'].mean()

In [103]:
merged_df['num_lit_high'] = (merged_df['total_correct'] >= mean).astype(int)

In [104]:
merged_df

Unnamed: 0,userid,wid,date,weight,female,educ,age,hispanic,black,couple,...,num_lit_q5_correct,num_lit_q6,num_lit_q6_correct,num_lit_q8,num_lit_q8_correct,num_lit_q9,num_lit_q9_correct,college,total_correct,num_lit_high
35333,70057317,201504,2015-04-10,0.5,0.0,4.0,70.0,0.0,0.0,1.0,...,0.0,5.0,1.0,3.0,1.0,2.0,1.0,1,7.0,1
35334,70057321,201504,2015-04-05,0.5,1.0,4.0,72.0,0.0,0.0,0.0,...,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,8.0,1
35335,70057321,201505,2015-05-11,0.5,1.0,4.0,72.0,0.0,0.0,0.0,...,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,8.0,1
35336,70057321,201506,2015-06-20,0.4,1.0,4.0,72.0,0.0,0.0,0.0,...,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,8.0,1
35337,70057321,201507,2015-07-22,0.4,1.0,4.0,72.0,0.0,0.0,0.0,...,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,8.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176096,75025299,202412,2024-12-19,0.6,1.0,3.0,33.0,0.0,0.0,1.0,...,1.0,5.0,1.0,2.0,0.0,2.0,1.0,1,6.0,0
176097,75025320,202412,2024-12-05,0.8,1.0,4.0,56.0,1.0,0.0,0.0,...,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,7.0,1
176098,75025337,202412,2024-12-21,1.0,1.0,3.0,68.0,0.0,0.0,1.0,...,1.0,5.0,1.0,3.0,1.0,2.0,1.0,1,8.0,1
176099,75025373,202412,2024-12-09,2.4,1.0,2.0,58.0,0.0,0.0,0.0,...,1.0,1.0,0.0,3.0,1.0,2.0,1.0,0,6.0,0


In [107]:
num_nunique_id = merged_df['userid'].nunique()
print('Number of unique user IDs:', num_nunique_id)

print(f'{len(merged_df)} is the total number of observations')

print(f'{len(merged_df['wid'].unique())} is the number of survey waves')

first_date = merged_df['date'].min()
last_date = merged_df['date'].max()

print("First date in dataset:", first_date)
print("Last date in dataset:", last_date)


Number of unique user IDs: 17710
137457 is the total number of observations
117 is the number of survey waves
First date in dataset: 2015-04-02 00:00:00
Last date in dataset: 2024-12-31 00:00:00
