# Read and merge all CSV files from `data` (only import pandas)

This notebook reads every CSV in `Term-paper/data` using only `pandas` as an explicit import,
lists files using the Jupyter/IPython shell, then merges them into a single DataFrame.
If DataFrames share column names the merge uses those columns; otherwise DataFrames are concatenated side-by-side.

In [13]:
import pandas as pd

# Path to the data directory (relative to the repository root)
# Use relative path to data directory
data_path = 'data'  # Since we're already in the Term-paper folder

# List files using the IPython shell
files = get_ipython().getoutput(f'ls {data_path}/*.csv')

# List files using the IPython shell so we don't import os/pathlib/glob explicitly
files = []
try:
    # List the files in the data_path directory
    files = get_ipython().getoutput(f'ls "{data_path}"')
except Exception:
    # If running outside IPython this will fail; keep files empty so we don't error out
    files = []

# Filter CSV files
csv_files = [f for f in files if isinstance(f, str) and f.lower().endswith('.csv')]

if not csv_files:
    print('No CSV files found in', data_path)
else:
    dfs = []  # list of (filename, df)
    for fname in csv_files:
        full = f'{data_path}/{fname}'
        try:
            df = pd.read_csv(full, sep=';')
            dfs.append((fname, df))
            print('Loaded', fname, '->', df.shape)
        except Exception as e:
            print('Failed to read', full, ':', e)

    # If nothing successfully read, create empty merged_df
    if not dfs:
        merged_df = pd.DataFrame()
    else:
        # Start with first DataFrame
        merged_df = dfs[0][1]
        for name, df in dfs[1:]:
            # find common columns to merge on
            common = [c for c in merged_df.columns if c in df.columns]
            if common:
                # merge on all common columns (outer join to keep data)
                merged_df = pd.merge(merged_df, df, how='outer', on=common)
            else:
                # no common columns: concatenate side-by-side (columns may be duplicated)
                merged_df = pd.concat([merged_df, df], axis=1)

    print('Final merged shape:', merged_df.shape)

# keep merged_df available in the notebook namespace even if no files were found
try:
    merged_df
except NameError:
    merged_df = pd.DataFrame()

Loaded CPI.csv -> (944, 1)
Loaded SCE-Apr-2014.csv -> (1311, 29)
Loaded SCE-Apr-2015.csv -> (1283, 29)
Loaded SCE-Apr-2016.csv -> (1214, 29)
Loaded SCE-Apr-2017.csv -> (1276, 29)
Loaded SCE-Apr-2018.csv -> (1300, 29)
Loaded SCE-Apr-2019.csv -> (1336, 29)
Loaded SCE-Apr-2020.csv -> (1300, 29)
Loaded SCE-Apr-2021.csv -> (1243, 29)
Loaded SCE-Apr-2022.csv -> (1269, 29)
Loaded SCE-Apr-2023.csv -> (1255, 29)
Loaded SCE-Apr-2024.csv -> (1082, 29)
Loaded SCE-Aug-2013.csv -> (1769, 29)
Loaded SCE-Aug-2014.csv -> (1352, 29)
Loaded SCE-Aug-2015.csv -> (1226, 29)
Loaded SCE-Aug-2016.csv -> (1271, 29)
Loaded SCE-Aug-2017.csv -> (1344, 29)
Loaded SCE-Aug-2018.csv -> (1331, 29)
Loaded SCE-Aug-2019.csv -> (1290, 29)
Loaded SCE-Aug-2020.csv -> (1193, 29)
Loaded SCE-Aug-2021.csv -> (1265, 29)
Loaded SCE-Aug-2022.csv -> (1300, 29)
Loaded SCE-Aug-2023.csv -> (1130, 29)
Loaded SCE-Aug-2024.csv -> (1108, 29)
Loaded SCE-Dec-2013.csv -> (1350, 29)
Loaded SCE-Dec-2014.csv -> (1302, 29)
Loaded SCE-Dec-2015.csv

In [14]:
merged_df

Unnamed: 0,"DATE,CPI",userid,wid,date,weight,female,educ,age,hispanic,black,...,num_lit_q3,num_lit_q3_correct,num_lit_q5,num_lit_q5_correct,num_lit_q6,num_lit_q6_correct,num_lit_q8,num_lit_q8_correct,num_lit_q9,num_lit_q9_correct
0,,70000220,201306,2013-06-04,16.3,1.0,3.0,28.0,0.0,1.0,...,100.0,0.0,100.0,1.0,5.0,1.0,,,,
1,,70000224,201306,2013-06-03,0.2,0.0,4.0,65.0,0.0,0.0,...,10.0,1.0,100.0,1.0,5.0,1.0,,,,
2,,70000234,201306,2013-06-17,4.1,1.0,3.0,41.0,0.0,0.0,...,10.0,1.0,100.0,1.0,5.0,1.0,,,,
3,,70000238,201306,2013-06-13,3.0,0.0,3.0,74.0,0.0,0.0,...,10.0,1.0,1.0,0.0,5.0,1.0,,,,
4,,70000238,201307,2013-07-10,1.9,0.0,3.0,74.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176096,,75025299,202412,2024-12-19,0.6,1.0,3.0,33.0,0.0,0.0,...,10.0,1.0,100.0,1.0,5.0,1.0,2.0,0.0,2.0,1.0
176097,,75025320,202412,2024-12-05,0.8,1.0,4.0,56.0,1.0,0.0,...,10.0,1.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0
176098,,75025337,202412,2024-12-21,1.0,1.0,3.0,68.0,0.0,0.0,...,10.0,1.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0
176099,,75025373,202412,2024-12-09,2.4,1.0,2.0,58.0,0.0,0.0,...,10.0,1.0,100.0,1.0,1.0,0.0,3.0,1.0,2.0,1.0


In [15]:
# Quick inspection
merged_df.head()

# Show shape explicitly
print('merged_df shape:', merged_df.shape)

merged_df shape: (176101, 30)
