# Read and merge all CSV files from `data` (only import pandas)

This notebook reads every CSV in `Term-paper/data` using only `pandas` as an explicit import,
lists files using the Jupyter/IPython shell, then merges them into a single DataFrame.
If two DataFrames share column names, they are outer-joined on all of the shared columns; otherwise they are concatenated side-by-side.

In [None]:
import pandas as pd

# Path to the data directory (relative to the repository root)
data_path = "Term-paper/data"

# List the directory through the IPython shell so that pandas stays the only
# explicit import (no os/pathlib/glob).
try:
    files = get_ipython().getoutput(f'ls "{data_path}"')
except Exception as e:
    # Typically a NameError when running outside IPython. Stay best-effort
    # (empty listing) but report the reason instead of hiding it.
    print('Could not list', data_path, ':', e)
    files = []

# Keep only CSV files (case-insensitive extension match)
csv_files = [f for f in files if isinstance(f, str) and f.lower().endswith('.csv')]

# Define the results up front so `dfs` and `merged_df` exist in the notebook
# namespace on every path, including when no files are found or none can be read.
dfs = []  # list of (filename, df)
merged_df = pd.DataFrame()

if not csv_files:
    print('No CSV files found in', data_path)
else:
    for fname in csv_files:
        full = f'{data_path}/{fname}'
        try:
            df = pd.read_csv(full)
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest
            print('Failed to read', full, ':', e)
        else:
            dfs.append((fname, df))
            print('Loaded', fname, '->', df.shape)

    if dfs:
        frames = [frame for _, frame in dfs]
        # Start from the first DataFrame and fold the others in, in listing order
        merged_df = frames[0]
        for df in frames[1:]:
            # Columns present on both sides become the join keys
            common = [c for c in merged_df.columns if c in df.columns]
            if common:
                # Outer join on all shared columns so no rows are dropped
                merged_df = pd.merge(merged_df, df, how='outer', on=common)
            else:
                # Nothing to join on: place the frames side-by-side by index
                merged_df = pd.concat([merged_df, df], axis=1)

    print('Final merged shape:', merged_df.shape)

No CSV files found in ../data


In [None]:
# Quick inspection: show the shape explicitly, then preview the first rows
print('merged_df shape:', merged_df.shape)

# Jupyter only auto-displays the value of a cell's last expression, so the
# preview has to be the final statement in order to be rendered.
merged_df.head()