In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [7]:
# Read only the header row of chunk_0.csv (nrows=0) to get the column names
# without loading any data rows.
header_only = pd.read_csv("chunk_0.csv", nrows=0)
columns = header_only.columns.tolist()
# The recorded run prints 3953; an earlier note here expected 1818.
print(len(columns))


3953


In [4]:
from collections import defaultdict
import random  # Import the random module

# Collect up to `sample_count` non-null example values for every column,
# walking the first `num_chunks` chunk files until each column has enough,
# then write one "<column>:\t<samples>" line per column to column_samples.txt.
SAMPLE_SEED = 42  # fixed seed so a re-run writes the same samples
sample_count = 3
num_chunks = 3  # You can increase this if some columns don't fill up

# Dedicated generator: reproducible draws that do not disturb the global
# `random` state used elsewhere in the session.
sampler = random.Random(SAMPLE_SEED)
column_samples = defaultdict(list)

for i in range(num_chunks):
    chunk_path = f"chunk_{i}.csv"
    print(f"Processing {chunk_path}...")
    chunk = pd.read_csv(chunk_path, low_memory=False)

    for col in columns:
        # How many more examples this column still needs
        needed = sample_count - len(column_samples[col])
        if needed <= 0:
            continue
        values = chunk[col].dropna().tolist()
        if values:
            # Never ask for more items than the chunk can provide
            sampled = sampler.sample(values, min(needed, len(values)))
            column_samples[col].extend(sampled)

with open("column_samples.txt", "w", encoding="utf-8") as f:
    for col in columns:
        samples = column_samples[col]
        f.write(f"{col}:\t{samples}\n")

print("Done! Sample values saved to column_samples.txt ✅")

Processing chunk_0.csv...
Processing chunk_1.csv...
Processing chunk_2.csv...
Done! Sample values saved to column_samples.txt ✅


In [None]:
# Columns to keep from the raw chunks.
# One logical group per line; every line ends with a comma so that adjacent
# string literals are never silently concatenated into a single bogus name.
# NOTE(review): "country_id" and "v2elpubfin" follow the spellings used by the
# cleaning cell further down, which selected them from the chunks successfully.
cols_of_interest = [
    "country_name", "country_id", "year",
    "v2x_polyarchy", "v2x_libdem", "v2x_partipdem", "v2x_delibdem", "v2x_egaldem",
    "v2elcomvot", "v2elsuffrage", "v2elpubfin", "v2elintim", "v2elaccept", "v2eltrnout",
    "v2elreggov", "v2elrgpwr", "v2ellocgov", "v2ellocpwr",
    "v2ddyrci", "v2ddyrrf", "v2ddcredal", "v2csprtcpt",
    "v2mecenefm", "v2xed_ed_inpt", "e_gdppc",
]  # and so on

# Subset of cols_of_interest treated as categorical.
categorical_cols = [
    "country_name", "country_id",
    "v2elcomvot", "v2elpubfin", "v2elintim", "v2elaccept",
    "v2elreggov", "v2elrgpwr", "v2ellocgov", "v2ellocpwr",
    "v2ddcredal", "v2csprtcpt", "v2mecenefm",
]

In [10]:
# start cleaning the data
import glob

# Rows with a year earlier than this are dropped from every chunk
MIN_YEAR = 1980

# List of your selected columns
cols_of_interest = ["country_name", "country_id", "year",
                    "v2x_polyarchy", "v2x_libdem", "v2x_partipdem", "v2x_delibdem", "v2x_egaldem",
                    "v2elcomvot", "v2elsuffrage", "v2elpubfin", "v2elintim", "v2elaccept", "v2eltrnout",
                    "v2elreggov", "v2elrgpwr", "v2ellocgov", "v2ellocpwr", "v2ddyrci", "v2ddyrrf", "v2ddcredal", "v2csprtcpt",
                    "v2mecenefm"
                    ]  # and so on

categorical_cols = ["country_name", "country_id", "v2elcomvot", "v2elpubfin", "v2elintim", "v2elaccept", "v2elreggov", "v2elrgpwr", "v2ellocgov", "v2ellocpwr",
                    "v2ddcredal", "v2csprtcpt", "v2mecenefm"]

# glob returns matches in arbitrary filesystem order; sorting the paths
# (lexicographically) makes the row order of the output file reproducible.
chunk_paths = sorted(glob.glob("chunk_*.csv"))
if not chunk_paths:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError("No files matching 'chunk_*.csv' in the working directory")

# Container for cleaned data
filtered_chunks = []

# Go through each chunk: keep only rows with year >= MIN_YEAR and only the
# columns in cols_of_interest.
for chunk_path in chunk_paths:
    print(f"Processing {chunk_path}...")
    # usecols: parse just the selected columns rather than the whole wide file
    chunk = pd.read_csv(chunk_path, usecols=cols_of_interest, low_memory=False)
    # usecols does not control column order, so re-select to enforce it
    chunk = chunk.loc[chunk["year"] >= MIN_YEAR, cols_of_interest]
    filtered_chunks.append(chunk)

# Concatenate all filtered chunks into a single DataFrame with a fresh
# 0..n-1 index (each chunk's own index restarts, which would leave duplicates)
filtered_data = pd.concat(filtered_chunks, ignore_index=True)

# Save the cleaned data to a new CSV file
filtered_data.to_csv("filtered_data.csv", index=False)



Processing chunk_1.csv...
Processing chunk_19.csv...
Processing chunk_2.csv...
Processing chunk_4.csv...
Processing chunk_12.csv...
Processing chunk_5.csv...
Processing chunk_17.csv...
Processing chunk_16.csv...
Processing chunk_8.csv...
Processing chunk_9.csv...
Processing chunk_3.csv...
Processing chunk_7.csv...
Processing chunk_6.csv...
Processing chunk_0.csv...
Processing chunk_10.csv...
Processing chunk_18.csv...
Processing chunk_11.csv...
Processing chunk_13.csv...
Processing chunk_15.csv...
Processing chunk_14.csv...


In [13]:
# Split the selected columns by role. The two grouping keys are never
# aggregated, so they are removed from both the categorical and numeric lists.
_group_keys = ["country_name", "year"]
categorical_cols = [name for name in categorical_cols if name not in _group_keys]
# Everything that is neither categorical nor a grouping key is numeric
_not_numeric = categorical_cols + _group_keys
numeric_cols = [name for name in cols_of_interest if name not in _not_numeric]
# Group and aggregate
# Define aggregation functions
def mode_or_nan(series):
    """Return the most frequent value in ``series``, or ``pd.NA`` if there is none.

    ``Series.mode()`` ignores missing values and returns every tied value in
    sorted order, so the first entry is taken. An empty result (for example a
    series that is empty or entirely missing) yields ``pd.NA``.
    """
    modes = series.mode()
    if len(modes) == 0:
        return pd.NA
    return modes.iloc[0]

# Aggregation plan: numeric columns are averaged, categorical columns take
# their most frequent value via mode_or_nan.
# DO NOT include 'year' or 'country_name' in agg_funcs
agg_funcs = {
    **{col: "mean" for col in numeric_cols},
    **{col: mode_or_nan for col in categorical_cols},
}

# One output row per (year, country_name) pair; reset_index turns the group
# keys back into ordinary columns.
per_country_year = filtered_data.groupby(["year", "country_name"])
grouped = per_country_year.agg(agg_funcs).reset_index()

# Save to file
grouped.to_csv("grouped_data.csv", index=False)
print("Saved to grouped_data.csv ✅")

Saved to grouped_data.csv ✅
