# Block 1: Setup and Imports

In [7]:
# Essential Imports
import pandas as pd
import numpy as np
import os
import zipfile
import random
from datetime import datetime, timedelta

In [None]:
# Install Kaggle and Ipeadata libraries if they haven't been installed in this session
# Uncomment the line below if running in a fresh environment
#%pip install kaggle ipeadatapy python-bcb

In [6]:
import ipeadatapy as ip
from bcb import sgs

In [None]:
# NOTE: Drive is already mounted via Colab's interface. If it is not, uncomment the line below.
#from google.colab import drive
# We skip the drive.mount() command here to avoid the Value Error.

# Define the base path to your project folder on Google Drive
# ATTENTION: Verify that this path leads to the folder containing 'dados_macro_brasil.csv'!
DRIVE_BASE_PATH = '/content/drive/MyDrive/Projeto-Credito' # <-- Adjust if your path is different!
MACRO_DATA_PATH = os.path.join(DRIVE_BASE_PATH, 'dados_macro_brasil.csv')

print("Setup complete. Drive path defined at:", DRIVE_BASE_PATH)

Collecting ipeadatapy
  Downloading ipeadatapy-0.1.9-py3-none-any.whl.metadata (297 bytes)
Collecting python-bcb
  Downloading python_bcb-0.3.3-py3-none-any.whl.metadata (2.5 kB)
Downloading ipeadatapy-0.1.9-py3-none-any.whl (10 kB)
Downloading python_bcb-0.3.3-py3-none-any.whl (19 kB)
Installing collected packages: python-bcb, ipeadatapy
Successfully installed ipeadatapy-0.1.9 python-bcb-0.3.3
Setup complete. Drive path defined at: /content/drive/MyDrive/Projeto-Credito


In [2]:
# Block 2.1: Generate and Save Placeholder CSV (RUN THIS FIRST)

# 1. Define the directory path (ensure the folder exists in your Google Drive)
DRIVE_BASE_PATH = '/content/drive/MyDrive/Project_01'
MACRO_DATA_PATH = os.path.join(DRIVE_BASE_PATH, 'brasil_macro_data.csv')

# 2. Create the directory if it doesn't exist (optional, but safer)
os.makedirs(DRIVE_BASE_PATH, exist_ok=True)
print(f"Directory check complete: {DRIVE_BASE_PATH}")

# 3. Create a simple, empty DataFrame with the expected columns/index for now
placeholder_data = {
    'date': pd.to_datetime(['2023-01-01', '2023-02-01']),
    'SELIC': [13.75, 13.65], 
    'IPCA': [5.79, 5.60] 
}
df_placeholder = pd.DataFrame(placeholder_data)
df_placeholder.set_index('date', inplace=True)

# 4. Save the placeholder DataFrame to the expected path
df_placeholder.to_csv(MACRO_DATA_PATH)

print(f"✅ Placeholder CSV created and saved at: {MACRO_DATA_PATH}")
print("You can now safely run the loading block (Block 2).")

Directory check complete: /content/drive/MyDrive/Project_01
✅ Placeholder CSV created and saved at: /content/drive/MyDrive/Project_01/brasil_macro_data.csv
You can now safely run the loading block (Block 2).


In [3]:
# Block 3: Kaggle Data Download and Unzip (Home Credit Default Risk)

# CRITICAL CORRECTION: Use the competition slug and command
KAGGLE_COMPETITION_SLUG = 'home-credit-default-risk' 

# Path Variables (Ensure DRIVE_BASE_PATH is correct, e.g., '/content/drive/MyDrive/Project_01')
KAGGLE_DIR = os.path.join(DRIVE_BASE_PATH, 'data/raw/kaggle') 
DATA_DIR_RAW = os.path.join(KAGGLE_DIR, KAGGLE_COMPETITION_SLUG)
ZIP_FILE = os.path.join(KAGGLE_DIR, KAGGLE_COMPETITION_SLUG + '.zip')

# 1. Create necessary directories
os.makedirs(KAGGLE_DIR, exist_ok=True)
os.makedirs(DATA_DIR_RAW, exist_ok=True)
print(f"Directories created/checked: {KAGGLE_DIR}")

# 2. Download the dataset using the Kaggle API 
print(f"Attempting to download competition dataset: {KAGGLE_COMPETITION_SLUG}")
try:
    # Use 'competitions download -c' syntax
    !kaggle competitions download -c {KAGGLE_COMPETITION_SLUG} -p {KAGGLE_DIR} --force
    print("✅ Kaggle download command executed.")

except Exception as e:
    print(f"❌ ERROR: Kaggle download command failed unexpectedly. Error: {e}")
    # Note: If the 403 error persists, the problem is your API key/rules acceptance, not the command syntax.


# 3. Unzip the downloaded file
print(f"\nUnzipping data from: {ZIP_FILE}")
try:
    with zipfile.ZipFile(ZIP_FILE, 'r') as zip_ref:
        zip_ref.extractall(DATA_DIR_RAW)
    print(f"✅ Data unzipped to: {DATA_DIR_RAW}")
    
    # Clean up the zip file
    os.remove(ZIP_FILE)
    print("Zip file removed.")

except FileNotFoundError:
    print(f"❌ ERROR: Zip file not found at {ZIP_FILE}. This means the download failed. Check API key/rules acceptance.")
except Exception as e:
    print(f"❌ ERROR during unzipping: {e}")

Directories created/checked: /content/drive/MyDrive/Project_01/data/raw/kaggle
Attempting to download competition dataset: home-credit-default-risk
Downloading home-credit-default-risk.zip to /content/drive/MyDrive/Project_01/data/raw/kaggle
100% 687M/688M [00:06<00:00, 96.1MB/s]
100% 688M/688M [00:06<00:00, 120MB/s] 
✅ Kaggle download command executed.

Unzipping data from: /content/drive/MyDrive/Project_01/data/raw/kaggle/home-credit-default-risk.zip
✅ Data unzipped to: /content/drive/MyDrive/Project_01/data/raw/kaggle/home-credit-default-risk
Zip file removed.


In [11]:
ip.list_series('IPCA')

Unnamed: 0,CODE,NAME
969,BM12_IPCA2012,IPCA - núcleo médias aparadas com suavização -...
970,BM12_IPCA20N12,IPCA - núcleo médias aparadas sem suavização ...
971,BM12_IPCACOM12,IPCA - preços livres - comercializáveis - taxa...
972,BM12_IPCAEXC12,IPCA - núcleo por exclusão - sem monitorados e...
973,BM12_IPCAEXCEX212,IPCA - núcleo por exclusão - EX1 - taxa de var...
974,BM12_IPCAEXP1212,Expectativa média de Inflação - IPCA - taxa ac...
975,BM12_IPCAEXP612,Expectativa média de Inflação - IPCA - taxa an...
976,BM12_IPCANCOM12,IPCA - preços livres - não comercializáveis - ...
977,BM12_IPCAPL12,IPCA - preços livres - taxa de variação
978,BM12_IPCAPLBD12,IPCA - preços livres - bens duráveis - taxa de...


In [12]:
ip.metadata('PAN12_IPCAG12')

Unnamed: 0,CODE,NAME,COMMENT,LAST UPDATE,BIG THEME,SOURCE ACRONYM,SOURCE,SOURCE URL,FREQUENCY,MEASURE,UNIT,SERIES STATUS,THEME CODE,COUNTRY,NUMERICA
0,PAN12_IPCAG12,Índice de Preços ao Consumidor Ampliado (IPCA),Índice Nacional de Preços ao Consumidor Amplo ...,2025-10-10T08:58:00.65-03:00,Macroeconômico,IBGE/SNIPC,Instituto Brasileiro de Geografia e Estatístic...,www.ibge.gov.br,Mensal,(% a.a.),,A,17,BRA,True


In [13]:
# Block 4: Macro Data Acquisition via API (User-Defined 10-Year Window)

from datetime import datetime, timedelta

# --- USER INPUT ---
# Define the end date for your SELIC data query (format YYYY-MM-DD)

USER_END_DATE_STR = '2018-12-31'
# ------------------

# 1. Dynamically calculate the 10-year window based on the user's end date
USER_END_DATE = datetime.strptime(USER_END_DATE_STR, '%Y-%m-%d')
# Calculate the start date (10 years earlier)
START_DATE = (USER_END_DATE - timedelta(days=365*10)).strftime('%Y-%m-%d')

# Set the end date for the BCB query
END_DATE = USER_END_DATE.strftime('%Y-%m-%d')

print(f"BCB SELIC Data Window: {START_DATE} to {END_DATE}")


# 1. Acquire SELIC Rate (from Banco Central do Brasil - BCB)
print("Acquiring SELIC data...")
try:
    # SGS Code 432: Annualized rate (last day of the period)
    
    df_selic = sgs.get({'SELIC': 432}, start=START_DATE, end=END_DATE)
    df_selic.index = df_selic.index.to_period('M') 
    print("✅ SELIC Data acquired successfully.")
except Exception as e:
    print(f"❌ ERROR acquiring SELIC data from BCB: {e}")
    df_selic = pd.DataFrame() 

# 2. Acquire IPCA (Brazilian Inflation Index - from Ipeadata)
# IPCA maintains its wide range (since 2000) for better history.
print("Acquiring IPCA data...")
try:
    # IPCA Code PAN12_IPCAG12 (12-month accumulated)
    df_ipca = ip.timeseries(
        'PAN12_IPCAG12',
        yearGreaterThan = START_DATE, 
        yearSmallerThan= END_DATE
    )
    df_ipca = df_ipca.rename(columns={'GMEC12_IPCA12': 'IPCA'})
    df_ipca.index = df_ipca.index.to_period('M') 
    df_ipca = df_ipca[['IPCA']] 
    print("✅ IPCA Data acquired successfully.")
except Exception as e:
    print(f"❌ ERROR acquiring IPCA data from Ipeadata: {e}")
    df_ipca = pd.DataFrame()

# 3. Merge the datasets, clean, and save
if not df_selic.empty and not df_ipca.empty:
    # Ensure both indices are monthly PeriodIndex and sorted before merging
    try:
        if not isinstance(df_selic.index, pd.PeriodIndex):
            df_selic.index = pd.to_datetime(df_selic.index).to_period('M')
        if not isinstance(df_ipca.index, pd.PeriodIndex):
            df_ipca.index = pd.to_datetime(df_ipca.index).to_period('M')
    except Exception:
        # Fallback: coerce via to_datetime then to_period
        df_selic.index = pd.to_datetime(df_selic.index, errors='coerce').to_period('M')
        df_ipca.index = pd.to_datetime(df_ipca.index, errors='coerce').to_period('M')
    
    df_selic = df_selic.sort_index()
    df_ipca = df_ipca.sort_index()
    
    # Use merge on indices (outer) to guarantee union of periods and avoid potential column-name collisions
    df_macro = pd.merge(df_ipca, df_selic, left_index=True, right_index=True, how='outer')
    # Fill forward then backfill any leading NaNs so we don't keep NaNs at the start
    df_macro = df_macro.sort_index().ffill().bfill()
    
    df_macro.to_csv(MACRO_DATA_PATH)
    
    print("\n✅ Final Macro Data Merged and Saved:")
    print(df_macro.head())
    print("\nMacro Data Shape:", df_macro.shape)
else:
    print("\n❌ Could not merge data due to empty datasets.")

BCB SELIC Data Window: 2009-01-02 to 2018-12-31
Acquiring SELIC data...
❌ ERROR acquiring SELIC data from BCB: Expected object or value
Acquiring IPCA data...
❌ ERROR acquiring SELIC data from BCB: Expected object or value
Acquiring IPCA data...
❌ ERROR acquiring IPCA data from Ipeadata: Invalid comparison between dtype=int32 and str

❌ Could not merge data due to empty datasets.
❌ ERROR acquiring IPCA data from Ipeadata: Invalid comparison between dtype=int32 and str

❌ Could not merge data due to empty datasets.


In [None]:
# Block 5: Load Micro Data and Simulate Time Variable

# Define the file path for the main training data
RAW_TRAIN_FILE = os.path.join(DATA_DIR_RAW, 'application_train.csv')

print(f"Attempting to load Micro Data from: {RAW_TRAIN_FILE}")

try:
    df_train = pd.read_csv(RAW_TRAIN_FILE)
    
    # 1. Simulate the Time Variable (Crucial for Macro Merge)
    # The Home Credit data is cross-sectional (no monthly date). 
    # We must assign a random monthly period to each client 
    # for the purpose of joining with the monthly macro data (df_macro).
    
    # Range of dates for simulation (e.g., last 5 years relative to the data creation)
    # The actual date of the dataset is not critical, only the relative time index.
    start_date = datetime(2013, 1, 1)
    end_date = datetime(2018, 5, 1) # Data was originally published around this time
    
    date_range = pd.period_range(start=start_date, end=end_date, freq='M')
    
    # Assign a random date from the range to each client
    df_train['TIME_INDEX'] = random.choices(date_range, k=len(df_train))
    df_train['TIME_INDEX'] = df_train['TIME_INDEX'].astype('object')
    
    # 2. Convert to Period Index for Joining
    df_train['TIME_INDEX'] = df_train['TIME_INDEX'].apply(lambda x: pd.Period(x, freq='M'))
    
    print("✅ Micro Data loaded and Time Index simulated successfully.")
    print(df_train[['SK_ID_CURR', 'TARGET', 'TIME_INDEX']].head())
    print("\nMicro Data Shape:", df_train.shape)

except FileNotFoundError:
    print(f"❌ ERROR: File not found at {RAW_TRAIN_FILE}. The Kaggle download or unzipping may have failed.")
except Exception as e:
    print(f"❌ An error occurred during micro data loading: {e}")

Attempting to load Micro Data from: /content/drive/MyDrive/Project_01/data/raw/kaggle/home-credit-default-risk/application_train.csv
✅ Micro Data loaded and Time Index simulated successfully.
   SK_ID_CURR  TARGET TIME_INDEX
0      100002       1    2015-01
1      100003       0    2015-05
2      100004       0    2015-03
3      100006       0    2017-01
4      100007       0    2016-06

Micro Data Shape: (307511, 123)


In [None]:
# Block 6: Load Macro Data and Prepare Time Variable

# The path to the macro data file was defined in Block 1 (e.g., .../Project_01/brasil_macro_data.csv)
print(f"Attempting to load Macro Data from: {MACRO_DATA_PATH}")

try:
    df_macro = pd.read_csv(MACRO_DATA_PATH)
    
    # 1. Convert the Date Column to Datetime
    # We assume the time column is named 'DATE' (or 'date', depending on the API output/saving convention)
    # Adjust 'DATE' if your CSV uses a different column name (e.g., 'date')
    if 'DATE' in df_macro.columns:
        df_macro['DATE'] = pd.to_datetime(df_macro['DATE'])
    elif 'date' in df_macro.columns:
        df_macro['DATE'] = pd.to_datetime(df_macro['date'])
        df_macro = df_macro.drop(columns=['date']) # Clean up the old column if it was renamed
    else:
        raise ValueError("Time column ('DATE' or 'date') not found in macro data.")


    # 2. Convert to Period Index (Monthly Frequency - 'M')
    # This conversion is fundamental for merging with df_train['TIME_INDEX']
    df_macro['TIME_INDEX'] = df_macro['DATE'].dt.to_period('M')
    
    # 3. Drop the original datetime column
    df_macro = df_macro.drop(columns=['DATE'])
    
    # Set TIME_INDEX as the index for easier joining (optional, but good practice)
    df_macro = df_macro.set_index('TIME_INDEX')

    # Display the result
    print("✅ Macro Data loaded and Time Index prepared successfully.")
    print(df_macro.head())
    print("\nMacro Data Shape:", df_macro.shape)

except FileNotFoundError:
    print(f"❌ ERROR: Macro file not found at {MACRO_DATA_PATH}. Check if the file is in your Google Drive.")
except Exception as e:
    print(f"❌ An error occurred during macro data loading or processing: {e}")