In [1]:
# Block 1: Setup and Imports

# Install Kaggle and Ipeadata libraries if they haven't been installed in this session
!pip install kaggle ipeadatapy python-bcb

# Essential Imports
import pandas as pd
import numpy as np
import os
import zipfile
import random
from datetime import datetime, timedelta

# NOTE: Google Drive is expected to be mounted already through Colab's interface,
# so this cell does not call drive.mount().
# (The original author skipped the call to avoid a ValueError — presumably the one raised
# when mounting an already-mounted Drive; TODO confirm.)

# Base path of the project folder on Google Drive, and the macro-data CSV inside it.
# ATTENTION: Verify that this path leads to the folder containing 'dados_macro_brasil.csv'!
# NOTE(review): Block 2.1 further down reassigns both DRIVE_BASE_PATH and MACRO_DATA_PATH
# (to '.../Project_01' and 'brasil_macro_data.csv'), so once that cell has run the values
# defined here are no longer in effect — confirm which location is the intended one.
DRIVE_BASE_PATH = '/content/drive/MyDrive/Projeto-Credito' # <-- Adjust if your path is different!
MACRO_DATA_PATH = os.path.join(DRIVE_BASE_PATH, 'dados_macro_brasil.csv')

# Echo the configured base path so it can be checked at a glance.
print("Setup complete. Drive path defined at:", DRIVE_BASE_PATH)

Collecting ipeadatapy
  Downloading ipeadatapy-0.1.9-py3-none-any.whl.metadata (297 bytes)
Collecting python-bcb
  Downloading python_bcb-0.3.3-py3-none-any.whl.metadata (2.5 kB)
Downloading ipeadatapy-0.1.9-py3-none-any.whl (10 kB)
Downloading python_bcb-0.3.3-py3-none-any.whl (19 kB)
Installing collected packages: python-bcb, ipeadatapy
Successfully installed ipeadatapy-0.1.9 python-bcb-0.3.3
Setup complete. Drive path defined at: /content/drive/MyDrive/Projeto-Credito


In [2]:
# Block 2.1: Generate and Save Placeholder CSV (RUN THIS FIRST)
#
# Purpose: make sure a macro-data CSV exists at MACRO_DATA_PATH so the later loading
# step has something to read. The placeholder is a two-row table (SELIC, IPCA) indexed
# by date. An already existing file is never overwritten, so re-running this cell
# (e.g. Restart & Run All) cannot destroy real macro data stored at the same path.

# 1. Define the directory path (ensure the folder exists in your Google Drive)
# NOTE: these assignments replace the DRIVE_BASE_PATH / MACRO_DATA_PATH values set in Block 1.
DRIVE_BASE_PATH = '/content/drive/MyDrive/Project_01'
MACRO_DATA_PATH = os.path.join(DRIVE_BASE_PATH, 'brasil_macro_data.csv')

# 2. Create the directory if it doesn't exist (optional, but safer)
os.makedirs(DRIVE_BASE_PATH, exist_ok=True)
print(f"Directory check complete: {DRIVE_BASE_PATH}")

# 3. Build a small placeholder DataFrame (two monthly rows) with the expected columns/index
placeholder_data = {
    'date': pd.to_datetime(['2023-01-01', '2023-02-01']),
    'SELIC': [13.75, 13.65],
    'IPCA': [5.79, 5.60],
}
df_placeholder = pd.DataFrame(placeholder_data).set_index('date')

# 4. Save the placeholder only when no file is present at the target path
if os.path.exists(MACRO_DATA_PATH):
    print(f"ℹ️ Existing CSV found at: {MACRO_DATA_PATH}. Placeholder was NOT written (file left untouched).")
else:
    df_placeholder.to_csv(MACRO_DATA_PATH)
    print(f"✅ Placeholder CSV created and saved at: {MACRO_DATA_PATH}")

print("You can now safely run the loading block (Block 2).")

Directory check complete: /content/drive/MyDrive/Project_01
✅ Placeholder CSV created and saved at: /content/drive/MyDrive/Project_01/brasil_macro_data.csv
You can now safely run the loading block (Block 2).


In [3]:
# Block 3: Kaggle Data Download and Unzip (Home Credit Default Risk)

# CRITICAL CORRECTION: Use the competition slug and command
KAGGLE_COMPETITION_SLUG = 'home-credit-default-risk' 

# Path Variables (Ensure DRIVE_BASE_PATH is correct, e.g., '/content/drive/MyDrive/Project_01')
KAGGLE_DIR = os.path.join(DRIVE_BASE_PATH, 'data/raw/kaggle') 
DATA_DIR_RAW = os.path.join(KAGGLE_DIR, KAGGLE_COMPETITION_SLUG)
ZIP_FILE = os.path.join(KAGGLE_DIR, KAGGLE_COMPETITION_SLUG + '.zip')

# 1. Create necessary directories
os.makedirs(KAGGLE_DIR, exist_ok=True)
os.makedirs(DATA_DIR_RAW, exist_ok=True)
print(f"Directories created/checked: {KAGGLE_DIR}")

# 2. Download the dataset using the Kaggle API 
print(f"Attempting to download competition dataset: {KAGGLE_COMPETITION_SLUG}")
try:
    # Use 'competitions download -c' syntax
    !kaggle competitions download -c {KAGGLE_COMPETITION_SLUG} -p {KAGGLE_DIR} --force
    print("✅ Kaggle download command executed.")

except Exception as e:
    print(f"❌ ERROR: Kaggle download command failed unexpectedly. Error: {e}")
    # Note: If the 403 error persists, the problem is your API key/rules acceptance, not the command syntax.


# 3. Unzip the downloaded file
print(f"\nUnzipping data from: {ZIP_FILE}")
try:
    with zipfile.ZipFile(ZIP_FILE, 'r') as zip_ref:
        zip_ref.extractall(DATA_DIR_RAW)
    print(f"✅ Data unzipped to: {DATA_DIR_RAW}")
    
    # Clean up the zip file
    os.remove(ZIP_FILE)
    print("Zip file removed.")

except FileNotFoundError:
    print(f"❌ ERROR: Zip file not found at {ZIP_FILE}. This means the download failed. Check API key/rules acceptance.")
except Exception as e:
    print(f"❌ ERROR during unzipping: {e}")

Directories created/checked: /content/drive/MyDrive/Project_01/data/raw/kaggle
Attempting to download competition dataset: home-credit-default-risk
Downloading home-credit-default-risk.zip to /content/drive/MyDrive/Project_01/data/raw/kaggle
100% 687M/688M [00:06<00:00, 96.1MB/s]
100% 688M/688M [00:06<00:00, 120MB/s] 
✅ Kaggle download command executed.

Unzipping data from: /content/drive/MyDrive/Project_01/data/raw/kaggle/home-credit-default-risk.zip
✅ Data unzipped to: /content/drive/MyDrive/Project_01/data/raw/kaggle/home-credit-default-risk
Zip file removed.


In [4]:
# Block 4: Load Micro Data and Simulate Time Variable
#
# Purpose: read application_train.csv from DATA_DIR_RAW into df_train and attach a
# simulated monthly period column (TIME_INDEX) to every row, so the table can later be
# joined with monthly macro data. The simulation is seeded, so every run of this cell
# produces the same TIME_INDEX values.

# Define the file path for the main training data
RAW_TRAIN_FILE = os.path.join(DATA_DIR_RAW, 'application_train.csv')

# Seed for the simulated time index; change it to obtain a different (still reproducible) draw.
TIME_INDEX_SEED = 42

print(f"Attempting to load Micro Data from: {RAW_TRAIN_FILE}")

try:
    df_train = pd.read_csv(RAW_TRAIN_FILE)

    # 1. Simulate the Time Variable (Crucial for Macro Merge)
    # The Home Credit data is cross-sectional (no monthly date).
    # A random monthly period is assigned to each client
    # for the purpose of joining with the monthly macro data (df_macro).

    # Range of months to sample from (inclusive on both ends).
    # The actual date of the dataset is not critical, only the relative time index.
    start_date = datetime(2013, 1, 1)
    end_date = datetime(2018, 5, 1) # Data was originally published around this time

    date_range = pd.period_range(start=start_date, end=end_date, freq='M')

    # 2. Draw one month per row, uniformly with replacement, in a single vectorised step.
    # Sampling integer positions and taking them from the PeriodIndex yields monthly
    # Period values directly, so no object round-trip or per-row pd.Period() is needed.
    rng = np.random.default_rng(TIME_INDEX_SEED)
    sampled_positions = rng.integers(0, len(date_range), size=len(df_train))
    df_train['TIME_INDEX'] = date_range.take(sampled_positions)

    print("✅ Micro Data loaded and Time Index simulated successfully.")
    print(df_train[['SK_ID_CURR', 'TARGET', 'TIME_INDEX']].head())
    print("\nMicro Data Shape:", df_train.shape)

except FileNotFoundError:
    print(f"❌ ERROR: File not found at {RAW_TRAIN_FILE}. The Kaggle download or unzipping may have failed.")
except Exception as e:
    print(f"❌ An error occurred during micro data loading: {e}")

Attempting to load Micro Data from: /content/drive/MyDrive/Project_01/data/raw/kaggle/home-credit-default-risk/application_train.csv
✅ Micro Data loaded and Time Index simulated successfully.
   SK_ID_CURR  TARGET TIME_INDEX
0      100002       1    2015-01
1      100003       0    2015-05
2      100004       0    2015-03
3      100006       0    2017-01
4      100007       0    2016-06

Micro Data Shape: (307511, 123)
