In [None]:
# INE Data Exploration

This notebook performs an exploratory analysis of the data provided by the Spanish National Statistics Institute (INE).

## Objectives
- Analyze the structure and quality of INE data
- Identify trends and patterns in key variables
- Detect potential data issues or inconsistencies
- Generate informative visualizations
- Propose necessary transformations for modeling


In [1]:
# Import libraries and utilities
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import sys

# Make the shared helper package importable. The membership check keeps
# re-running this cell from appending the same entry to sys.path repeatedly.
# NOTE(review): the entry is relative to the kernel's working directory —
# confirm the notebook is always launched from the expected folder.
if '../notebooks' not in sys.path:
    sys.path.append('../notebooks')

# Explicit names instead of `import *`, so it is clear where each helper used
# in the cells below comes from and nothing else leaks into the namespace.
from utils.exploratory_analysis import (
    analyze_dataset,
    load_dataset,
    plot_categorical_distributions,
    plot_correlation_matrix,
    plot_numeric_distributions,
)

# Visualization settings: default template and figure size for plotly express
px.defaults.template = "plotly_white"
px.defaults.width = 1000
px.defaults.height = 600

# Data directories
DATA_PATH = Path('../data/raw/data-INE')


ModuleNotFoundError: No module named 'polars'

In [None]:
## 1. Data Loading and Initial Inspection

First, we'll load the INE dataset and perform an initial inspection of its structure and contents.


In [None]:
# Build the path to the INE CSV under the raw-data directory and read it
file_path = DATA_PATH.joinpath('data_ine.csv')
df_ine = load_dataset(file_path)

# Only summarise when a dataset actually came back.
# NOTE(review): presumably load_dataset returns None on failure — confirm.
if df_ine is not None:
    print("Dataset loaded successfully!")
    # Initial analysis; the result is kept in `stats`
    stats = analyze_dataset(df_ine, "INE Data")


In [None]:
## 2. Distribution Analysis

### 2.1 Numeric Variables
Let's analyze the distribution of numeric variables through histograms and box plots.


In [None]:
# Numeric distributions visualization: build the figure only when data was
# loaded, and display it only when the helper hands back a truthy figure.
if df_ine is not None and (fig := plot_numeric_distributions(df_ine)):
    fig.show()


In [None]:
### 2.2 Categorical Variables
Now let's examine the distribution of categorical variables through bar plots.


In [None]:
if df_ine is not None:
    # Categorical distributions visualization: keep the returned collection
    # in `figs` and display each figure in the order it was returned.
    for fig in (figs := plot_categorical_distributions(df_ine)):
        fig.show()


In [None]:
## 3. Correlation Analysis

Let's analyze the correlations between numeric variables to identify important relationships.


In [None]:
# Correlation matrix: build the figure only when data was loaded, and
# display it only when the helper hands back a truthy figure.
if df_ine is not None and (fig := plot_correlation_matrix(df_ine)):
    fig.show()


In [None]:
## 4. Data Quality Analysis

Let's perform a detailed analysis of data quality issues.


In [None]:
if df_ine is not None:
    # Data-quality summary: share of nulls per column, number of duplicated
    # rows, and the min/max of every numeric column.
    n_rows = len(df_ine)

    # Detailed null analysis.
    # Series.null_count() gives a plain int per column, so the percentages are
    # ordinary floats. (DataFrame.to_dict() would map each column to a Series,
    # which cannot be used in `if pct > 0` or formatted with `:.2f`.)
    # An empty frame reports 0% rather than dividing by zero.
    null_percentages = {
        col: (df_ine[col].null_count() / n_rows * 100) if n_rows else 0.0
        for col in df_ine.columns
    }

    print("\nNull Value Percentages:")
    for col, pct in null_percentages.items():
        if pct > 0:
            print(f"- {col}: {pct:.2f}%")

    # Check for potential duplicates. is_duplicated() flags every row that has
    # an identical twin, so the first occurrence is counted as well.
    print(f"\nDuplicate Rows: {df_ine.is_duplicated().sum()}")

    # Value ranges for numeric columns.
    # The dtype's own is_numeric() check is used because
    # `pl.datatypes.is_numeric` is not part of the documented polars API.
    print("\nValue Ranges for Numeric Columns:")
    numeric_cols = [col for col, dtype in zip(df_ine.columns, df_ine.dtypes)
                    if dtype.is_numeric()]

    for col in numeric_cols:
        # Work on the eager Series directly: no .collect() (that is a
        # LazyFrame method), scalar min/max values for printing, and the
        # `stats` result from the loading cell is no longer overwritten.
        column = df_ine[col]
        print(f"- {col}: {column.min()} to {column.max()}")


In [None]:
## 5. Conclusions and Recommendations

### Data Quality Issues
- [To be filled after analysis]

### Key Variables and Relationships
- [To be filled after analysis]

### Required Transformations
- [To be filled after analysis]

### Next Steps
- [To be filled after analysis]
