In [None]:
# Setup imports
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Plot styling. `set_theme` is seaborn's preferred entry point; `sns.set` is
# only an alias for it and is documented as a candidate for removal.
sns.set_theme(style='whitegrid')

# Raw input files are looked up relative to the kernel's working directory,
# which is printed below so a wrong launch location is easy to spot.
DATA_DIR = Path('data/raw')
print('Notebook running from:', Path.cwd())

In [None]:
# Find Excel files in data/raw.
# - sorted(): directory listing order is not guaranteed, and a later cell loads
#   excel_files[0], so sorting makes the chosen file reproducible across runs.
# - '~$' filter: Excel drops '~$<name>.xlsx' owner/lock files next to an open
#   workbook; they match the glob but are not readable workbooks.
excel_files = sorted(
    candidate
    for candidate in DATA_DIR.glob('*.xls*')
    if not candidate.name.startswith('~$')
)
print(f'Found {len(excel_files)} Excel file(s) in {DATA_DIR}')
for p in excel_files:
    print('-', p.name)

# Fail early with an actionable message instead of an IndexError further down.
if not excel_files:
    raise FileNotFoundError(f'No Excel files found in {DATA_DIR}; please add your training file to data/raw')

In [None]:
# Load the first Excel file, installing openpyxl on demand if the read fails.
file_path = excel_files[0]
try:
    df = pd.read_excel(file_path)
except (ImportError, ValueError) as e:
    # pandas reports a missing optional Excel engine as ImportError, so that
    # type must be caught for this fallback to trigger at all. ValueError is
    # kept because the original cell already treated it as an engine problem.
    print('Read error, attempting to ensure openpyxl is installed:', e)
    import subprocess  # only needed on this fallback path
    # sys.executable targets the interpreter running this kernel, so the
    # package lands in the environment the notebook actually uses.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'openpyxl'])
    # Retry once; a second failure propagates so the real cause is visible.
    df = pd.read_excel(file_path)

print('Loaded:', file_path.name)
print('Rows, cols:', df.shape)

In [None]:
# Quick peek
display(df.head())

# Basic info
print('\n-- info --')
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in display() would render a stray "None".
df.info()

print('\n-- describe (numeric) --')
# describe(include=[np.number]) raises when no column is numeric, so check first.
if df.select_dtypes(include=[np.number]).shape[1] > 0:
    display(df.describe(include=[np.number]).T)
else:
    print('No numeric columns to describe')

In [None]:
# Missing values summary: NaN count per column, largest first,
# showing only the columns that actually have gaps.
na_counts = df.isna().sum()
na_counts = na_counts.sort_values(ascending=False)
display(na_counts.loc[na_counts.gt(0)])

# Column datatypes
display(df.dtypes)

In [None]:
# Try to detect a datetime column and set it as the index if present.
# NOTE: this cell rebinds `df` (sorted, time column moved into the index), so
# re-running it without re-running the load cell starts from the indexed frame.

# 1) Name-based match. Labels go through str() because a column label is not
#    necessarily a string (e.g. a numeric header), and `.lower()` would raise.
time_cols = [c for c in df.columns if str(c).lower() in ('timestamp', 'datetime', 'date', 'time')]
if not time_cols:
    # 2) Heuristic: object columns whose first non-null values parse as datetimes.
    for c in df.select_dtypes(include=['object']).columns:
        sample = df[c].dropna().iloc[:50]
        if sample.empty:
            # An all-NaN column "parses" trivially; that is no evidence of a time column.
            continue
        try:
            pd.to_datetime(sample)
        except Exception:
            # Best-effort probe: any parse failure just means "not a time column".
            continue
        time_cols.append(c)

print('Detected time columns:', time_cols)

if time_cols:
    tcol = time_cols[0]
    # errors='coerce' turns unparseable entries into NaT instead of raising.
    df[tcol] = pd.to_datetime(df[tcol], errors='coerce')
    df = df.sort_values(tcol).set_index(tcol)
    display(df.head())
    # simple time series plot for (at most) the first three numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()[:3]
    if numeric_cols:
        # Title the Axes returned by the plot call rather than relying on
        # pyplot's notion of the "current" axes.
        ax = df[numeric_cols].plot(subplots=False, figsize=(12,4))
        ax.set_title('First numeric columns over time')
else:
    print('No time column detected; skipping time-series plots')

In [None]:
# Target/feature check: plot the target's distribution when the column exists,
# otherwise list the available columns to help spot a naming mismatch.
possible_target = 'energy_consumption'
if possible_target in df.columns:
    # histplot returns the Axes it drew on; title that Axes directly.
    ax = sns.histplot(df[possible_target].dropna(), kde=True)
    ax.set_title(possible_target)
else:
    # Build the message from the variable so it stays correct if the target name changes.
    print(f'Target column `{possible_target}` not found. Available columns:')
    # Cap the listing at 30 labels to keep the output short on wide frames.
    print(list(df.columns)[:30])

## Next steps
- Clean the data and handle missing values, then save the result under `data/processed`.
- Use `src/features/build_features.py` for deterministic transforms.
- Run `src/models/train.py` with `configs/default.yaml` after preparing the processed dataset.