# Access OneLake Files Directly (Without Azure ML)

This notebook accesses OneLake files **directly** using the Azure Storage SDK (`azure-storage-file-datalake`), with no Azure ML workspace required.

## 1. Install Required Packages

In [None]:
# Install the Azure Data Lake Storage SDK plus the auth (azure-identity), DataFrame (pandas),
# Parquet engine (pyarrow) and .env loader (python-dotenv) packages used by the cells below
%pip install azure-storage-file-datalake azure-identity pandas pyarrow python-dotenv

## 2. Load Credentials from .env File

In [None]:
import os
from dotenv import load_dotenv

# Load .env file. load_dotenv does not raise when the path is wrong, so the
# explicit validation below is what actually guards against a bad path.
env_path = r"your-environment-file-path"
load_dotenv(env_path)

# Get credentials (service principal + target Fabric workspace)
TENANT_ID = os.getenv("FABRIC_TENANT_ID")
CLIENT_ID = os.getenv("FABRIC_APP_ID")
CLIENT_SECRET = os.getenv("FABRIC_APP_SECRET")
WORKSPACE_ID = os.getenv("FABRIC_WORKSPACE_ID")

# Fail fast: without this check a missing or mistyped .env path would still
# print "Loaded credentials" and only surface later as an opaque auth error.
_missing_settings = [
    name
    for name, value in (
        ("FABRIC_TENANT_ID", TENANT_ID),
        ("FABRIC_APP_ID", CLIENT_ID),
        ("FABRIC_APP_SECRET", CLIENT_SECRET),
        ("FABRIC_WORKSPACE_ID", WORKSPACE_ID),
    )
    if not value
]
if _missing_settings:
    raise EnvironmentError(
        f"Missing required settings: {', '.join(_missing_settings)}. "
        f"Define them in the environment or in the .env file at {env_path!r}."
    )

# The client secret is deliberately never printed.
print(f"✅ Loaded credentials")
print(f"   Tenant: {TENANT_ID}")
print(f"   Client: {CLIENT_ID}")
print(f"   Workspace: {WORKSPACE_ID}")

## 3. Configure OneLake Connection

In [None]:
# OneLake connection settings: DFS endpoint and the lakehouse that holds the data
ONELAKE_ENDPOINT = "https://msit-onelake.dfs.fabric.microsoft.com"
LAKEHOUSE_ID = "b5607519-ec4b-4a83-ac2a-5443c8887e2a"

# Location of the file to read, relative to the lakehouse root
FILE_PATH = "Files/RawData/AddressData.csv"

# Echo the effective configuration so the reader can confirm it before connecting
print(
    f"OneLake Endpoint: {ONELAKE_ENDPOINT}",
    f"Lakehouse ID: {LAKEHOUSE_ID}",
    f"File Path: {FILE_PATH}",
    sep="\n",
)

## 4. Authenticate with Service Principal

In [None]:
from azure.identity import ClientSecretCredential
from azure.storage.filedatalake import DataLakeServiceClient

# Service-principal credential built from the values loaded from the environment
credential = ClientSecretCredential(TENANT_ID, CLIENT_ID, CLIENT_SECRET)

# Data Lake service client pointed at the OneLake endpoint
service_client = DataLakeServiceClient(ONELAKE_ENDPOINT, credential=credential)

print("✅ Created OneLake client")

## 5. Read CSV File from OneLake

In [None]:
import pandas as pd
from io import BytesIO

# The workspace is addressed as the file system; the file lives under the lakehouse ID
file_system_client = service_client.get_file_system_client(WORKSPACE_ID)
file_client = file_system_client.get_file_client(f"{LAKEHOUSE_ID}/{FILE_PATH}")

# Pull the complete file into memory as bytes
download = file_client.download_file()
file_content = download.readall()

# Parse the downloaded bytes as CSV
df = pd.read_csv(BytesIO(file_content))

# Summarise what was loaded, then show a rich preview of the first rows
print(
    f"✅ File loaded successfully!",
    f"   Shape: {df.shape}",
    f"   Columns: {list(df.columns)}",
    f"\nFirst 5 rows:",
    sep="\n",
)
display(df.head())

## 6. Helper Function: Read Any CSV File from OneLake

In [None]:
def read_onelake_csv(file_path: str) -> pd.DataFrame:
    """
    Download a CSV file from the lakehouse and parse it into a DataFrame.

    Uses the notebook-level ``file_system_client`` and ``LAKEHOUSE_ID``.
    The whole file is read into memory before parsing.

    Args:
        file_path: Path relative to lakehouse (e.g., "Files/data.csv")

    Returns:
        pandas DataFrame with the parsed CSV contents

    Raises:
        Exception: any download or parsing error is printed and then
            re-raised unchanged.
    """
    try:
        # Resolve the file under the lakehouse and fetch its raw bytes in one chain
        raw_bytes = (
            file_system_client
            .get_file_client(f"{LAKEHOUSE_ID}/{file_path}")
            .download_file()
            .readall()
        )

        frame = pd.read_csv(BytesIO(raw_bytes))

        print(f"✅ Loaded: {file_path}")
        print(f"   Shape: {frame.shape}")

        return frame

    except Exception as e:
        # Report which file failed, then let the caller see the original error
        print(f"❌ Error reading {file_path}: {str(e)}")
        raise

# Test the function against the file configured in FILE_PATH (the address data),
# rather than a separate hard-coded path that bypasses the configuration cell
df_address = read_onelake_csv(FILE_PATH)
display(df_address.head())

## 7. Read Parquet Files

In [None]:
def read_onelake_parquet(file_path: str) -> pd.DataFrame:
    """
    Download a Parquet file from the lakehouse and parse it into a DataFrame.

    Uses the notebook-level ``file_system_client`` and ``LAKEHOUSE_ID``.
    The whole file is read into memory before parsing.

    Args:
        file_path: Path relative to lakehouse (e.g., "Files/data.parquet")

    Returns:
        pandas DataFrame with the parsed Parquet contents

    Raises:
        Exception: any download or parsing error is printed and then
            re-raised unchanged.
    """
    try:
        remote_file = file_system_client.get_file_client(f"{LAKEHOUSE_ID}/{file_path}")

        # Wrap the downloaded bytes in a file-like object for the Parquet reader
        payload = BytesIO(remote_file.download_file().readall())
        table = pd.read_parquet(payload)

        print(f"✅ Loaded: {file_path}")
        print(f"   Shape: {table.shape}")

        return table

    except Exception as e:
        # Report which file failed, then let the caller see the original error
        print(f"❌ Error reading {file_path}: {str(e)}")
        raise

# Example usage (update with your actual file)
# df_parquet = read_onelake_parquet("Files/data.parquet")

## 8. Write Files to OneLake

In [None]:
def write_onelake_csv(df: pd.DataFrame, file_path: str) -> None:
    """
    Serialise a DataFrame to CSV (without the index) and upload it to the lakehouse.

    Uses the notebook-level ``file_system_client`` and ``LAKEHOUSE_ID``.
    The upload is issued with ``overwrite=True``.

    Args:
        df: pandas DataFrame
        file_path: Path relative to lakehouse (e.g., "Files/output.csv")

    Raises:
        Exception: any serialisation or upload error is printed and then
            re-raised unchanged.
    """
    try:
        # Render the frame as CSV bytes entirely in memory
        buffer = BytesIO()
        df.to_csv(buffer, index=False)
        payload = buffer.getvalue()

        # Resolve the destination under the lakehouse and push the bytes
        target = file_system_client.get_file_client(f"{LAKEHOUSE_ID}/{file_path}")
        target.upload_data(payload, overwrite=True)

        print(f"✅ Wrote: {file_path}")
        print(f"   Rows: {len(df)}")

    except Exception as e:
        # Report which file failed, then let the caller see the original error
        print(f"❌ Error writing {file_path}: {str(e)}")
        raise

# Example: build a three-row demo frame and upload it as CSV
sample_df = pd.DataFrame(
    dict(
        id=[1, 2, 3],
        name=['Alice', 'Bob', 'Charlie'],
        value=[100, 200, 300],
    )
)

write_onelake_csv(sample_df, "Files/output/sample_output.csv")
print("\n💡 File saved to: Files/output/sample_output.csv")

## 9. List Files in OneLake

In [None]:
def list_onelake_files(folder_path: str = "Files") -> list:
    """
    List the immediate children of a OneLake folder (non-recursive).

    Uses the notebook-level ``file_system_client`` and ``LAKEHOUSE_ID``.
    Each entry is printed as it is found. Every item returned by the
    service is included in the result.

    Args:
        folder_path: Folder path relative to lakehouse (e.g., "Files/RawData")

    Returns:
        List of item paths with the leading "<LAKEHOUSE_ID>/" removed

    Raises:
        Exception: any listing error is printed and then re-raised unchanged.
    """
    try:
        directory_client = file_system_client.get_directory_client(f"{LAKEHOUSE_ID}/{folder_path}")
        paths = directory_client.get_paths(recursive=False)

        lakehouse_prefix = f"{LAKEHOUSE_ID}/"
        files = []
        for path in paths:
            # Strip only a *leading* lakehouse ID. str.replace would also delete
            # any later occurrence of the same text inside a nested path.
            full_name = path.name
            if full_name.startswith(lakehouse_prefix):
                clean_path = full_name[len(lakehouse_prefix):]
            else:
                clean_path = full_name
            files.append(clean_path)
            print(f"  📄 {clean_path}")

        print(f"\n✅ Found {len(files)} items")
        return files

    except Exception as e:
        # Report the failure, then let the caller see the original error
        print(f"❌ Error listing files: {str(e)}")
        raise

# List the contents of the lakehouse "Files" folder and keep the returned paths in `files`
print("📁 Files in Files:\n")
files = list_onelake_files("Files/")

---

## Quick Reference

### Your Configuration
```python
ONELAKE_ENDPOINT = "https://msit-onelake.dfs.fabric.microsoft.com"
WORKSPACE_ID = "fb53fbfb-d8e9-4797-b2f5-ba80bb9a7388"  # From FABRIC_WORKSPACE_ID
LAKEHOUSE_ID = "b5607519-ec4b-4a83-ac2a-5443c8887e2a"
```

### Read Files
```python
# CSV
df = read_onelake_csv("Files/RawData/datafile.csv")

# Parquet
df = read_onelake_parquet("Files/data.parquet")
```

### Write Files
```python
write_onelake_csv(df, "Files/output/result.csv")
```

### List Files
```python
files = list_onelake_files("Files/RawData")
```

---

## Differences from Azure ML Notebook

| Azure ML Notebook | This Notebook (Direct Access) |
|-------------------|--------------------------------|
| `azureml://datastores/...` | Direct Azure Storage SDK |
| Requires Azure ML workspace | No Azure ML needed |
| Only works in Azure ML | Works anywhere |
| Uses datastore registration | Direct OneLake connection |

---

**✅ This notebook works in ANY environment - local, Fabric, or cloud!**