# ETL Pipeline 

This notebook contains the Extract, Transform, Load (ETL) pipeline for the finance dataset.

## Pipeline Steps:
1. **Extract**: Load data from CSV file
2. **Transform**: Clean data and add enrichment features
3. **Load**: Save processed data to output file



## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os

## Data Extraction

In [2]:
def extract_data(file_path):
    """Read a CSV file into a DataFrame.

    On success the frame's shape and column names are printed and the frame
    is returned. Any exception raised along the way is caught and reported
    on stdout, and ``None`` is returned so callers can test for failure.
    """
    try:
        frame = pd.read_csv(file_path)
        print(f"Data extracted successfully. Shape: {frame.shape}")
        column_names = frame.columns.tolist()
        print("Columns:", column_names)
    except Exception as err:
        # Best-effort: report the problem and signal failure with None.
        print(f"Error extracting data: {err}")
        frame = None
    return frame

## Data Cleaning

In [3]:
def clean_data(df):
    """Clean the dataset: fill missing values, parse dates, drop revenue outliers.

    The input DataFrame is not modified; all work happens on a copy.

    Steps:
        1. Missing values in each numeric column are replaced with that
           column's median; missing ``Company`` values become ``'Unknown'``.
        2. ``Date`` is parsed with ``pd.to_datetime``; values that cannot be
           parsed become ``NaT`` (the rows are kept).
        3. Rows whose ``Revenue_Millions`` lies outside
           ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are removed.

    Args:
        df: DataFrame with the columns ``Company``, ``Date``, ``Stock_Price``,
            ``Revenue_Millions``, ``Net_Income_Millions``,
            ``Market_Cap_Billions`` and ``EPS``.

    Returns:
        A new, independent DataFrame holding the cleaned rows.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Copy first: the fills and the date conversion below would otherwise
    # overwrite the caller's data in place.
    df = df.copy()

    # Handle missing values
    numeric_cols = ['Stock_Price', 'Revenue_Millions', 'Net_Income_Millions', 'Market_Cap_Billions', 'EPS']
    for col in numeric_cols:
        df[col] = df[col].fillna(df[col].median())
    df['Company'] = df['Company'].fillna('Unknown')

    # Standardize date format (unparseable entries are coerced to NaT)
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

    # Remove outliers for Revenue_Millions using the 1.5 * IQR fences
    q1 = df['Revenue_Millions'].quantile(0.25)
    q3 = df['Revenue_Millions'].quantile(0.75)
    iqr = q3 - q1
    within_fences = df['Revenue_Millions'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    # .copy() detaches the result from the filtered selection, so callers can
    # add columns to it without triggering SettingWithCopyWarning.
    df = df[within_fences].copy()

    print(f"Data cleaned. Shape after cleaning: {df.shape}")
    return df

## Data Enrichment

In [4]:
def enrich_data(df):
    """Add calculated fields: Profit_Margin, Revenue_Growth.

    The input DataFrame is not modified; the new columns are added to the
    sorted copy that is returned.

    Added columns:
        Profit_Margin: ``Net_Income_Millions / Revenue_Millions * 100``.
            Rows with zero revenue get ``NaN`` instead of ``inf``, since the
            margin is undefined there.
        Revenue_Growth: percentage change of ``Revenue_Millions`` from the
            previous row of the same ``Company`` (rows ordered by ``Date``).
            The first row of each company has no predecessor and is set to 0.

    Args:
        df: DataFrame with the columns ``Company``, ``Date``,
            ``Revenue_Millions`` and ``Net_Income_Millions``.

    Returns:
        A new DataFrame sorted by ``Company`` then ``Date`` with the two
        columns added.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # sort_values returns a new frame; adding columns only after this point
    # keeps the caller's DataFrame untouched.
    df = df.sort_values(['Company', 'Date'])

    # Calculate profit margin; mask zero revenue so the division yields NaN
    # rather than +/-inf.
    revenue = df['Revenue_Millions']
    df['Profit_Margin'] = (df['Net_Income_Millions'] / revenue.where(revenue != 0)) * 100

    # Calculate revenue growth (percentage change between consecutive rows
    # of the same company).
    # NOTE: growth is still infinite when the previous row's revenue is 0.
    df['Revenue_Growth'] = df.groupby('Company')['Revenue_Millions'].pct_change() * 100
    df['Revenue_Growth'] = df['Revenue_Growth'].fillna(0)

    print("Data enriched with Profit_Margin and Revenue_Growth.")
    return df

## Load Data

In [5]:
def load_data(df, output_path):
    """Write the DataFrame to ``output_path`` as CSV, without the index column.

    The parent directory of ``output_path`` is created first when the path
    has one; a bare filename is written relative to the working directory.
    A confirmation line is printed after the file has been written.
    """
    parent_dir, _ = os.path.split(output_path)
    if parent_dir:
        # exist_ok=True makes this a no-op when the directory already exists.
        os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(output_path, index=False)
    print(f"Data saved to {output_path}")

In [6]:
# Main ETL pipeline
def run_etl(input_path, output_path):
    """Run extract -> clean -> enrich -> load in one call.

    Reads ``input_path`` with ``extract_data``. If extraction fails (it
    returns ``None``), a failure message is printed and nothing is written.
    Otherwise the data is passed through ``clean_data`` and ``enrich_data``
    and saved to ``output_path`` with ``load_data``.
    """
    df = extract_data(input_path)
    if df is None:
        print("ETL pipeline failed due to extraction error.")
        return

    cleaned = clean_data(df)
    enriched = enrich_data(cleaned)
    load_data(enriched, output_path)

## Run ETL Pipeline and View Results

In [7]:
# Step 1: Load and examine the original data
print("=== ORIGINAL DATA ===")
original_data = extract_data('finance_dataset.csv')

# extract_data returns None on failure; the overview is only printed when a
# frame was actually loaded.
if original_data is not None:
    print(f"\nOriginal data shape: {original_data.shape}")

    # Each report is a heading followed, on the next line, by the view itself.
    print("\nFirst 5 rows of original data:", original_data.head(), sep="\n")
    print("\nData types:", original_data.dtypes, sep="\n")
    print("\nMissing values in original data:", original_data.isnull().sum(), sep="\n")
    print("\nBasic statistics of original data:", original_data.describe(), sep="\n")

=== ORIGINAL DATA ===
Error extracting data: [Errno 2] No such file or directory: 'finance_dataset.csv'


In [8]:
# Step 2: Clean the data and examine the results
if original_data is not None:
    print("\n" + "=" * 50)
    print("=== CLEANING DATA ===")
    # Clean a copy so the extracted frame stays available for comparison.
    cleaned_data = clean_data(original_data.copy())

    print(f"\nCleaned data shape: {cleaned_data.shape}")
    print(f"Rows removed during cleaning: {len(original_data) - len(cleaned_data)}")

    # Each report is a heading followed, on the next line, by the view itself.
    print("\nFirst 5 rows of cleaned data:", cleaned_data.head(), sep="\n")
    print("\nMissing values after cleaning:", cleaned_data.isnull().sum(), sep="\n")
    print("\nData types after cleaning:", cleaned_data.dtypes, sep="\n")

In [9]:
# Step 3: Enrich the data and examine the final results
# Run only when a cleaned frame exists AND the current extraction succeeded.
# The body reads original_data throughout, so a cleaned_data left over from an
# earlier run must not be processed once original_data is None.
if 'cleaned_data' in locals() and original_data is not None:
    print("\n" + "="*50)
    print("=== ENRICHING DATA ===")
    # Enrich a copy so cleaned_data stays available for inspection.
    enriched_data = enrich_data(cleaned_data.copy())

    print(f"\nFinal enriched data shape: {enriched_data.shape}")
    print("\nFirst 5 rows of enriched data:")
    print(enriched_data.head())

    # Columns present after enrichment that the extracted data did not have.
    print("\nNew columns added:")
    new_columns = set(enriched_data.columns) - set(original_data.columns)
    print(list(new_columns))

    print("\nFinal data statistics:")
    print(enriched_data.describe())

    # Save the final data to processed_data folder as originally intended
    output_path = 'processed_data/cleaned_finance_dataset.csv'

    try:
        load_data(enriched_data, output_path)
    except PermissionError:
        # Handled separately so the user gets actionable advice.
        print(f"Permission error: The file {output_path} might be open in another application.")
        print("Please close any applications that might have this file open (like Excel) and try again.")
        print("Or run this cell again after closing the file.")
    except Exception as e:
        print(f"Error saving file: {e}")

    # The summary describes the in-memory frames and is printed whether or
    # not the save above succeeded.
    print("\n=== SUMMARY ===")
    print(f"Original data: {original_data.shape[0]} rows, {original_data.shape[1]} columns")
    print(f"Final data: {enriched_data.shape[0]} rows, {enriched_data.shape[1]} columns")
    print(f"Rows removed: {original_data.shape[0] - enriched_data.shape[0]}")
    print(f"Columns added: {enriched_data.shape[1] - original_data.shape[1]}")