# <b>Log File Analysis ETL</b>

In [100]:
import os
import csv
import sqlite3
from datetime import datetime

## <b>Extract Data</b>

In [101]:
def extract_from_log_files(directory):
    """Read every CSV/TSV log file in ``directory`` and return their data rows.

    Files whose names end in ``.csv`` are parsed as comma-delimited and files
    ending in ``.tsv`` as tab-delimited; every other directory entry is
    ignored. The first line of each file is treated as a header and dropped.

    Args:
        directory: Path of the directory that holds the log files.

    Returns:
        list[list[str]]: The data rows of all matching files, concatenated in
        file-name order. Empty when no matching file (or no data row) exists.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be opened.
    """
    log_data = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order reproducible between runs and machines.
    for file in sorted(os.listdir(directory)):
        if file.endswith('.csv'):
            delimiter = ','
        elif file.endswith('.tsv'):
            delimiter = '\t'
        else:
            continue  # Skip unsupported file types

        path = os.path.join(directory, file)
        if not os.path.isfile(path):
            continue  # A sub-directory named "*.csv" is not a log file

        with open(path, 'r', newline='') as f:
            reader = csv.reader(f, delimiter=delimiter)
            # The default of None keeps an empty file from raising
            # StopIteration; there is simply no header to skip.
            next(reader, None)
            log_data.extend(reader)

    return log_data

## <b>Transform Data</b>

In [102]:
def transform_data(raw_data):
    """Convert raw log rows into dictionaries ready for loading.

    A row is kept only when it has at least five columns and its second
    column (index 1), after stripping whitespace, is a non-empty string that
    ``datetime.fromisoformat`` can parse. All other rows are silently skipped.

    Args:
        raw_data: Iterable of rows, each a sequence of strings.

    Returns:
        list[dict]: One dict per kept row with the keys ``'timestamp'``
        (a ``datetime``), ``'field1'``..``'field3'`` (columns 2-4) and
        ``'field4'`` (the optional sixth column, or ``''`` when absent).
    """
    transformed_data = []

    for row in raw_data:
        if len(row) < 5:
            continue  # Skip rows that do not have enough fields

        raw_timestamp = row[1].strip()
        if not raw_timestamp:
            continue  # Skip rows with empty timestamps

        try:
            timestamp = datetime.fromisoformat(raw_timestamp)
        except ValueError:
            continue  # Skip rows with invalid timestamps

        transformed_data.append({
            'timestamp': timestamp,
            'field1': row[2],
            'field2': row[3],
            'field3': row[4],
            # The optional field is the sixth column. The previous code
            # guarded on len(row) > 5 but then read row[0], which always
            # exists, so the guard did not match the index it protected.
            'field4': row[5] if len(row) > 5 else '',
        })

    return transformed_data


## <b>Load Data</b>

In [103]:
def load_into_database(data, db_file):
    """Append transformed log rows to the ``logs`` table of a SQLite database.

    The table is created on first use. All rows are inserted in a single
    transaction: either every row is stored or, if an error occurs, none are.
    The connection is always closed, even when an error is raised.

    Args:
        data: Iterable of dicts with the keys ``'timestamp'`` (an object with
            an ``isoformat()`` method, e.g. ``datetime``), ``'field1'``,
            ``'field2'``, ``'field3'`` and ``'field4'``.
        db_file: Path of the SQLite database file (created if missing).

    Raises:
        KeyError: If a row lacks one of the required keys.
        sqlite3.Error: If the database cannot be opened or written.
    """
    # Build the parameter tuples first so a malformed row fails before any
    # insert is attempted.
    insert_params = [
        (row['timestamp'].isoformat(), row['field1'], row['field2'], row['field3'], row['field4'])
        for row in data
    ]

    conn = sqlite3.connect(db_file)
    try:
        # As a context manager the connection commits on success and rolls
        # back on error; it does not close itself, hence the finally below.
        with conn:
            # Create table if not exists
            conn.execute('''CREATE TABLE IF NOT EXISTS logs
                 (id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TEXT, field1 TEXT, field2 TEXT, field3 TEXT, field4 TEXT)''')

            # Insert data into the table
            conn.executemany(
                "INSERT INTO logs (timestamp, field1, field2, field3, field4) VALUES (?, ?, ?, ?, ?)",
                insert_params,
            )
    finally:
        conn.close()


## <b>Main Function</b>

In [104]:
def main(directory='Logs_Data', db_file='logs.db'):
    """Run the ETL pipeline: extract log rows, transform them, load them.

    Args:
        directory: Directory containing the CSV/TSV log files.
            Defaults to ``'Logs_Data'``.
        db_file: Path of the SQLite database file to load into.
            Defaults to ``'logs.db'``.
    """
    # Step 1: Extract data from log files (CSV and TSV) in a directory
    raw_data = extract_from_log_files(directory)

    # Step 2: Transform data (parse timestamps, drop invalid rows, name the fields)
    transformed_data = transform_data(raw_data)

    # Step 3: Load data into SQLite database
    load_into_database(transformed_data, db_file)

    print(f"ETL process completed. Data loaded into {db_file}.")


# Run the pipeline when this code is executed directly (as a script, or as a
# notebook cell, where __name__ is also "__main__") rather than imported.
if __name__ == "__main__":
    main()


ETL process completed. Data loaded into logs.db.
