# 01. Data Ingestion - NASA Web Server Logs

## Mục tiêu
- Parse raw log files (train.txt, test.txt)
- Aggregate thành time series với 3 granularities (1min, 5min, 15min)
- Xử lý edge cases và missing data (storm gap)
- Lưu processed data dưới dạng parquet

## Dữ liệu
- **train.txt**: 2,934,960 log entries (01/Jul/1995 → 22/Aug/1995)
- **test.txt**: 526,650 log entries (23/Aug/1995 → 31/Aug/1995)

## Log Format (Common Log Format)
```
199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245
│              │  │                          │                                │    │
│              │  │                          │                                │    └─ Bytes
│              │  │                          │                                └─ Status Code
│              │  │                          └─ Request
│              │  └─ Timestamp + Timezone
│              └─ User (luôn "-")
└─ Host (IP hoặc hostname)
```

In [None]:
# Import libraries
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Thêm src vào path để import modules
sys.path.insert(0, os.path.abspath('..'))

from src.data.parser import NASALogParser, quick_parse
from src.data.preprocessor import LogPreprocessor, load_timeseries, split_train_test

# Display configuration: both pandas display options are set to None and
# matplotlib is switched to the seaborn-style white grid theme.
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)
plt.style.use('seaborn-v0_8-whitegrid')

print("Libraries imported successfully!")

## 1. Định nghĩa paths

In [None]:
# Input and output locations, relative to the notebook directory
DATA_DIR = '../DATA'
PROCESSED_DIR = '../data/processed'

TRAIN_FILE = os.path.join(DATA_DIR, 'train.txt')
TEST_FILE = os.path.join(DATA_DIR, 'test.txt')


def _report_file(header, path):
    """Print the path, whether it exists and, if it does, its size in MB."""
    found = os.path.exists(path)
    print(f"{header}: {path}")
    print(f"  Exists: {found}")
    if found:
        size_mb = os.path.getsize(path) / 1024 / 1024
        print(f"  Size: {size_mb:.2f} MB")


# Check that both raw log files are present before parsing them
_report_file("Train file", TRAIN_FILE)
_report_file("\nTest file", TEST_FILE)

## 2. Quick Parse - Kiểm tra format dữ liệu

In [None]:
# Parse only the first 100 lines of the training file as a quick format check
print("Kiểm tra format dữ liệu (100 dòng đầu)...")
sample_df = quick_parse(TRAIN_FILE, max_lines=100)

# One print call; the newline separator reproduces the line-by-line output
print(
    f"\nShape: {sample_df.shape}",
    "\nData types:",
    sample_df.dtypes,
    "\nSample data:",
    sep="\n",
)
# Last expression of the cell: rendered by the notebook as the cell output
sample_df.head(10)

In [None]:
# Show the value distribution of three categorical columns of the sample.
# Each entry pairs the heading to print with the column to count.
for heading, column in (
    ("Phân phối HTTP Methods:", 'method'),
    ("\nPhân phối Status Codes:", 'status_code'),
    ("\nTimezone (tất cả nên là -0400 EDT):", 'timezone'),
):
    print(heading)
    print(sample_df[column].value_counts())

In [None]:
# Inspect the bytes column of the sample: summary statistics and zero count
sample_bytes = sample_df['bytes']
zero_byte_entries = (sample_bytes == 0).sum()

print("Bytes statistics:")
print(sample_bytes.describe())

print(f"\nSố entries có bytes = 0: {zero_byte_entries}")

## 3. Parse Full Training Data

In [None]:
# Parse the complete training log file
banner = "=" * 60
print(banner, "PARSING TRAINING DATA", banner, sep="\n")

parser = NASALogParser()
train_df = parser.parse_file(TRAIN_FILE)

In [None]:
# Print the parser's total / success / failed line counts
stats = parser.get_stats()
stats_report = (
    "\nParsing Statistics:",
    f"  Total lines: {stats['total']:,}",
    f"  Success: {stats['success']:,} ({stats['success_rate']:.2f}%)",
    f"  Failed: {stats['failed']:,}",
)
print(*stats_report, sep="\n")

In [None]:
# Overview of the parsed training frame: shape, memory and time coverage
train_first = train_df['timestamp'].min()
train_last = train_df['timestamp'].max()
train_memory_mb = train_df.memory_usage(deep=True).sum() / 1024 / 1024

print("\nTraining DataFrame:")
print(f"  Shape: {train_df.shape}")
print(f"  Memory usage: {train_memory_mb:.2f} MB")
print(f"  Date range: {train_first} to {train_last}")
print(f"  Duration: {(train_last - train_first).days} days")

In [None]:
# Preview the first rows of the training data
preview_rows = 10
print("\nSample từ training data:")
# Last expression of the cell: rendered by the notebook as the cell output
train_df.head(preview_rows)

## 4. Parse Test Data

In [None]:
# Parse the test log file with its own parser instance
banner = "=" * 60
print(banner, "PARSING TEST DATA", banner, sep="\n")

parser_test = NASALogParser()
test_df = parser_test.parse_file(TEST_FILE)

In [None]:
# Overview of the parsed test frame: shape and time coverage
test_first = test_df['timestamp'].min()
test_last = test_df['timestamp'].max()

print("\nTest DataFrame:")
print(f"  Shape: {test_df.shape}")
print(f"  Date range: {test_first} to {test_last}")
print(f"  Duration: {(test_last - test_first).days} days")

## 5. Combine và Preprocess

In [None]:
# Stack train and test into one frame for full analysis, order the rows by
# timestamp and renumber the index from zero.
full_df = (
    pd.concat([train_df, test_df], ignore_index=True)
    .sort_values('timestamp')
    .reset_index(drop=True)
)

print("Full Dataset:")
print(f"  Total records: {len(full_df):,}")
full_first, full_last = full_df['timestamp'].min(), full_df['timestamp'].max()
print(f"  Date range: {full_first} to {full_last}")

In [None]:
# Build the preprocessor on the combined data and fetch its summary dict
preprocessor = LogPreprocessor(full_df)
summary = preprocessor.get_data_summary()

rule = "=" * 60
print(rule, "DATA SUMMARY", rule, sep="\n")
print(f"\nTotal Records: {summary['total_records']:,}")

print("\nDate Range:")
date_info = summary['date_range']
print(f"  Start: {date_info['start']}")
print(f"  End: {date_info['end']}")
print(f"  Duration: {date_info['duration_days']} days")

# Every entry except the 'total' key is shown as a percentage of that total
print("\nRequest Status Breakdown:")
request_info = summary['requests']
for status, count in request_info.items():
    if status == 'total':
        continue
    pct = count / request_info['total'] * 100
    print(f"  {status}: {count:,} ({pct:.2f}%)")

print("\nBytes Statistics:")
byte_info = summary['bytes']
print(f"  Total: {byte_info['total'] / 1e9:.2f} GB")
print(f"  Mean: {byte_info['mean']:,.0f} bytes")
print(f"  Median: {byte_info['median']:,.0f} bytes")
print(f"  Max: {byte_info['max']:,} bytes")

print(f"\nUnique Hosts (IPs): {summary['unique_hosts']:,}")

print("\nHTTP Methods:")
for method, count in summary['methods'].items():
    print(f"  {method}: {count:,}")

print("\nStorm Period (Server Offline):")
storm_info = summary['storm_period']
print(f"  Start: {storm_info['start']}")
print(f"  End: {storm_info['end']}")
print(f"  Duration: {storm_info['duration_hours']:.1f} hours")

## 6. Detect Gaps trong dữ liệu

In [None]:
# Ask the preprocessor for gaps (1-minute frequency, 30-minute threshold)
gaps = preprocessor.detect_gaps(freq='1min', threshold_minutes=30)

gap_count = len(gaps)
print(f"Phát hiện {gap_count} gaps > 30 phút:")
if gap_count > 0:
    for _, gap in gaps.iterrows():
        # Tag rows flagged as belonging to the storm outage
        suffix = " [STORM GAP]" if gap['is_storm_gap'] else ""
        span = f"{gap['gap_start']} → {gap['gap_end']}"
        print(f"  {span} ({gap['duration_minutes']:.0f} minutes){suffix}")

## 7. Tạo Time Series với nhiều Granularities

In [None]:
# Build the time series for every granularity, with gap filling enabled
banner = "=" * 60
print(banner, "TẠO TIME SERIES", banner, sep="\n")

granularities = preprocessor.create_all_granularities(fill_gaps=True)

# Per-granularity overview: size, coverage and request-count statistics
print("\nSummary:")
for name, ts in granularities.items():
    print(f"\n{name}:")
    print(f"  Records: {len(ts):,}")
    print(f"  Date range: {ts.index.min()} to {ts.index.max()}")
    window_counts = ts['request_count']
    print(f"  Total requests: {window_counts.sum():,}")
    print(f"  Avg requests per window: {window_counts.mean():.2f}")
    print(f"  Max requests per window: {window_counts.max():,}")

In [None]:
# Preview the first 20 rows of the 1-minute series
minute_preview = granularities['1min']
print("Sample 1-minute time series:")
# Last expression of the cell: rendered by the notebook as the cell output
minute_preview.head(20)

In [None]:
# List every column of the 1-minute series together with its dtype
minute_frame = granularities['1min']
print("Các cột trong time series:")
for col in minute_frame.columns:
    print(f"  - {col}: {minute_frame[col].dtype}")

## 8. Visualize Traffic Overview

In [None]:
# Plot hourly traffic: request count and transferred megabytes per hour.
# The 1-minute series is summed into 1-hour buckets for plotting.
ts_hourly = granularities['1min'].resample('1h').sum()

# savefig does not create missing directories, and the cell that creates the
# figures folder only runs later in the notebook, so ensure it exists here.
overview_path = '../reports/figures/traffic_overview.png'
os.makedirs(os.path.dirname(overview_path), exist_ok=True)

fig, axes = plt.subplots(2, 1, figsize=(15, 10))

# Top panel: requests per hour, with the storm window shaded in red
ax1 = axes[0]
ax1.plot(ts_hourly.index, ts_hourly['request_count'], linewidth=0.5, alpha=0.8)
ax1.axvspan(preprocessor.STORM_START, preprocessor.STORM_END, alpha=0.3, color='red', label='Storm Gap')
ax1.set_ylabel('Requests per Hour')
ax1.set_title('NASA Web Server Traffic - Requests per Hour')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Bottom panel: bytes per hour, scaled by 1e6 so the axis reads in MB
ax2 = axes[1]
ax2.plot(ts_hourly.index, ts_hourly['bytes_total'] / 1e6, linewidth=0.5, alpha=0.8, color='green')
ax2.axvspan(preprocessor.STORM_START, preprocessor.STORM_END, alpha=0.3, color='red', label='Storm Gap')
ax2.set_ylabel('MB per Hour')
ax2.set_xlabel('Time')
ax2.set_title('NASA Web Server Traffic - Bytes per Hour')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(overview_path, dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Distribution of requests per minute: histogram and boxplot side by side
ts_1min = granularities['1min']

# Use the same NaN-free series for both panels. The histogram cannot bin NaN
# values, and the boxplot already dropped them; mean/median skip NaN anyway,
# so the reference lines are unchanged.
minute_counts = ts_1min['request_count'].dropna()
mean_count = minute_counts.mean()
median_count = minute_counts.median()

# savefig does not create missing directories, and the cell that creates the
# figures folder only runs later in the notebook, so ensure it exists here.
distribution_path = '../reports/figures/requests_distribution.png'
os.makedirs(os.path.dirname(distribution_path), exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram with mean and median marked as dashed vertical lines
ax1 = axes[0]
ax1.hist(minute_counts, bins=100, edgecolor='black', alpha=0.7)
ax1.set_xlabel('Requests per Minute')
ax1.set_ylabel('Frequency')
ax1.set_title('Distribution of Requests per Minute')
ax1.axvline(mean_count, color='red', linestyle='--', label=f"Mean: {mean_count:.1f}")
ax1.axvline(median_count, color='green', linestyle='--', label=f"Median: {median_count:.1f}")
ax1.legend()

# Boxplot of the same values
ax2 = axes[1]
ax2.boxplot(minute_counts, vert=True)
ax2.set_ylabel('Requests per Minute')
ax2.set_title('Boxplot of Requests per Minute')

plt.tight_layout()
plt.savefig(distribution_path, dpi=150, bbox_inches='tight')
plt.show()

## 9. Lưu Processed Data

In [None]:
# Make sure both output folders exist (no error if they already do)
for out_dir in (PROCESSED_DIR, '../reports/figures'):
    os.makedirs(out_dir, exist_ok=True)

# Hand the granularity dict to the preprocessor to be saved under PROCESSED_DIR
print("Đang lưu processed data...")
preprocessor.save_processed(PROCESSED_DIR, granularities)

print("\nDone!")

In [None]:
# List every entry in the processed-data folder together with its size in MB
print("\nKiểm tra files đã lưu:")
for entry_name in os.listdir(PROCESSED_DIR):
    entry_bytes = os.path.getsize(os.path.join(PROCESSED_DIR, entry_name))
    print(f"  {entry_name}: {entry_bytes / 1024 / 1024:.2f} MB")

In [None]:
# Round-trip check: load the saved 1-minute parquet file and inspect it
print("\nTest loading từ parquet:")
parquet_path = os.path.join(PROCESSED_DIR, 'timeseries_1min.parquet')
ts_loaded = load_timeseries(parquet_path)

loaded_index = ts_loaded.index
print(f"  Shape: {ts_loaded.shape}")
print(f"  Index type: {type(loaded_index)}")
print(f"  Date range: {loaded_index.min()} to {loaded_index.max()}")

## 10. Train/Test Split

In [None]:
# Split the 1-minute series into train and test as the assignment specifies:
#   Train: Jul 1995 + Aug 1-22, 1995
#   Test:  Aug 23-31, 1995
ts_1min = granularities['1min']
train_ts, test_ts = split_train_test(ts_1min, test_start='1995-08-23')

# Same three-line report for each part of the split
print("Train/Test Split:")
for part_label, part in (("Train", train_ts), ("Test", test_ts)):
    print(f"\n{part_label}:")
    print(f"  Date range: {part.index.min()} to {part.index.max()}")
    print(f"  Records: {len(part):,}")
    print(f"  Total requests: {part['request_count'].sum():,}")

## Summary

### Kết quả Data Ingestion:
1. **Parsed thành công** ~3.46 triệu log entries (2,934,960 train + 526,650 test)
2. **Tạo time series** với 3 granularities (1min, 5min, 15min)
3. **Xử lý storm gap** (~38 giờ server offline đầu tháng 8)
4. **Lưu** dưới dạng parquet để load nhanh

### Files đã tạo:
- `data/processed/timeseries_1min.parquet`
- `data/processed/timeseries_5min.parquet`
- `data/processed/timeseries_15min.parquet`

### Next Steps:
- Notebook 02: EDA - Phân tích traffic patterns
- Notebook 03: Feature Engineering