In [2]:
# SIMPLE STATISTICAL MODEL AS LAST RESORT
import pandas as pd
import numpy as np

# Load data. The spaces inside 'test .csv' and 'sample_submission .csv' are
# kept exactly as written: these are the paths this notebook reads from.
train = pd.read_csv('Data/train.csv', parse_dates=['datetime'], index_col='datetime')
test = pd.read_csv('Data/test .csv', parse_dates=['datetime'], index_col='datetime')
sample_sub = pd.read_csv('Data/sample_submission .csv')

# Handle missing target values: carry the previous observation forward, then
# back-fill whatever is still missing at the start of the series.
# .ffill()/.bfill() replace fillna(method=...), which pandas has deprecated.
train['pm2.5'] = train['pm2.5'].ffill().bfill()

# Time-based features derived from the datetime index.
train['hour'] = train.index.hour
train['month'] = train.index.month

# Mean pm2.5 for each hour of day and for each calendar month.
hourly_avg = train.groupby('hour')['pm2.5'].mean()
monthly_avg = train.groupby('month')['pm2.5'].mean()

# The same features on the test frame (these columns stay on `test`).
test['hour'] = test.index.hour
test['month'] = test.index.month

# Look up both averages for every test row in one vectorised pass instead of
# iterating row by row; the resulting values are the same.
hour_component = test['hour'].map(hourly_avg)
month_component = test['month'].map(monthly_avg)

# Series.map yields NaN for a key it cannot find, so fail loudly rather than
# writing NaN predictions into the submission file.
unmatched = hour_component.isna() | month_component.isna()
if unmatched.any():
    raise ValueError(
        f"No training average available for {int(unmatched.sum())} test row(s); "
        "check that train covers every hour and month present in test."
    )

# Simple prediction: average of the hourly and monthly patterns.
# Converted to a plain list so the DataFrame below pairs values with
# sample_sub positionally instead of aligning on the datetime index.
predictions = ((hour_component + month_component) / 2).tolist()

# Create submission
submission = pd.DataFrame({
    'row ID': sample_sub['row ID'],
    'pm2.5': predictions
})

# Save
submission.to_csv('submissions/simple_statistical_model.csv', index=False)
print("Simple statistical model submission created!")

  train['pm2.5'] = train['pm2.5'].fillna(method='ffill').fillna(method='bfill')


Simple statistical model submission created!


In [3]:
# EMERGENCY ANALYSIS OF TEST SET
# Diagnostic prints only: the span, size and columns of the test frame, then a
# side-by-side look at how the train and test datetime indexes are laid out.
print("=== TEST SET ANALYSIS ===")
print(f"Test set date range: {test.index.min()} to {test.index.max()}")
print(f"Test set size: {len(test)}")
print(f"Test set columns: {test.columns.tolist()}")

# First and last timestamp of each frame, train first.
print("\n=== TRAIN vs TEST COMPARISON ===")
for _label, _frame in (("Train date range:", train), ("Test date range:", test)):
    print(_label, _frame.index.min(), "to", _frame.index.max())

# Put each index into a Series so consecutive timestamps can be differenced.
train_dates = pd.Series(train.index)
test_dates = pd.Series(test.index)

# Most common gaps between consecutive timestamps; more than one distinct
# value here means the index is not evenly spaced.
print("\nDate continuity check:")
for _label, _stamps in (
    ("Train date frequency:", train_dates),
    ("Test date frequency:", test_dates),
):
    print(_label, _stamps.diff().value_counts().head())

# Peek at the first timestamps that need a prediction.
print("\nFirst few test dates:")
print(test.index[:10])

=== TEST SET ANALYSIS ===
Test set date range: 2013-07-02 04:00:00 to 2014-12-31 23:00:00
Test set size: 13148
Test set columns: ['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'cbwd_NW', 'cbwd_SE', 'cbwd_cv', 'hour', 'month']

=== TRAIN vs TEST COMPARISON ===
Train date range: 2010-01-01 00:00:00 to 2013-07-02 03:00:00
Test date range: 2013-07-02 04:00:00 to 2014-12-31 23:00:00

Date continuity check:
Train date frequency: datetime
0 days 01:00:00    30675
Name: count, dtype: int64
Test date frequency: datetime
0 days 01:00:00    13147
Name: count, dtype: int64

First few test dates:
DatetimeIndex(['2013-07-02 04:00:00', '2013-07-02 05:00:00',
               '2013-07-02 06:00:00', '2013-07-02 07:00:00',
               '2013-07-02 08:00:00', '2013-07-02 09:00:00',
               '2013-07-02 10:00:00', '2013-07-02 11:00:00',
               '2013-07-02 12:00:00', '2013-07-02 13:00:00'],
              dtype='datetime64[ns]', name='datetime', freq=None)
