# Loading dataset and taking few insights

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv("Final_Merged_INSAT_CPCB_ERA5_filtered.csv")

In [2]:
df.columns

Index(['state', 'station_id', 'station_location', 'latitude', 'longitude',
       'timestamp_utc', 'PM2.5', 'img_mean', 'img_std', 'img_min', 'img_max',
       'img_median', 'mean_TIR1', 'std_TIR1', 'min_TIR1', 'max_TIR1',
       'median_TIR1', 'p25_TIR1', 'p75_TIR1', 'p90_TIR1', 'p95_TIR1',
       'pct_above_240_TIR1', 'pct_below_50_TIR1', 'pct_above_200_TIR1',
       'pct_below_100_TIR1', 'range_TIR1', 'cv_TIR1', 'skewness_TIR1',
       'kurtosis_TIR1', 'mean_WV', 'std_WV', 'min_WV', 'max_WV', 'median_WV',
       'p25_WV', 'p75_WV', 'p90_WV', 'p95_WV', 'pct_above_240_WV',
       'pct_below_50_WV', 'pct_above_200_WV', 'pct_below_100_WV', 'range_WV',
       'cv_WV', 'skewness_WV', 'kurtosis_WV', 'has_TIR1', 'has_WV', 't2m',
       'd2m', 'u10', 'v10', 'blh', 'ssrd', 'tp', 'sp', 'skt', 'number',
       'expver', 'grid_latitude', 'grid_longitude', 'wind_speed', 'wind_dir',
       'rh', 't2m_lag1', 't2m_rolling3', 't2m_rolling24', 'd2m_lag1',
       'd2m_rolling3', 'd2m_rolling24', 'wind_

In [3]:
df.shape

(102720, 83)

In [4]:
df.describe

<bound method NDFrame.describe of               state station_id station_location  latitude  longitude  \
0             Delhi      DL009      Pusa, Delhi   28.6374    77.1577   
1             Delhi      DL009      Pusa, Delhi   28.6374    77.1577   
2             Delhi      DL009      Pusa, Delhi   28.6374    77.1577   
3             Delhi      DL009      Pusa, Delhi   28.6374    77.1577   
4             Delhi      DL009      Pusa, Delhi   28.6374    77.1577   
...             ...        ...              ...       ...        ...   
102715  Maharashtra      MH033   Deonar, Mumbai   19.0455    72.9188   
102716  Maharashtra      MH033   Deonar, Mumbai   19.0455    72.9188   
102717  Maharashtra      MH033   Deonar, Mumbai   19.0455    72.9188   
102718  Maharashtra      MH033   Deonar, Mumbai   19.0455    72.9188   
102719  Maharashtra      MH033   Deonar, Mumbai   19.0455    72.9188   

                    timestamp_utc   PM2.5    img_mean   img_std     img_min  \
0       2021-01-01 02:

In [5]:
df.info

<bound method DataFrame.info of               state station_id station_location  latitude  longitude  \
0             Delhi      DL009      Pusa, Delhi   28.6374    77.1577   
1             Delhi      DL009      Pusa, Delhi   28.6374    77.1577   
2             Delhi      DL009      Pusa, Delhi   28.6374    77.1577   
3             Delhi      DL009      Pusa, Delhi   28.6374    77.1577   
4             Delhi      DL009      Pusa, Delhi   28.6374    77.1577   
...             ...        ...              ...       ...        ...   
102715  Maharashtra      MH033   Deonar, Mumbai   19.0455    72.9188   
102716  Maharashtra      MH033   Deonar, Mumbai   19.0455    72.9188   
102717  Maharashtra      MH033   Deonar, Mumbai   19.0455    72.9188   
102718  Maharashtra      MH033   Deonar, Mumbai   19.0455    72.9188   
102719  Maharashtra      MH033   Deonar, Mumbai   19.0455    72.9188   

                    timestamp_utc   PM2.5    img_mean   img_std     img_min  \
0       2021-01-01 02:00

## Measuring per Station Coverage

In [6]:
import pandas as pd

# Load dataset
fn = "Final_Merged_INSAT_CPCB_ERA5_filtered.csv"
df = pd.read_csv(fn, parse_dates=["timestamp_utc"])
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)

# Sort for safety
df = df.sort_values(["station_id", "timestamp_utc"]).reset_index(drop=True)

# Function to compute coverage based on 8-hour schedule
def station_coverage(df, station_col="station_id", time_col="timestamp_utc", slots_per_day=8):
    rows = []
    for station, g in df.groupby(station_col):
        start = g[time_col].min()
        end = g[time_col].max()
        # number of days between first and last record
        total_days = (end.normalize() - start.normalize()).days + 1
        expected = total_days * slots_per_day
        actual = g[time_col].nunique()
        pct = actual / expected if expected > 0 else 0
        rows.append((station, start, end, total_days, expected, actual, pct))
    return pd.DataFrame(rows, columns=[
        station_col, "start", "end", "days_covered", "expected_slots", "actual_slots", "coverage_pct"
    ])

# Run coverage calc
cov = station_coverage(df)

# Sort by coverage %
cov = cov.sort_values("coverage_pct").reset_index(drop=True)

# Show summary
print("Coverage summary (8 slots/day expected):")
print(cov.describe())

print("\nWorst 10 stations by coverage:")
print(cov.head(10))

print("\nBest 10 stations by coverage:")
print(cov.tail(10))

Coverage summary (8 slots/day expected):
       days_covered  expected_slots  actual_slots  coverage_pct
count     20.000000       20.000000     20.000000     20.000000
mean     723.500000     5788.000000   5136.000000      0.886905
std       28.141559      225.132476    455.690569      0.065334
min      604.000000     4832.000000   4039.000000      0.756164
25%      730.000000     5840.000000   4870.750000      0.838124
50%      730.000000     5840.000000   5182.500000      0.887414
75%      730.000000     5840.000000   5487.250000      0.939598
max      730.000000     5840.000000   5773.000000      0.988527

Worst 10 stations by coverage:
  station_id                     start                       end  \
0      HR002 2021-01-01 02:00:00+00:00 2022-12-31 09:00:00+00:00   
1      MH012 2021-01-01 02:00:00+00:00 2022-12-31 09:00:00+00:00   
2      HR001 2021-01-01 05:00:00+00:00 2022-12-31 09:00:00+00:00   
3      MH007 2021-01-01 04:00:00+00:00 2022-12-31 04:00:00+00:00   
4      KA00

In [7]:
## Adding flag for the lower coverage stations. 
# Expected coverage calculation (2 years, hourly data → ~17,520 hours)
expected_hours = ((df["timestamp_utc"].max() - df["timestamp_utc"].min()).days + 1) * 24

# Calculate coverage for each station
coverage = (
    df.groupby("station_id")["timestamp_utc"]
    .nunique()
    .reset_index(name="available_hours")
)

coverage["coverage_ratio"] = coverage["available_hours"] / expected_hours

# Define threshold for low coverage (80%)
threshold = 0.8
coverage["low_coverage_flag"] = coverage["coverage_ratio"] < threshold

# Merge back to original dataframe
df = df.merge(
    coverage[["station_id", "low_coverage_flag"]],
    on="station_id",
    how="left"
)

print(df[["station_id", "timestamp_utc", "low_coverage_flag"]].head())

  station_id             timestamp_utc  low_coverage_flag
0      DL009 2021-01-01 02:00:00+00:00               True
1      DL009 2021-01-01 03:00:00+00:00               True
2      DL009 2021-01-01 04:00:00+00:00               True
3      DL009 2021-01-01 05:00:00+00:00               True
4      DL009 2021-01-01 06:00:00+00:00               True


# Adding timestamp_ist column to the dataset

In [8]:
# assume df already loaded with timestamp_utc in UTC timezone
# make sure it's parsed correctly as timezone-aware datetime
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)

# convert to IST
df["timestamp_ist"] = df["timestamp_utc"].dt.tz_convert("Asia/Kolkata")

# sanity check
print(df[["timestamp_utc", "timestamp_ist"]].head())

              timestamp_utc             timestamp_ist
0 2021-01-01 02:00:00+00:00 2021-01-01 07:30:00+05:30
1 2021-01-01 03:00:00+00:00 2021-01-01 08:30:00+05:30
2 2021-01-01 04:00:00+00:00 2021-01-01 09:30:00+05:30
3 2021-01-01 05:00:00+00:00 2021-01-01 10:30:00+05:30
4 2021-01-01 06:00:00+00:00 2021-01-01 11:30:00+05:30


## Code for Gap Classification & Imputation

### Compute gap lengths per row (per station)

In [9]:
# assumes df has columns: station_id, timestamp_utc (already sorted earlier)
df = df.sort_values(["station_id", "timestamp_utc"]).reset_index(drop=True)

# compute time difference from previous and next timestamp within each station
df["prev_ts"] = df.groupby("station_id")["timestamp_utc"].shift(1)
df["next_ts"] = df.groupby("station_id")["timestamp_utc"].shift(-1)

df["gap_from_prev_h"] = (df["timestamp_utc"] - df["prev_ts"]) / pd.Timedelta(hours=1)
df["gap_to_next_h"] = (df["next_ts"] - df["timestamp_utc"]) / pd.Timedelta(hours=1)

def classify_gap(row):
    # Look at both previous and next gaps
    g_prev, g_next = row["gap_from_prev_h"], row["gap_to_next_h"]
    g = min(g_prev if not pd.isna(g_prev) else np.inf,
            g_next if not pd.isna(g_next) else np.inf)
    if g <= 3:
        return "short"
    elif g <= 24:
        return "medium"
    elif g < np.inf:
        return "long"
    else:
        return "none"  # start/end rows
df["gap_class"] = df.apply(classify_gap, axis=1)

#### Imputation logic

In [10]:
# Identify numeric columns for imputation
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# drop columns that should not be imputed
exclude = ["latitude", "longitude"] if "latitude" in df.columns else []
num_cols = [c for c in num_cols if c not in exclude]

# add imputed flag column
df["is_imputed"] = False

def impute_station(g):
    g = g.sort_values("timestamp_utc").copy()
    
    # short gaps → ffill + bfill
    g[num_cols] = g[num_cols].fillna(method="ffill", limit=3).fillna(method="bfill", limit=3)
    
    # medium gaps → linear interpolation
    g[num_cols] = g[num_cols].interpolate(method="linear", limit=24, limit_direction="both")
    
    # mark imputed values
    for col in num_cols:
        g["is_imputed"] |= g[col].isna()
    
    return g

df = df.groupby("station_id").apply(impute_station).reset_index(drop=True)

## Feature selections

### Derived Meteorological Features

In [13]:
# Wind vector components
df["wind_dir_rad"] = np.deg2rad(df["wind_dir"].fillna(0))
df["wind_u"] = df["wind_speed"] * np.cos(df["wind_dir_rad"])
df["wind_v"] = df["wind_speed"] * np.sin(df["wind_dir_rad"])

# Atmospheric stability
df["stability"] = (df["t2m"] - df["skt"].fillna(df["t2m"])) / df["blh"].replace(0, np.nan)

# Moisture deficit
df["moisture_deficit"] = df["t2m"] - df["d2m"]

# Thermal inversion strength
df["inversion_strength"] = df["mean_TIR1"] - df["t2m"]

# Pressure tendency (difference vs previous hour)
df["dp_dt"] = df.groupby("station_id")["sp"].diff(1)

### Timestamp Cyclic Features

In [14]:
# Extract hour, day of year, and cyclical encodings
df["hour"] = df["timestamp_utc"].dt.hour
df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)

df["dayofyear"] = df["timestamp_utc"].dt.dayofyear
df["doy_sin"] = np.sin(2 * np.pi * df["dayofyear"] / 365)
df["doy_cos"] = np.cos(2 * np.pi * df["dayofyear"] / 365)

### Lags & Rolling Features (per-station)

In [15]:
lags = [1, 3, 24]
rolls = [3, 24]
base_feats = ["PM2.5", "t2m", "rh", "wind_speed", "mean_TIR1", "TOTEXTTAU"]

for lag in lags:
    for f in base_feats:
        df[f"{f}_lag{lag}"] = df.groupby("station_id")[f].shift(lag)

for r in rolls:
    for f in base_feats:
        df[f"{f}_rolling{r}_mean"] = (
            df.groupby("station_id")[f]
              .rolling(window=r, min_periods=1)
              .mean()
              .reset_index(level=0, drop=True)
        )
        df[f"{f}_rolling{r}_std"] = (
            df.groupby("station_id")[f]
              .rolling(window=r, min_periods=1)
              .std()
              .reset_index(level=0, drop=True)
        )

# Saving

In [16]:
df.to_csv("dataset_feature1.csv", index=False)