In [88]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import os 
import re 
from datetime import datetime

from matplotlib import pyplot as plt 
from typing import Iterable, Union
from utils import listdir_full, timeit
from operator import itemgetter

In [4]:
# Spreadsheet that lists each monitoring site together with its GPS coordinates.
site_locations_path = '../CPCB/site_locations_040619.xlsx' # Replace with your path to the site locations file

# Load latitude/longitude per site, indexed by the 'site' column.
# NOTE(review): `names` relabels the sheet's columns by position — this assumes the sheet
# has exactly these four columns in this order; confirm against the file.
site_locations_df = pd.read_excel(site_locations_path, 
                    names=['site name', 'gpsLatitude', 'gpsLongitude', 'site'], 
                    dtype={'gpsLatitude': float, 'gpsLongitude': float}, 
                    usecols=['gpsLatitude', 'gpsLongitude', 'site'], 
                    index_col='site',
                   )
# Transposing turns every site into a column, so orient='list' produces
# {site: [values of that site's row]}. Later cells read element 0 as the latitude
# and element 1 as the longitude.
site_to_gps = site_locations_df.T.to_dict(orient='list')

In [163]:
def read_cpcb(path):
    """
    Read one CPCB station CSV into a PM2.5 frame indexed by timestamp.

    Only the 'From Date' and 'PM2.5' columns are loaded. The strings 'None' and ''
    are treated as missing values. 'From Date' is parsed as day-month-year
    ('%d-%m-%Y %H:%M'), renamed to 'timestamp' and used as the index; 'PM2.5' is
    renamed to 'pm2_5'.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pd.DataFrame
        A single float column 'pm2_5' indexed by 'timestamp'.

    Raises
    ------
    ValueError
        If a non-missing 'From Date' value does not match the expected format,
        or if one of the two required columns is absent from the file.
    """
    timestamp_col = 'From Date'
    frame = pd.read_csv(
        path,
        na_values=['None', ''],
        dtype={'PM2.5': float},
        usecols=[timestamp_col, 'PM2.5'],
    )
    # Parse the whole column in one vectorised call with an explicit format instead of the
    # deprecated `date_parser=` keyword, which also fell back to one Python strptime call per row.
    frame[timestamp_col] = pd.to_datetime(frame[timestamp_col], format='%d-%m-%Y %H:%M')
    return (
        frame
        .rename(columns={'PM2.5': 'pm2_5', timestamp_col: 'timestamp'})
        .set_index('timestamp')
    )

In [164]:
@timeit
def read_all(path):
    """
    Read every file found under `path` (recursively) and stack the results.

    Each file is loaded with `read_cpcb` and tagged with a 'site' column holding
    the part of the file name before its first '.'. The per-file frames are
    concatenated in the order `os.walk` yields them.

    Raises ValueError (from `pd.concat`) when no files are found.
    """
    frames = []
    for folder, _subdirs, file_names in os.walk(path):
        for file_name in file_names:
            station_frame = read_cpcb(os.path.join(folder, file_name))
            frames.append(station_frame.assign(site=file_name.split('.')[0]))
    return pd.concat(frames)

In [158]:
def match_prefix(string: str, prefixes: Iterable[str]) -> Union[str, None]:
    """
    Return the first entry of `prefixes` that `string` starts with.

    Entries are tried in iteration order; None is returned when none of them matches.
    """
    return next(
        (candidate for candidate in prefixes if string.startswith(candidate)),
        None,
    )


def clean_2019_jan_june(from_path: str, site_names: Iterable[str], to_path: Union[str, None] = None) -> None:
    """
    Transform the 2019 Jan-June data into the format consistent with the rest of the dataset.

    For every regular file in `from_path`, the columns 'From.Date', 'To.Date.x' and
    'PM2.5' are read, the two date columns are renamed to 'From Date' and 'To Date',
    and the result is written to `to_path`. If the file name starts with one of
    `site_names`, the output file is named after that site (no extension);
    otherwise the original file name is kept.

    Parameters
    ----------
    from_path : str
        Folder holding the raw CSV files.
    site_names : Iterable[str]
        Site names to match against the start of each file name.
    to_path : str, optional
        Output folder, created if missing. Defaults to '<from_path>_cleaned'.
    """
    if to_path is None:
        # Normalise first: with a trailing separator the plain f-string would place the
        # output folder *inside* the input folder, where the loop below would trip over it.
        to_path = f'{os.path.normpath(from_path)}_cleaned'
    os.makedirs(to_path, exist_ok=True)

    # Materialise once so that a one-shot iterable is not exhausted after the first file.
    site_names = list(site_names)

    for file_name in os.listdir(from_path):
        source_file = os.path.join(from_path, file_name)
        if not os.path.isfile(source_file):
            # Sub-folders (including an output folder nested in from_path) are not data files.
            continue

        df = pd.read_csv(
            source_file,
            usecols=['From.Date', 'To.Date.x', 'PM2.5']
        ).rename(columns={'From.Date': 'From Date', 'To.Date.x': 'To Date'})

        # get station name matching the site locations file
        if (file_name_cleaned := match_prefix(file_name, site_names)) is not None:
            file_name = file_name_cleaned
        # index=False: the RangeIndex carries no information and would otherwise be
        # written as an extra unnamed column.
        df.to_csv(os.path.join(to_path, file_name), index=False)

In [134]:
# Clean the raw January-June 2019 files. No to_path is passed, so the cleaned
# copies go to a sibling folder named '<path>_cleaned'.
path = '../CPCB/CPCB Data 2019/Jan-June'
clean_2019_jan_june(path, site_to_gps.keys())

## Merging cleaned data

Make sure that the data directory contains only sub-folders and CPCB data files in CSV format: `read_all` attempts to parse every file it finds.

In [165]:
# Load every station file under the data folder and order the combined rows by timestamp.
path = '../CPCB/CPCB Data/'
df = read_all(path).sort_index()

Function read_all took 16.019567 seconds


In [180]:
# Keep only the rows whose site has an entry in the site-locations mapping.
df = df[df['site'].isin(site_to_gps.keys())]

In [181]:
# Persist the filtered measurements (the frame still carries the 'site' column here).
df.to_csv('./data/CPCB_all')

# Adding coordinates to stations

In [205]:
# Reload the saved CPCB measurements. The timestamps are parsed back into datetimes:
# without parse_dates the index comes back as plain strings, which cannot be sorted
# together with the datetime index of the DAPHNE frame that is concatenated later on.
df = pd.read_csv('./data/CPCB_all', index_col='timestamp', parse_dates=['timestamp'])

In [220]:
# Look up each row's [latitude, longitude] pair by site name and split the pair
# into two columns; the site name itself is not needed afterwards.
df[['gpsLatitude', 'gpsLongitude']] = (
    df['site']
    .map(site_to_gps)
    .transform({'gpsLatitude': itemgetter(0), 'gpsLongitude': itemgetter(1)})
)
df = df.drop(columns='site')

In [221]:
# Display the current frame for inspection.
df

Unnamed: 0_level_0,pm2_5,gpsLatitude,gpsLongitude
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01 00:00:00,474.50,28.684678,77.076574
2019-01-01 00:00:00,351.79,28.725650,77.201157
2019-01-01 00:00:00,309.47,28.624548,77.357710
2019-01-01 00:00:00,254.00,28.408842,77.309908
2019-01-01 00:00:00,361.50,28.814792,77.098075
...,...,...,...
2020-01-31 17:45:00,33.44,28.591825,77.227307
2020-01-31 17:45:00,50.00,28.580280,77.233829
2020-01-31 17:45:00,94.00,28.732820,77.170633
2020-01-31 17:45:00,65.48,28.609090,77.032541


In [222]:
# NOTE(review): this overwrites the file saved earlier in the notebook with a different
# schema (coordinate columns added, 'site' dropped). Re-running the reload and
# coordinate-mapping cells after this one will not find a 'site' column — consider
# writing to a separate file name.
df.to_csv('./data/CPCB_all')

# Combine CPCB data (with coordinates) with DAPHNE

In [None]:
def is_dap(file_name: str) -> bool:
    """
    Tell whether `file_name` begins with the three-letter code 'DAP' or 'DMC'.

    The code is the first three characters, which must all be uppercase A-Z;
    any other start (including a name shorter than three characters) gives False.
    """
    code_match = re.match(r'(?P<code>[A-Z]{3})', file_name)
    return code_match is not None and code_match.group('code') in ['DAP', 'DMC']


def read_dap(path: str) -> pd.DataFrame:
    """
    Load the PM2.5 and GPS columns of a DAP/DMC file, indexed by parsed 'timestamp'.

    Returns the columns 'pm2_5', 'gpsLatitude' and 'gpsLongitude' (in that order).
    Returns None instead of a frame when the base name of `path` is rejected by
    `is_dap`, or when the file lacks any of those three columns.
    """
    required = ['pm2_5', 'gpsLatitude', 'gpsLongitude']

    if is_dap(os.path.basename(path)):
        frame = pd.read_csv(
            path,
            index_col='timestamp',
            parse_dates=['timestamp'],
        )
        if all(column in frame.columns for column in required):
            return frame[required]

    return None

In [None]:
# Load the processed DAPHNE data. read_dap returns None when the file name or
# the file's columns do not qualify, so `dap` may be None here.
dap = read_dap('./data/DAP_all_processed')

In [None]:
# Display the DAPHNE frame for inspection.
dap

In [None]:
# Stack the CPCB and DAPHNE rows into one frame and order it by the timestamp index.
a = pd.concat((df, dap)).sort_index()

In [None]:
# Save the combined CPCB + DAPHNE dataset.
a.to_csv('./data/DAP_CPCB_all')

In [None]:
# From here on `df` refers to the combined frame.
df = a