### Black Carbon (880nm)

In [2]:
import pandas as pd
import numpy as np

# Read the CSV file; skiprows=10 drops the first 10 lines (metadata preamble)
# so that the following line is parsed as the header row
df = pd.read_csv('black_carbon.csv', skiprows=10)
# Inspect the parsed column names
df.columns

  from pandas.core import (


Index(['Date', 'Black Carbon (880nm)', 'Status', 'Black Carbon (880nm).1',
       'Status.1', 'Black Carbon (880nm).2', 'Status.2'],
      dtype='object')

In [None]:
# Select relevant columns: Date and Black Carbon values for the three sites
# (the Status columns are left out)
columns = ['Date',
           'Black Carbon (880nm)',
           'Black Carbon (880nm).1',
           'Black Carbon (880nm).2']
# .copy() yields an independent frame, so the column assignments made in the
# following cells write to this frame and do not raise SettingWithCopyWarning
df = df[columns].copy()

In [4]:
# Rename columns for clarity. The assignment is positional, so the order here
# must match the order of the column selection in the previous cell.
df.columns = ['Date', 'BC_A4540', 'BC_Ladywood', 'BC_Underdale']
# Preview the first rows
df.head()

Unnamed: 0,Date,BC_A4540,BC_Ladywood,BC_Underdale
0,2024-06-09,0.81,0.26,0.14
1,2024-06-10,0.62,0.21,0.18
2,2024-06-11,0.95,0.32,0.17
3,2024-06-12,1.36,0.4,0.23
4,2024-06-13,1.4,0.28,0.32


In [5]:
# Swap the 'No data' placeholder for NaN in every site column
bc_site_cols = ['BC_A4540', 'BC_Ladywood', 'BC_Underdale']
df[bc_site_cols] = df[bc_site_cols].replace('No data', np.nan)

In [6]:
# Cast the site columns to numeric; values that cannot be parsed become NaN
bc_site_cols = ['BC_A4540', 'BC_Ladywood', 'BC_Underdale']
df[bc_site_cols] = df[bc_site_cols].apply(pd.to_numeric, errors='coerce')

In [None]:
# Row-wise mean across the three site columns; NaN entries are ignored
bc_site_cols = ['BC_A4540', 'BC_Ladywood', 'BC_Underdale']
df['Black_Carbon_Mean'] = df[bc_site_cols].mean(axis=1, skipna=True)

In [8]:
# Keep only the date and the row mean, and rename both to the output schema
df = (
    df[['Date', 'Black_Carbon_Mean']]
    .rename(columns={'Date': 'date', 'Black_Carbon_Mean': 'black_carbon_880nm'})
)
df.head()

Unnamed: 0,date,black_carbon_880nm
0,2024-06-09,0.403333
1,2024-06-10,0.336667
2,2024-06-11,0.48
3,2024-06-12,0.663333
4,2024-06-13,0.666667


In [11]:
# Drop any row whose 'date' value is exactly "End" (case-insensitive).
# .copy() makes the filtered frame independent of the original, so the
# assignment below does not raise SettingWithCopyWarning.
is_end_marker = df['date'].str.lower().eq('end')
df = df[~is_end_marker].copy()

# Ensure the date column is in datetime format
df['date'] = pd.to_datetime(df['date'])

In [12]:
# Function to calculate mean of past 7 days
def fill_with_past_7_days_mean(row, df, window_days=7, column='black_carbon_880nm'):
    """Return ``row[column]``, or the mean of the preceding days when it is missing.

    When the value is NaN, the rows of ``df`` whose 'date' lies between
    ``row['date'] - window_days`` days and ``row['date'] - 1`` day (both ends
    inclusive) are selected and the mean of their ``column`` values is returned.

    Parameters
    ----------
    row : row of the frame, providing 'date' and ``column`` entries.
    df : frame in which the look-back window is searched.
    window_days : length of the look-back window in days (default 7).
    column : name of the value column. Defaults to 'black_carbon_880nm', so
        existing calls behave as before; pass another name to reuse the
        function for other series.

    Returns
    -------
    The original value when present, otherwise the window mean. The mean is
    NaN when the window holds no non-NaN values.
    """
    value = row[column]
    if not pd.isna(value):
        return value
    # Look-back window is t - window_days .. t - 1; the current day is excluded
    start_date = row['date'] - pd.Timedelta(days=window_days)
    end_date = row['date'] - pd.Timedelta(days=1)
    in_window = df['date'].between(start_date, end_date)
    # Series.mean skips NaN values by default
    return df.loc[in_window, column].mean()

# Fill each missing black_carbon_880nm value, row by row, with the mean of the
# preceding 7 days; the frame itself is handed over as the look-up table
df['black_carbon_880nm'] = df.apply(fill_with_past_7_days_mean, axis=1, args=(df,))

In [14]:
# Round the black carbon series to 3 decimal places
df['black_carbon_880nm'] = df['black_carbon_880nm'].round(3)
# Preview the first rows
df.head()

Unnamed: 0,date,black_carbon_880nm
0,2024-06-09,0.403
1,2024-06-10,0.337
2,2024-06-11,0.48
3,2024-06-12,0.663
4,2024-06-13,0.667


In [15]:
# Save the cleaned frame to a new CSV file; index=False leaves the row index out of the file
df.to_csv('cleaned_black_carbon.csv', index=False)

### Blue Particulate Matter (470nm)

In [3]:
import pandas as pd
import numpy as np

# Read the CSV file; skiprows=10 drops the first 10 lines so that the
# following line is parsed as the header row
df_bpm = pd.read_csv('blue_particulate_matter.csv', skiprows=10)
# Inspect the parsed column names
df_bpm.columns

  from pandas.core import (


Index(['Date', 'Blue Particulate matter (470nm)', 'Status',
       'Blue Particulate matter (470nm).1', 'Status.1',
       'Blue Particulate matter (470nm).2', 'Status.2'],
      dtype='object')

In [5]:
# Select relevant columns: Date and the three Blue Particulate matter (470nm)
# value columns (the Status columns are left out)
columns = ['Date',
           'Blue Particulate matter (470nm)',
           'Blue Particulate matter (470nm).1',
           'Blue Particulate matter (470nm).2']
# .copy() yields an independent frame, so the column assignments made in the
# following cell write to this frame and do not raise SettingWithCopyWarning
df_bpm = df_bpm[columns].copy()

In [6]:
# The three blue particulate matter (470nm) value columns
bpm_cols = ['Blue Particulate matter (470nm)',
            'Blue Particulate matter (470nm).1',
            'Blue Particulate matter (470nm).2']

# Swap the 'No data' placeholder for NaN
df_bpm[bpm_cols] = df_bpm[bpm_cols].replace('No data', np.nan)

# Cast the value columns to numeric; values that cannot be parsed become NaN
df_bpm[bpm_cols] = df_bpm[bpm_cols].apply(pd.to_numeric, errors='coerce')

# Row-wise mean across the three value columns; NaN entries are ignored
df_bpm['blue_particulate_matter'] = df_bpm[bpm_cols].mean(axis=1, skipna=True)

df_bpm.head()



Unnamed: 0,Date,Blue Particulate matter (470nm),Blue Particulate matter (470nm).1,Blue Particulate matter (470nm).2,blue_particulate_matter
0,2024-06-09,0.93,0.31,0.17,0.47
1,2024-06-10,0.69,0.25,0.18,0.373333
2,2024-06-11,1.04,0.36,0.19,0.53
3,2024-06-12,1.52,0.48,0.26,0.753333
4,2024-06-13,1.55,0.3,0.35,0.733333


In [13]:
# Select only the Date and blue_particulate_matter (row mean) columns.
# This selection must run: without it the three raw value columns stay in
# df_bpm and end up in the exported CSV, unlike the outputs recorded below.
df_bpm = df_bpm[['Date', 'blue_particulate_matter']].copy()

In [14]:
# Rename to the output schema: lower-case 'date' and a wavelength-suffixed value column
df_bpm = df_bpm.rename(columns={'Date': 'date', 'blue_particulate_matter': 'blue_particulate_matter_470nm'})
# Preview the first rows
df_bpm.head()

Unnamed: 0,date,blue_particulate_matter_470nm
0,2024-06-09,0.47
1,2024-06-10,0.373333
2,2024-06-11,0.53
3,2024-06-12,0.753333
4,2024-06-13,0.733333


In [15]:
# Drop any row whose 'date' value is exactly "End" (case-insensitive).
# .copy() makes the filtered frame independent of the original, so the
# assignment below does not raise SettingWithCopyWarning.
is_end_marker = df_bpm['date'].str.lower().eq('end')
df_bpm = df_bpm[~is_end_marker].copy()

# Ensure the date column is in datetime format
df_bpm['date'] = pd.to_datetime(df_bpm['date'])

In [18]:
# Function to calculate mean of past 7 days
def fill_with_past_7_days_mean(row, df_bpm, window_days=7, column='blue_particulate_matter_470nm'):
    """Return ``row[column]``, or the mean of the preceding days when it is missing.

    When the value is NaN, the rows of ``df_bpm`` whose 'date' lies between
    ``row['date'] - window_days`` days and ``row['date'] - 1`` day (both ends
    inclusive) are selected and the mean of their ``column`` values is returned.

    Parameters
    ----------
    row : row of the frame, providing 'date' and ``column`` entries.
    df_bpm : frame in which the look-back window is searched.
    window_days : length of the look-back window in days (default 7).
    column : name of the value column. Defaults to
        'blue_particulate_matter_470nm', so existing calls behave as before.

    Returns
    -------
    The original value when present, otherwise the window mean. The mean is
    NaN when the window holds no non-NaN values.
    """
    value = row[column]
    if not pd.isna(value):
        return value
    # Look-back window is t - window_days .. t - 1; the current day is excluded
    start_date = row['date'] - pd.Timedelta(days=window_days)
    end_date = row['date'] - pd.Timedelta(days=1)
    in_window = df_bpm['date'].between(start_date, end_date)
    # Series.mean skips NaN values by default
    return df_bpm.loc[in_window, column].mean()

# Fill each missing blue_particulate_matter_470nm value, row by row, with the
# mean of the preceding 7 days; the frame itself is the look-up table
df_bpm['blue_particulate_matter_470nm'] = df_bpm.apply(fill_with_past_7_days_mean, axis=1, args=(df_bpm,))

In [None]:
# Round the blue particulate matter series to 3 decimal places
df_bpm['blue_particulate_matter_470nm'] = df_bpm['blue_particulate_matter_470nm'].round(3)
# Preview the first rows
df_bpm.head()

Unnamed: 0,date,blue_particulate_matter_470nm
0,2024-06-09,0.47
1,2024-06-10,0.373
2,2024-06-11,0.53
3,2024-06-12,0.753
4,2024-06-13,0.733


In [23]:
# Save the cleaned frame to a new CSV file; index=False leaves the row index out of the file
df_bpm.to_csv('blue_particulate_matter_cd.csv', index=False)

### Green Particulate Matter (520nm)

In [34]:
# Read the CSV file; skiprows=10 drops the first 10 lines so that the
# following line is parsed as the header row
df_gpm = pd.read_csv('green_particulate_matter.csv', skiprows=10)
# Inspect the parsed column names
df_gpm.columns

Index(['Date', 'Green Particulate matter (520nm)', 'Status',
       'Green Particulate matter (520nm).1', 'Status.1',
       'Green Particulate matter (520nm).2', 'Status.2'],
      dtype='object')

In [35]:
# Select relevant columns: Date and the three Green Particulate matter (520nm)
# value columns (the Status columns are left out)
columns = ['Date',
           'Green Particulate matter (520nm)',
           'Green Particulate matter (520nm).1',
           'Green Particulate matter (520nm).2']
# .copy() yields an independent frame, so the column assignments made in the
# following cell write to this frame and do not raise SettingWithCopyWarning
df_gpm = df_gpm[columns].copy()

In [36]:
# The three green particulate matter (520nm) value columns
gpm_cols = ['Green Particulate matter (520nm)',
            'Green Particulate matter (520nm).1',
            'Green Particulate matter (520nm).2']

# Swap the 'No data' placeholder for NaN
df_gpm[gpm_cols] = df_gpm[gpm_cols].replace('No data', np.nan)

# Cast the value columns to numeric; values that cannot be parsed become NaN
df_gpm[gpm_cols] = df_gpm[gpm_cols].apply(pd.to_numeric, errors='coerce')

# Row-wise mean across the three value columns; NaN entries are ignored
df_gpm['green_particulate_matter'] = df_gpm[gpm_cols].mean(axis=1, skipna=True)

df_gpm.head()

Unnamed: 0,Date,Green Particulate matter (520nm),Green Particulate matter (520nm).1,Green Particulate matter (520nm).2,green_particulate_matter
0,09-06-2024,0.87,0.3,0.17,0.446667
1,10-06-2024,0.64,0.25,0.17,0.353333
2,11-06-2024,1.0,0.34,0.18,0.506667
3,12-06-2024,1.45,0.45,0.25,0.716667
4,13-06-2024,1.47,0.3,0.34,0.703333


In [37]:
# Keep only the Date and green_particulate_matter (row mean) columns
df_gpm = df_gpm[['Date', 'green_particulate_matter']]

In [38]:
# Rename to the output schema: lower-case 'date' and a wavelength-suffixed value column
df_gpm = df_gpm.rename(columns={'Date': 'date', 'green_particulate_matter': 'green_particulate_matter_520nm'})
# Preview the first rows
df_gpm.head()

Unnamed: 0,date,green_particulate_matter_520nm
0,09-06-2024,0.446667
1,10-06-2024,0.353333
2,11-06-2024,0.506667
3,12-06-2024,0.716667
4,13-06-2024,0.703333


In [40]:
# Convert the date column to datetime; the explicit format parses the
# day-first DD-MM-YYYY strings (e.g. 09-06-2024) unambiguously
df_gpm['date'] = pd.to_datetime(df_gpm['date'], format='%d-%m-%Y')

In [None]:
# Check for missing values before imputation
df_gpm[df_gpm.isna().any(axis=1)]     # Show all rows with any missing values

Unnamed: 0,date,green_particulate_matter_520nm
207,2025-01-02,
217,2025-01-12,
218,2025-01-13,
306,2025-04-11,
319,2025-04-24,
341,2025-05-16,
342,2025-05-18,


In [46]:
# Function to calculate mean of past 7 days
def fill_with_past_7_days_mean(row, df_gpm, window_days=7, column='green_particulate_matter_520nm'):
    """Return ``row[column]``, or the mean of the preceding days when it is missing.

    When the value is NaN, the rows of ``df_gpm`` whose 'date' lies between
    ``row['date'] - window_days`` days and ``row['date'] - 1`` day (both ends
    inclusive) are selected and the mean of their ``column`` values is returned.

    Parameters
    ----------
    row : row of the frame, providing 'date' and ``column`` entries.
    df_gpm : frame in which the look-back window is searched.
    window_days : length of the look-back window in days (default 7).
    column : name of the value column. Defaults to
        'green_particulate_matter_520nm', so existing calls behave as before.

    Returns
    -------
    The original value when present, otherwise the window mean. The mean is
    NaN when the window holds no non-NaN values.
    """
    value = row[column]
    if not pd.isna(value):
        return value
    # Look-back window is t - window_days .. t - 1; the current day is excluded
    start_date = row['date'] - pd.Timedelta(days=window_days)
    end_date = row['date'] - pd.Timedelta(days=1)
    in_window = df_gpm['date'].between(start_date, end_date)
    # Series.mean skips NaN values by default
    return df_gpm.loc[in_window, column].mean()

# Fill each missing green_particulate_matter_520nm value, row by row, with the
# mean of the preceding 7 days; the frame itself is the look-up table
df_gpm['green_particulate_matter_520nm'] = df_gpm.apply(fill_with_past_7_days_mean, axis=1, args=(df_gpm,))

In [47]:
# Round the green particulate matter series to 3 decimal places
df_gpm['green_particulate_matter_520nm'] = df_gpm['green_particulate_matter_520nm'].round(3)
# Preview the first rows
df_gpm.head()

Unnamed: 0,date,green_particulate_matter_520nm
0,2024-06-09,0.447
1,2024-06-10,0.353
2,2024-06-11,0.507
3,2024-06-12,0.717
4,2024-06-13,0.703


In [48]:
# Save the cleaned frame to a new CSV file; index=False leaves the row index out of the file
df_gpm.to_csv('green_particulate_matter_cd.csv', index=False)

### Infra Red Particulate Matter (950nm)

In [3]:
# Read the CSV file; skiprows=10 drops the first 10 lines so that the
# following line is parsed as the header row
df_irpm = pd.read_csv('infra_red_particulate_matter.csv', skiprows=10)
# Inspect the parsed column names
df_irpm.columns

Index(['Date', 'Infra Red Particulate matter (950nm)', 'Status',
       'Infra Red Particulate matter (950nm).1', 'Status.1',
       'Infra Red Particulate matter (950nm).2', 'Status.2'],
      dtype='object')

In [4]:
# Select relevant columns: Date and the three Infra Red Particulate matter
# (950nm) value columns (the Status columns are left out)
columns = ['Date',
           'Infra Red Particulate matter (950nm)',
           'Infra Red Particulate matter (950nm).1',
           'Infra Red Particulate matter (950nm).2']
# .copy() yields an independent frame, so the column assignments made in the
# following cell write to this frame and do not raise SettingWithCopyWarning
df_irpm = df_irpm[columns].copy()

In [5]:
# The three infra red particulate matter (950nm) value columns
irpm_cols = ['Infra Red Particulate matter (950nm)',
             'Infra Red Particulate matter (950nm).1',
             'Infra Red Particulate matter (950nm).2']

# Swap the 'No data' placeholder for NaN
df_irpm[irpm_cols] = df_irpm[irpm_cols].replace('No data', np.nan)

# Cast the value columns to numeric; values that cannot be parsed become NaN
df_irpm[irpm_cols] = df_irpm[irpm_cols].apply(pd.to_numeric, errors='coerce')

# Row-wise mean across the three value columns; NaN entries are ignored
df_irpm['infra_red_particulate_matter'] = df_irpm[irpm_cols].mean(axis=1, skipna=True)

df_irpm.head()

Unnamed: 0,Date,Infra Red Particulate matter (950nm),Infra Red Particulate matter (950nm).1,Infra Red Particulate matter (950nm).2,infra_red_particulate_matter
0,09-06-2024,0.85,0.28,0.14,0.423333
1,10-06-2024,0.64,0.24,0.16,0.346667
2,11-06-2024,0.95,0.33,0.18,0.486667
3,12-06-2024,1.38,0.43,0.23,0.68
4,13-06-2024,1.4,0.28,0.33,0.67


In [6]:
# Keep only the Date and infra_red_particulate_matter (row mean) columns
df_irpm = df_irpm[['Date', 'infra_red_particulate_matter']]

In [7]:
# Rename to the output schema: lower-case 'date' and a wavelength-suffixed value column
df_irpm = df_irpm.rename(columns={'Date': 'date', 'infra_red_particulate_matter': 'infra_red_particulate_matter_950nm'})
# Preview the first rows
df_irpm.head()

Unnamed: 0,date,infra_red_particulate_matter_950nm
0,09-06-2024,0.423333
1,10-06-2024,0.346667
2,11-06-2024,0.486667
3,12-06-2024,0.68
4,13-06-2024,0.67


In [8]:
# Convert the date column to datetime; the explicit format parses the
# day-first DD-MM-YYYY strings (e.g. 09-06-2024) unambiguously
df_irpm['date'] = pd.to_datetime(df_irpm['date'], format='%d-%m-%Y')

In [9]:
# Check for missing values before imputation
df_irpm[df_irpm.isna().any(axis=1)]     # Show all rows with any missing values

Unnamed: 0,date,infra_red_particulate_matter_950nm
207,2025-01-02,
217,2025-01-12,
218,2025-01-13,
306,2025-04-11,
319,2025-04-24,
341,2025-05-16,
342,2025-05-18,


In [10]:
# Function to calculate mean of past 7 days
def fill_with_past_7_days_mean(row, df_irpm, window_days=7, column='infra_red_particulate_matter_950nm'):
    """Return ``row[column]``, or the mean of the preceding days when it is missing.

    When the value is NaN, the rows of ``df_irpm`` whose 'date' lies between
    ``row['date'] - window_days`` days and ``row['date'] - 1`` day (both ends
    inclusive) are selected and the mean of their ``column`` values is returned.

    Parameters
    ----------
    row : row of the frame, providing 'date' and ``column`` entries.
    df_irpm : frame in which the look-back window is searched.
    window_days : length of the look-back window in days (default 7).
    column : name of the value column. Defaults to
        'infra_red_particulate_matter_950nm', so existing calls behave as before.

    Returns
    -------
    The original value when present, otherwise the window mean. The mean is
    NaN when the window holds no non-NaN values.
    """
    value = row[column]
    if not pd.isna(value):
        return value
    # Look-back window is t - window_days .. t - 1; the current day is excluded
    start_date = row['date'] - pd.Timedelta(days=window_days)
    end_date = row['date'] - pd.Timedelta(days=1)
    in_window = df_irpm['date'].between(start_date, end_date)
    # Series.mean skips NaN values by default
    return df_irpm.loc[in_window, column].mean()

# Fill each missing infra_red_particulate_matter_950nm value, row by row, with
# the mean of the preceding 7 days; the frame itself is the look-up table
df_irpm['infra_red_particulate_matter_950nm'] = df_irpm.apply(fill_with_past_7_days_mean, axis=1, args=(df_irpm,))

In [11]:
# Round the infra red particulate matter series to 3 decimal places
df_irpm['infra_red_particulate_matter_950nm'] = df_irpm['infra_red_particulate_matter_950nm'].round(3)
# Preview the first rows
df_irpm.head()

Unnamed: 0,date,infra_red_particulate_matter_950nm
0,2024-06-09,0.423
1,2024-06-10,0.347
2,2024-06-11,0.487
3,2024-06-12,0.68
4,2024-06-13,0.67


In [12]:
# Save the cleaned frame to a new CSV file; index=False leaves the row index out of the file
df_irpm.to_csv('infra_red_particulate_matter_cd.csv', index=False)

### Nitric Oxide

In [14]:
# Read the CSV file; skiprows=10 drops the first 10 lines so that the
# following line is parsed as the header row
df_no = pd.read_csv('nitric_oxide.csv', skiprows=10)
# Inspect the parsed column names
df_no.columns

Index(['Date', 'Nitric oxide', 'Status', 'Nitric oxide.1', 'Status.1',
       'Nitric oxide.2', 'Status.2', 'Nitric oxide.3', 'Status.3',
       'Nitric oxide.4', 'Status.4', 'Nitric oxide.5', 'Status.5',
       'Nitric oxide.6', 'Status.6', 'Nitric oxide.7', 'Status.7',
       'Nitric oxide.8', 'Status.8', 'Nitric oxide.9', 'Status.9',
       'Nitric oxide.10', 'Status.10', 'Nitric oxide.11', 'Status.11',
       'Nitric oxide.12', 'Status.12', 'Nitric oxide.13', 'Status.13',
       'Nitric oxide.14', 'Status.14', 'Nitric oxide.15', 'Status.15'],
      dtype='object')

In [16]:
# Select relevant columns: Date plus the 16 nitric oxide value columns
# ('Nitric oxide', 'Nitric oxide.1', ..., 'Nitric oxide.15'); the Status
# columns are left out
columns = ['Date', 'Nitric oxide'] + [f'Nitric oxide.{i}' for i in range(1, 16)]
# .copy() yields an independent frame, so the column assignments made in the
# following cell write to this frame and do not raise SettingWithCopyWarning
df_no = df_no[columns].copy()

In [17]:
# The 16 nitric oxide value columns: 'Nitric oxide', 'Nitric oxide.1', ..., 'Nitric oxide.15'
no_cols = ['Nitric oxide'] + [f'Nitric oxide.{i}' for i in range(1, 16)]

# Swap the 'No data' placeholder for NaN
df_no[no_cols] = df_no[no_cols].replace('No data', np.nan)

# Cast the nitric oxide columns to numeric; values that cannot be parsed become NaN
df_no[no_cols] = df_no[no_cols].apply(pd.to_numeric, errors='coerce')

# Row-wise mean across all nitric oxide columns; NaN entries are ignored
df_no['nitric_oxide'] = df_no[no_cols].mean(axis=1, skipna=True)

df_no.head()

Unnamed: 0,Date,Nitric oxide,Nitric oxide.1,Nitric oxide.2,Nitric oxide.3,Nitric oxide.4,Nitric oxide.5,Nitric oxide.6,Nitric oxide.7,Nitric oxide.8,Nitric oxide.9,Nitric oxide.10,Nitric oxide.11,Nitric oxide.12,Nitric oxide.13,Nitric oxide.14,Nitric oxide.15,nitric_oxide
0,09-06-2024,11.0,,1.0,1,2,2.0,11.0,1.0,1.0,0,6.0,12.0,3,1,1,,3.785714
1,10-06-2024,9.0,,2.0,1,4,2.0,9.0,1.0,2.0,1,6.0,12.0,8,1,1,,4.214286
2,11-06-2024,13.0,,1.0,1,4,2.0,12.0,1.0,2.0,1,7.0,20.0,7,1,1,,5.214286
3,12-06-2024,18.0,,1.0,1,3,3.0,13.0,1.0,2.0,1,10.0,23.0,6,1,1,,6.0
4,13-06-2024,23.0,,1.0,2,2,3.0,10.0,1.0,1.0,1,19.0,36.0,5,2,1,,7.642857


In [18]:
# Keep only the Date and nitric_oxide (row mean) columns
df_no = df_no[['Date', 'nitric_oxide']]

In [19]:
# Rename 'Date' to lower-case 'date'; the value column keeps its name
df_no = df_no.rename(columns={'Date': 'date'})
# Preview the first rows
df_no.head()

Unnamed: 0,date,nitric_oxide
0,09-06-2024,3.785714
1,10-06-2024,4.214286
2,11-06-2024,5.214286
3,12-06-2024,6.0
4,13-06-2024,7.642857


In [20]:
# Convert the date column to datetime; the explicit format parses the
# day-first DD-MM-YYYY strings (e.g. 09-06-2024) unambiguously
df_no['date'] = pd.to_datetime(df_no['date'], format='%d-%m-%Y')

In [21]:
# Check for missing values before imputation
df_no[df_no.isna().any(axis=1)]     # Show all rows with any missing values

Unnamed: 0,date,nitric_oxide


In [22]:
# Function to calculate mean of past 7 days
def fill_with_past_7_days_mean(row, df_no, window_days=7, column='nitric_oxide'):
    """Return ``row[column]``, or the mean of the preceding days when it is missing.

    When the value is NaN, the rows of ``df_no`` whose 'date' lies between
    ``row['date'] - window_days`` days and ``row['date'] - 1`` day (both ends
    inclusive) are selected and the mean of their ``column`` values is returned.

    Parameters
    ----------
    row : row of the frame, providing 'date' and ``column`` entries.
    df_no : frame in which the look-back window is searched.
    window_days : length of the look-back window in days (default 7).
    column : name of the value column. Defaults to 'nitric_oxide', so
        existing calls behave as before.

    Returns
    -------
    The original value when present, otherwise the window mean. The mean is
    NaN when the window holds no non-NaN values.
    """
    value = row[column]
    if not pd.isna(value):
        return value
    # Look-back window is t - window_days .. t - 1; the current day is excluded
    start_date = row['date'] - pd.Timedelta(days=window_days)
    end_date = row['date'] - pd.Timedelta(days=1)
    in_window = df_no['date'].between(start_date, end_date)
    # Series.mean skips NaN values by default
    return df_no.loc[in_window, column].mean()

# Fill each missing nitric_oxide value, row by row, with the mean of the
# preceding 7 days; the frame itself is the look-up table
df_no['nitric_oxide'] = df_no.apply(fill_with_past_7_days_mean, axis=1, args=(df_no,))

In [23]:
# Round the nitric oxide series to 3 decimal places
df_no['nitric_oxide'] = df_no['nitric_oxide'].round(3)
# Preview the first rows
df_no.head()

Unnamed: 0,date,nitric_oxide
0,2024-06-09,3.786
1,2024-06-10,4.214
2,2024-06-11,5.214
3,2024-06-12,6.0
4,2024-06-13,7.643


In [24]:
# Save the cleaned frame to a new CSV file; index=False leaves the row index out of the file
df_no.to_csv('nitric_oxide_cd.csv', index=False)

### Nitrogen Dioxide

In [None]:
# Read the CSV file; skiprows=10 drops the first 10 lines so that the
# following line is parsed as the header row
df_dio = pd.read_csv('nitrogen_dioxide.csv', skiprows=10)
# Inspect the parsed column names
df_dio.columns

Index(['Date', 'Nitrogen dioxide', 'Status', 'Nitrogen dioxide.1', 'Status.1',
       'Nitrogen dioxide.2', 'Status.2', 'Nitrogen dioxide.3', 'Status.3',
       'Nitrogen dioxide.4', 'Status.4', 'Nitrogen dioxide.5', 'Status.5',
       'Nitrogen dioxide.6', 'Status.6', 'Nitrogen dioxide.7', 'Status.7',
       'Nitrogen dioxide.8', 'Status.8', 'Nitrogen dioxide.9', 'Status.9',
       'Nitrogen dioxide.10', 'Status.10', 'Nitrogen dioxide.11', 'Status.11',
       'Nitrogen dioxide.12', 'Status.12', 'Nitrogen dioxide.13', 'Status.13',
       'Nitrogen dioxide.14', 'Status.14', 'Nitrogen dioxide.15', 'Status.15'],
      dtype='object')

In [26]:
# Select relevant columns: Date plus the 16 nitrogen dioxide value columns
# ('Nitrogen dioxide', 'Nitrogen dioxide.1', ..., 'Nitrogen dioxide.15'); the
# Status columns are left out
columns = ['Date', 'Nitrogen dioxide'] + [f'Nitrogen dioxide.{i}' for i in range(1, 16)]
# .copy() yields an independent frame, so the column assignments made in the
# following cell write to this frame and do not raise SettingWithCopyWarning
df_dio = df_dio[columns].copy()

In [27]:
# The 16 nitrogen dioxide value columns:
# 'Nitrogen dioxide', 'Nitrogen dioxide.1', ..., 'Nitrogen dioxide.15'
dio_cols = ['Nitrogen dioxide'] + [f'Nitrogen dioxide.{i}' for i in range(1, 16)]

# Swap the 'No data' placeholder for NaN
df_dio[dio_cols] = df_dio[dio_cols].replace('No data', np.nan)

# Cast the nitrogen dioxide columns to numeric; values that cannot be parsed become NaN
df_dio[dio_cols] = df_dio[dio_cols].apply(pd.to_numeric, errors='coerce')

# Row-wise mean across all nitrogen dioxide columns; NaN entries are ignored.
# NOTE(review): the result is stored under the name 'nitric_oxide' although it
# holds nitrogen dioxide values; the column selection in the next cell relies
# on this exact name.
df_dio['nitric_oxide'] = df_dio[dio_cols].mean(axis=1, skipna=True)

df_dio.head()

Unnamed: 0,Date,Nitrogen dioxide,Nitrogen dioxide.1,Nitrogen dioxide.2,Nitrogen dioxide.3,Nitrogen dioxide.4,Nitrogen dioxide.5,Nitrogen dioxide.6,Nitrogen dioxide.7,Nitrogen dioxide.8,Nitrogen dioxide.9,Nitrogen dioxide.10,Nitrogen dioxide.11,Nitrogen dioxide.12,Nitrogen dioxide.13,Nitrogen dioxide.14,Nitrogen dioxide.15,nitric_oxide
0,09-06-2024,18.0,,4.0,5,8,6.0,16.0,6.0,3.0,1,13.0,18.0,9,2,3,,8.0
1,10-06-2024,16.0,,7.0,5,7,5.0,15.0,7.0,5.0,1,14.0,17.0,13,4,3,,8.5
2,11-06-2024,22.0,,10.0,9,8,9.0,21.0,9.0,7.0,2,18.0,28.0,18,5,4,,12.142857
3,12-06-2024,31.0,,10.0,11,9,11.0,25.0,11.0,9.0,2,25.0,32.0,19,4,4,,14.5
4,13-06-2024,33.0,,6.0,13,9,10.0,18.0,8.0,6.0,4,23.0,36.0,16,6,7,,13.928571


In [28]:
# Keep only Date and the row-mean column.
# NOTE(review): the mean column holds nitrogen dioxide values but was created
# under the name 'nitric_oxide' in the previous cell, hence the key used here.
df_dio = df_dio[['Date', 'nitric_oxide']]

In [33]:
# Rename columns. The mean column was created as 'nitric_oxide' in the cells
# above, so that is the key that has to be mapped to 'nitric_dioxide'; a key
# that matches no column is silently ignored by rename and would leave the
# later cells without a 'nitric_dioxide' column.
df_dio = df_dio.rename(columns={'Date': 'date', 'nitric_oxide': 'nitric_dioxide'})
df_dio.head()

Unnamed: 0,date,nitric_dioxide
0,2024-06-09,8.0
1,2024-06-10,8.5
2,2024-06-11,12.142857
3,2024-06-12,14.5
4,2024-06-13,13.928571


In [30]:
# Convert the date column to datetime; the explicit format expects
# day-first DD-MM-YYYY strings
df_dio['date'] = pd.to_datetime(df_dio['date'], format='%d-%m-%Y')

In [None]:
# Check for missing values: the mask must be built from df_dio itself (not
# from the nitric oxide frame) so that it describes the rows being displayed
df_dio[df_dio.isna().any(axis=1)]     # Show all rows with any missing values

Unnamed: 0,date,nitric_dioxide_mean


In [None]:
# Function to calculate mean of past 7 days
def fill_with_past_7_days_mean(row, df_dio, window_days=7, column='nitric_dioxide'):
    """Return ``row[column]``, or the mean of the preceding days when it is missing.

    When the value is NaN, the rows of ``df_dio`` whose 'date' lies between
    ``row['date'] - window_days`` days and ``row['date'] - 1`` day (both ends
    inclusive) are selected and the mean of their ``column`` values is returned.

    Parameters
    ----------
    row : row of the frame, providing 'date' and ``column`` entries.
    df_dio : frame in which the look-back window is searched.
    window_days : length of the look-back window in days (default 7).
    column : name of the value column. Defaults to 'nitric_dioxide', so
        existing calls behave as before.

    Returns
    -------
    The original value when present, otherwise the window mean. The mean is
    NaN when the window holds no non-NaN values.
    """
    value = row[column]
    if not pd.isna(value):
        return value
    # Look-back window is t - window_days .. t - 1; the current day is excluded
    start_date = row['date'] - pd.Timedelta(days=window_days)
    end_date = row['date'] - pd.Timedelta(days=1)
    in_window = df_dio['date'].between(start_date, end_date)
    # Series.mean skips NaN values by default
    return df_dio.loc[in_window, column].mean()

# Fill each missing nitric_dioxide value, row by row, with the mean of the
# preceding 7 days; the frame itself is the look-up table
df_dio['nitric_dioxide'] = df_dio.apply(fill_with_past_7_days_mean, axis=1, args=(df_dio,))

In [35]:
# Round the nitric_dioxide series to 3 decimal places
df_dio['nitric_dioxide'] = df_dio['nitric_dioxide'].round(3)
# Preview the first rows
df_dio.head()

Unnamed: 0,date,nitric_dioxide
0,2024-06-09,8.0
1,2024-06-10,8.5
2,2024-06-11,12.143
3,2024-06-12,14.5
4,2024-06-13,13.929


In [36]:
# Save the cleaned frame to a new CSV file; index=False leaves the row index out of the file
df_dio.to_csv('nitric_dioxide_cd.csv', index=False)

### Nitrogen oxides as nitrogen dioxide

In [51]:
# Read the CSV file; skiprows=10 drops the first 10 lines so that the
# following line is parsed as the header row
df_noandio = pd.read_csv('nitrogen_oxides_as_nitrogen_dioxide.csv', skiprows=10)
# Inspect the parsed column names
df_noandio.columns

Index(['Date', 'Nitrogen oxides as nitrogen dioxide', 'Status',
       'Nitrogen oxides as nitrogen dioxide.1', 'Status.1',
       'Nitrogen oxides as nitrogen dioxide.2', 'Status.2',
       'Nitrogen oxides as nitrogen dioxide.3', 'Status.3',
       'Nitrogen oxides as nitrogen dioxide.4', 'Status.4',
       'Nitrogen oxides as nitrogen dioxide.5', 'Status.5',
       'Nitrogen oxides as nitrogen dioxide.6', 'Status.6',
       'Nitrogen oxides as nitrogen dioxide.7', 'Status.7',
       'Nitrogen oxides as nitrogen dioxide.8', 'Status.8',
       'Nitrogen oxides as nitrogen dioxide.9', 'Status.9',
       'Nitrogen oxides as nitrogen dioxide.10', 'Status.10',
       'Nitrogen oxides as nitrogen dioxide.11', 'Status.11',
       'Nitrogen oxides as nitrogen dioxide.12', 'Status.12',
       'Nitrogen oxides as nitrogen dioxide.13', 'Status.13',
       'Nitrogen oxides as nitrogen dioxide.14', 'Status.14',
       'Nitrogen oxides as nitrogen dioxide.15', 'Status.15'],
      dtype='object')

In [52]:
# Select relevant columns: Date plus the 16 value columns
# ('Nitrogen oxides as nitrogen dioxide' and its '.1' ... '.15' variants);
# the Status columns are left out
columns = ['Date', 'Nitrogen oxides as nitrogen dioxide'] + [
    f'Nitrogen oxides as nitrogen dioxide.{i}' for i in range(1, 16)
]
# .copy() yields an independent frame, so the column assignments made in the
# following cell write to this frame and do not raise SettingWithCopyWarning
df_noandio = df_noandio[columns].copy()

In [53]:
# Measurement columns: the unsuffixed header plus its '.1' through '.15' duplicates
nox_value_cols = ['Nitrogen oxides as nitrogen dioxide'] + [
    f'Nitrogen oxides as nitrogen dioxide.{idx}' for idx in range(1, 16)
]

# Swap the 'No data' placeholder for NaN
df_noandio[nox_value_cols] = df_noandio[nox_value_cols].replace('No data', np.nan)

# Coerce the measurement columns to numeric; unparseable entries become NaN
df_noandio[nox_value_cols] = df_noandio[nox_value_cols].apply(pd.to_numeric, errors='coerce')

# Row-wise mean over the measurement columns, skipping NaN
df_noandio['nitrogen_oxides_as_nitrogen_dioxide'] = df_noandio[nox_value_cols].mean(axis=1, skipna=True)

df_noandio.head()

Unnamed: 0,Date,Nitrogen oxides as nitrogen dioxide,Nitrogen oxides as nitrogen dioxide.1,Nitrogen oxides as nitrogen dioxide.2,Nitrogen oxides as nitrogen dioxide.3,Nitrogen oxides as nitrogen dioxide.4,Nitrogen oxides as nitrogen dioxide.5,Nitrogen oxides as nitrogen dioxide.6,Nitrogen oxides as nitrogen dioxide.7,Nitrogen oxides as nitrogen dioxide.8,Nitrogen oxides as nitrogen dioxide.9,Nitrogen oxides as nitrogen dioxide.10,Nitrogen oxides as nitrogen dioxide.11,Nitrogen oxides as nitrogen dioxide.12,Nitrogen oxides as nitrogen dioxide.13,Nitrogen oxides as nitrogen dioxide.14,Nitrogen oxides as nitrogen dioxide.15,nitrogen_oxides_as_nitrogen_dioxide
0,09-06-2024,34.0,,6.0,6,10,8.0,32.0,7.0,5.0,2,22.0,37.0,13,4,4,,13.571429
1,10-06-2024,29.0,,9.0,7,13,8.0,29.0,9.0,7.0,2,24.0,35.0,25,7,5,,14.928571
2,11-06-2024,41.0,,12.0,11,14,12.0,39.0,11.0,9.0,3,28.0,59.0,28,7,5,,19.928571
3,12-06-2024,59.0,,12.0,12,14,15.0,44.0,13.0,12.0,3,41.0,67.0,29,6,6,,23.785714
4,13-06-2024,68.0,,8.0,15,11,14.0,33.0,10.0,8.0,6,52.0,91.0,23,9,9,,25.5


In [54]:
# Keep only the Date column and the row-wise mean column 'nitrogen_oxides_as_nitrogen_dioxide'
df_noandio = df_noandio[['Date', 'nitrogen_oxides_as_nitrogen_dioxide']]
df_noandio.head()

Unnamed: 0,Date,nitrogen_oxides_as_nitrogen_dioxide
0,09-06-2024,13.571429
1,10-06-2024,14.928571
2,11-06-2024,19.928571
3,12-06-2024,23.785714
4,13-06-2024,25.5


In [55]:
# Rename 'Date' to lowercase 'date'; the mean column keeps its name
df_noandio = df_noandio.rename(columns={'Date': 'date'})
df_noandio.head()

Unnamed: 0,date,nitrogen_oxides_as_nitrogen_dioxide
0,09-06-2024,13.571429
1,10-06-2024,14.928571
2,11-06-2024,19.928571
3,12-06-2024,23.785714
4,13-06-2024,25.5


In [56]:
# Parse the day-first 'DD-MM-YYYY' strings (e.g. '09-06-2024') into datetime values
df_noandio['date'] = pd.to_datetime(df_noandio['date'], format='%d-%m-%Y')

In [57]:
# Check for missing values: an empty result means no row has a NaN in any column
df_noandio[df_noandio.isna().any(axis=1)]     # Show all rows with any missing values

Unnamed: 0,date,nitrogen_oxides_as_nitrogen_dioxide


In [None]:
# Function to fill a missing value with the mean of the past 7 days
def fill_with_past_7_days_mean(row, df_noandio, window_days=7):
    """Return the row's NOx mean, or a trailing-window mean when it is missing.

    Parameters
    ----------
    row : pandas.Series
        One row of the frame; must provide 'date' and
        'nitrogen_oxides_as_nitrogen_dioxide'.
    df_noandio : pandas.DataFrame
        Frame with the same two columns; supplies the history to average.
    window_days : int, default 7
        How many days before the row's date to include (t-window_days to t-1).

    Returns
    -------
    float
        The row's own value when it is present; otherwise the mean of the
        non-NaN values inside the window (NaN if the window holds none).
    """
    value_col = 'nitrogen_oxides_as_nitrogen_dioxide'
    if pd.isna(row[value_col]):
        # Get the date range for the past window (t-window_days to t-1)
        start_date = row['date'] - pd.Timedelta(days=window_days)
        end_date = row['date'] - pd.Timedelta(days=1)
        # Both halves of the mask must come from the same frame being filled
        past_data = df_noandio[(df_noandio['date'] >= start_date) & (df_noandio['date'] <= end_date)]
        # Series.mean skips NaN by default
        return past_data[value_col].mean()
    return row[value_col]

# Fill missing values in the existing mean column (not a new column)
df_noandio['nitrogen_oxides_as_nitrogen_dioxide'] = df_noandio.apply(lambda row: fill_with_past_7_days_mean(row, df_noandio), axis=1)

In [60]:
# Round the mean column to 3 decimal places
df_noandio['nitrogen_oxides_as_nitrogen_dioxide'] = df_noandio['nitrogen_oxides_as_nitrogen_dioxide'].round(3)
df_noandio.head()

Unnamed: 0,date,nitrogen_oxides_as_nitrogen_dioxide
0,2024-06-09,13.571
1,2024-06-10,14.929
2,2024-06-11,19.929
3,2024-06-12,23.786
4,2024-06-13,25.5


In [61]:
# Write the frame to a new CSV file; index=False leaves the row index out
df_noandio.to_csv('nitrogen_oxides_as_nitrogen_dioxide_cd.csv', index=False)

### Ozone

In [62]:
# Load the ozone CSV, skipping the first 10 lines of the file, then list the column labels
df_ozone = pd.read_csv('ozone.csv', skiprows=10)
df_ozone.columns

Index(['Date', 'Ozone', 'Status', 'Ozone.1', 'Status.1', 'Ozone.2', 'Status.2',
       'Ozone.3', 'Status.3', 'Ozone.4', 'Status.4', 'Ozone.5', 'Status.5',
       'Ozone.6', 'Status.6', 'Ozone.7', 'Status.7', 'Ozone.8', 'Status.8',
       'Ozone.9', 'Status.9', 'Ozone.10', 'Status.10'],
      dtype='object')

In [63]:
# Keep Date plus the Ozone measurement columns ('Ozone', 'Ozone.1' through 'Ozone.10');
# the Status columns are left out
columns = ['Date', 'Ozone'] + [f'Ozone.{idx}' for idx in range(1, 11)]
df_ozone = df_ozone[columns]

In [64]:
# Measurement columns: 'Ozone' plus its '.1' through '.10' duplicates
ozone_value_cols = ['Ozone'] + [f'Ozone.{idx}' for idx in range(1, 11)]

# Swap the 'No data' placeholder for NaN
df_ozone[ozone_value_cols] = df_ozone[ozone_value_cols].replace('No data', np.nan)

# Coerce the measurement columns to numeric; unparseable entries become NaN
df_ozone[ozone_value_cols] = df_ozone[ozone_value_cols].apply(pd.to_numeric, errors='coerce')

# Row-wise mean over the measurement columns, skipping NaN
df_ozone['ozone'] = df_ozone[ozone_value_cols].mean(axis=1, skipna=True)

df_ozone.head()

Unnamed: 0,Date,Ozone,Ozone.1,Ozone.2,Ozone.3,Ozone.4,Ozone.5,Ozone.6,Ozone.7,Ozone.8,Ozone.9,Ozone.10,ozone
0,08-06-2024,55.0,,63.0,59.0,57.0,59.0,65.0,60.0,61.0,62.0,62.0,60.3
1,09-06-2024,54.0,,63.0,59.0,,56.0,60.0,60.0,65.0,59.0,59.0,59.444444
2,10-06-2024,56.0,,64.0,57.0,62.0,59.0,68.0,57.0,63.0,61.0,62.0,60.9
3,11-06-2024,56.0,,64.0,53.0,58.0,60.0,67.0,54.0,67.0,62.0,64.0,60.5
4,12-06-2024,55.0,,70.0,60.0,60.0,60.0,67.0,61.0,75.0,69.0,68.0,64.5


In [65]:
# Keep only the Date column and the row-wise mean column 'ozone'
df_ozone = df_ozone[['Date', 'ozone']]
df_ozone.head()

Unnamed: 0,Date,ozone
0,08-06-2024,60.3
1,09-06-2024,59.444444
2,10-06-2024,60.9
3,11-06-2024,60.5
4,12-06-2024,64.5


In [66]:
# Rename 'Date' to 'date' and the mean column 'ozone' to 'Ozone'
df_ozone = df_ozone.rename(columns={'Date': 'date', 'ozone' : 'Ozone'})
df_ozone.head()

Unnamed: 0,date,Ozone
0,08-06-2024,60.3
1,09-06-2024,59.444444
2,10-06-2024,60.9
3,11-06-2024,60.5
4,12-06-2024,64.5


In [67]:
# Parse the day-first 'DD-MM-YYYY' strings (e.g. '08-06-2024') into datetime values
df_ozone['date'] = pd.to_datetime(df_ozone['date'], format='%d-%m-%Y')

In [68]:
# Check for missing values: an empty result means no row has a NaN in any column
df_ozone[df_ozone.isna().any(axis=1)]     # Show all rows with any missing values

Unnamed: 0,date,Ozone


In [69]:
# Fill helper: a missing 'Ozone' value becomes the mean of the preceding days
def fill_with_past_7_days_mean(row, df_ozone, window_days=7):
    """Return the row's 'Ozone' value, or a trailing-window mean when it is NaN.

    row         -- one row of the frame, providing 'date' and 'Ozone'
    df_ozone    -- frame with 'date' and 'Ozone' columns used as history
    window_days -- number of days before the row's date to average
                   (inclusive range t-window_days .. t-1)

    The mean skips NaN values; it is NaN when the window holds no values.
    """
    current = row['Ozone']
    if not pd.isna(current):
        return current

    day = row['date']
    window_start = day - pd.Timedelta(days=window_days)
    window_end = day - pd.Timedelta(days=1)
    # between() is inclusive on both ends, matching >= start and <= end
    in_window = df_ozone['date'].between(window_start, window_end)
    return df_ozone.loc[in_window, 'Ozone'].mean()

# Apply the function row by row to fill missing 'Ozone' values; present values pass through unchanged
df_ozone['Ozone'] = df_ozone.apply(lambda row: fill_with_past_7_days_mean(row, df_ozone), axis=1)

In [70]:
# Round the 'Ozone' column to 3 decimal places
df_ozone['Ozone'] = df_ozone['Ozone'].round(3)
df_ozone.head()

Unnamed: 0,date,Ozone
0,2024-06-08,60.3
1,2024-06-09,59.444
2,2024-06-10,60.9
3,2024-06-11,60.5
4,2024-06-12,64.5


In [71]:
# Write the frame to a new CSV file; index=False leaves the row index out
df_ozone.to_csv('Ozone_cd.csv', index=False)

### PM2.5

In [72]:
# Load the PM2.5 CSV, skipping the first 10 lines of the file, then list the column labels
df_pm2 = pd.read_csv('PM2.5.csv', skiprows=10)
df_pm2.columns

Index(['Date', 'PM2.5 particulate matter (Hourly measured)', 'Status',
       'PM2.5 particulate matter (Hourly measured).1', 'Status.1',
       'PM2.5 particulate matter (Hourly measured).2', 'Status.2',
       'PM2.5 particulate matter (Hourly measured).3', 'Status.3',
       'PM2.5 particulate matter (Hourly measured).4', 'Status.4',
       'PM2.5 particulate matter (Hourly measured).5', 'Status.5',
       'PM2.5 particulate matter (Hourly measured).6', 'Status.6',
       'PM2.5 particulate matter (Hourly measured).7', 'Status.7',
       'PM2.5 particulate matter (Hourly measured).8', 'Status.8',
       'PM2.5 particulate matter (Hourly measured).9', 'Status.9',
       'PM2.5 particulate matter (Hourly measured).10', 'Status.10',
       'PM2.5 particulate matter (Hourly measured).11', 'Status.11',
       'PM2.5 particulate matter (Hourly measured).12', 'Status.12',
       'PM2.5 particulate matter (Hourly measured).13', 'Status.13',
       'PM2.5 particulate matter (Hourly measured)

In [73]:
# Keep Date plus the PM2.5 measurement columns (unsuffixed header and '.1' through '.14');
# the Status columns are left out
columns = ['Date', 'PM2.5 particulate matter (Hourly measured)'] + [
    f'PM2.5 particulate matter (Hourly measured).{idx}' for idx in range(1, 15)
]
df_pm2 = df_pm2[columns]

In [74]:
# Measurement columns: the unsuffixed header plus its '.1' through '.14' duplicates
pm2_value_cols = ['PM2.5 particulate matter (Hourly measured)'] + [
    f'PM2.5 particulate matter (Hourly measured).{idx}' for idx in range(1, 15)
]

# Swap the 'No data' placeholder for NaN
df_pm2[pm2_value_cols] = df_pm2[pm2_value_cols].replace('No data', np.nan)

# Coerce the measurement columns to numeric; unparseable entries become NaN
df_pm2[pm2_value_cols] = df_pm2[pm2_value_cols].apply(pd.to_numeric, errors='coerce')

# Row-wise mean over the measurement columns, skipping NaN
df_pm2['PM2.5'] = df_pm2[pm2_value_cols].mean(axis=1, skipna=True)

df_pm2.head()

Unnamed: 0,Date,PM2.5 particulate matter (Hourly measured),PM2.5 particulate matter (Hourly measured).1,PM2.5 particulate matter (Hourly measured).2,PM2.5 particulate matter (Hourly measured).3,PM2.5 particulate matter (Hourly measured).4,PM2.5 particulate matter (Hourly measured).5,PM2.5 particulate matter (Hourly measured).6,PM2.5 particulate matter (Hourly measured).7,PM2.5 particulate matter (Hourly measured).8,PM2.5 particulate matter (Hourly measured).9,PM2.5 particulate matter (Hourly measured).10,PM2.5 particulate matter (Hourly measured).11,PM2.5 particulate matter (Hourly measured).12,PM2.5 particulate matter (Hourly measured).13,PM2.5 particulate matter (Hourly measured).14,PM2.5
0,09-06-2024,5,,4.0,4,,4.0,,4,4.0,,,,4,4,,4.125
1,10-06-2024,3,,3.0,3,,3.0,,3,3.0,,,,3,3,,3.0
2,11-06-2024,5,,4.0,4,,4.0,,5,4.0,,,,4,4,,4.25
3,12-06-2024,6,,4.0,5,,4.0,,5,5.0,,,,6,4,,4.875
4,13-06-2024,6,,4.0,5,,4.0,,4,4.0,,,,6,4,,4.625


In [75]:
# Keep only the Date column and the row-wise mean column 'PM2.5'
df_pm2 = df_pm2[['Date', 'PM2.5']]
df_pm2.head()

Unnamed: 0,Date,PM2.5
0,09-06-2024,4.125
1,10-06-2024,3.0
2,11-06-2024,4.25
3,12-06-2024,4.875
4,13-06-2024,4.625


In [76]:
# Rename 'Date' to lowercase 'date'; the mean column keeps its name
df_pm2 = df_pm2.rename(columns={'Date': 'date'})
df_pm2.head()

Unnamed: 0,date,PM2.5
0,09-06-2024,4.125
1,10-06-2024,3.0
2,11-06-2024,4.25
3,12-06-2024,4.875
4,13-06-2024,4.625


In [77]:
# Parse the day-first 'DD-MM-YYYY' strings (e.g. '09-06-2024') into datetime values
df_pm2['date'] = pd.to_datetime(df_pm2['date'], format='%d-%m-%Y')

In [78]:
# Check for missing values: an empty result means no row has a NaN in any column
df_pm2[df_pm2.isna().any(axis=1)]     # Show all rows with any missing values

Unnamed: 0,date,PM2.5


In [None]:
# Only needed if the missing-value check reports rows
# Fill helper: a missing 'PM2.5' value becomes the mean of the preceding days
def fill_with_past_7_days_mean(row, df_pm2, window_days=7):
    """Return the row's 'PM2.5' value, or a trailing-window mean when it is NaN.

    row         -- one row of the frame, providing 'date' and 'PM2.5'
    df_pm2      -- frame with 'date' and 'PM2.5' columns used as history
    window_days -- number of days before the row's date to average
                   (inclusive range t-window_days .. t-1)

    The mean skips NaN values; it is NaN when the window holds no values.
    """
    current = row['PM2.5']
    if not pd.isna(current):
        return current

    day = row['date']
    window_start = day - pd.Timedelta(days=window_days)
    window_end = day - pd.Timedelta(days=1)
    # between() is inclusive on both ends, matching >= start and <= end
    in_window = df_pm2['date'].between(window_start, window_end)
    return df_pm2.loc[in_window, 'PM2.5'].mean()

# Apply the function row by row to fill missing 'PM2.5' values; present values pass through unchanged
df_pm2['PM2.5'] = df_pm2.apply(lambda row: fill_with_past_7_days_mean(row, df_pm2), axis=1)

In [81]:
# Round the 'PM2.5' column to 3 decimal places
df_pm2['PM2.5'] = df_pm2['PM2.5'].round(3)
df_pm2.head()

Unnamed: 0,date,PM2.5
0,2024-06-09,4.125
1,2024-06-10,3.0
2,2024-06-11,4.25
3,2024-06-12,4.875
4,2024-06-13,4.625


In [82]:
# Write the frame to a new CSV file; index=False leaves the row index out
df_pm2.to_csv('PM2.5_cd.csv', index=False)

### PM10

In [83]:
# Load the PM10 CSV, skipping the first 10 lines of the file, then list the column labels
df_pm10 = pd.read_csv('PM10.csv', skiprows=10)
df_pm10.columns

Index(['Date', 'PM10 particulate matter (Hourly measured)', 'Status',
       'PM10 particulate matter (Hourly measured).1', 'Status.1',
       'PM10 particulate matter (Hourly measured).2', 'Status.2',
       'PM10 particulate matter (Hourly measured).3', 'Status.3',
       'PM10 particulate matter (Hourly measured).4', 'Status.4',
       'PM10 particulate matter (Hourly measured).5', 'Status.5',
       'PM10 particulate matter (Hourly measured).6', 'Status.6',
       'PM10 particulate matter (Hourly measured).7', 'Status.7',
       'PM10 particulate matter (Hourly measured).8', 'Status.8',
       'PM10 particulate matter (Hourly measured).9', 'Status.9',
       'PM10 particulate matter (Hourly measured).10', 'Status.10',
       'PM10 particulate matter (Hourly measured).11', 'Status.11',
       'PM10 particulate matter (Hourly measured).12', 'Status.12',
       'PM10 particulate matter (Hourly measured).13', 'Status.13'],
      dtype='object')

In [84]:
# Keep Date plus the PM10 measurement columns (unsuffixed header and '.1' through '.13');
# the Status columns are left out
columns = ['Date', 'PM10 particulate matter (Hourly measured)'] + [
    f'PM10 particulate matter (Hourly measured).{idx}' for idx in range(1, 14)
]
df_pm10 = df_pm10[columns]

In [85]:
# Measurement columns: the unsuffixed header plus its '.1' through '.13' duplicates
pm10_value_cols = ['PM10 particulate matter (Hourly measured)'] + [
    f'PM10 particulate matter (Hourly measured).{idx}' for idx in range(1, 14)
]

# Swap the 'No data' placeholder for NaN
df_pm10[pm10_value_cols] = df_pm10[pm10_value_cols].replace('No data', np.nan)

# Coerce the measurement columns to numeric; unparseable entries become NaN
df_pm10[pm10_value_cols] = df_pm10[pm10_value_cols].apply(pd.to_numeric, errors='coerce')

# Row-wise mean over the measurement columns, skipping NaN
df_pm10['PM10'] = df_pm10[pm10_value_cols].mean(axis=1, skipna=True)

df_pm10.head()

Unnamed: 0,Date,PM10 particulate matter (Hourly measured),PM10 particulate matter (Hourly measured).1,PM10 particulate matter (Hourly measured).2,PM10 particulate matter (Hourly measured).3,PM10 particulate matter (Hourly measured).4,PM10 particulate matter (Hourly measured).5,PM10 particulate matter (Hourly measured).6,PM10 particulate matter (Hourly measured).7,PM10 particulate matter (Hourly measured).8,PM10 particulate matter (Hourly measured).9,PM10 particulate matter (Hourly measured).10,PM10 particulate matter (Hourly measured).11,PM10 particulate matter (Hourly measured).12,PM10 particulate matter (Hourly measured).13,PM10
0,09-06-2024,9,,7.0,7,7.0,13.0,7,7.0,,,10.0,8,7,,8.2
1,10-06-2024,7,,5.0,5,5.0,9.0,5,5.0,,,8.0,7,5,,6.1
2,11-06-2024,10,,8.0,7,7.0,11.0,8,8.0,,,12.0,8,8,,8.7
3,12-06-2024,15,,8.0,9,9.0,14.0,9,8.0,,,13.0,11,7,,10.3
4,13-06-2024,21,,7.0,10,8.0,11.0,8,9.0,,,13.0,10,7,,10.4


In [86]:
# Keep only the Date column and the row-wise mean column 'PM10'
df_pm10 = df_pm10[['Date', 'PM10']]
df_pm10.head()

Unnamed: 0,Date,PM10
0,09-06-2024,8.2
1,10-06-2024,6.1
2,11-06-2024,8.7
3,12-06-2024,10.3
4,13-06-2024,10.4


In [87]:
# Rename 'Date' to lowercase 'date'; the mean column keeps its name
df_pm10 = df_pm10.rename(columns={'Date': 'date'})
df_pm10.head()

Unnamed: 0,date,PM10
0,09-06-2024,8.2
1,10-06-2024,6.1
2,11-06-2024,8.7
3,12-06-2024,10.3
4,13-06-2024,10.4


In [88]:
# Parse the day-first 'DD-MM-YYYY' strings (e.g. '09-06-2024') into datetime values
df_pm10['date'] = pd.to_datetime(df_pm10['date'], format='%d-%m-%Y')

In [89]:
# Check for missing values: an empty result means no row has a NaN in any column
df_pm10[df_pm10.isna().any(axis=1)]     # Show all rows with any missing values

Unnamed: 0,date,PM10


In [90]:
# Only needed if the missing-value check reports rows
# Fill helper: a missing 'PM10' value becomes the mean of the preceding days
def fill_with_past_7_days_mean(row, df_pm10, window_days=7):
    """Return the row's 'PM10' value, or a trailing-window mean when it is NaN.

    row         -- one row of the frame, providing 'date' and 'PM10'
    df_pm10     -- frame with 'date' and 'PM10' columns used as history
    window_days -- number of days before the row's date to average
                   (inclusive range t-window_days .. t-1)

    The mean skips NaN values; it is NaN when the window holds no values.
    """
    current = row['PM10']
    if not pd.isna(current):
        return current

    day = row['date']
    window_start = day - pd.Timedelta(days=window_days)
    window_end = day - pd.Timedelta(days=1)
    # between() is inclusive on both ends, matching >= start and <= end
    in_window = df_pm10['date'].between(window_start, window_end)
    return df_pm10.loc[in_window, 'PM10'].mean()

# Apply the function row by row to fill missing 'PM10' values; present values pass through unchanged
df_pm10['PM10'] = df_pm10.apply(lambda row: fill_with_past_7_days_mean(row, df_pm10), axis=1)

In [91]:
# Round the 'PM10' column to 3 decimal places
df_pm10['PM10'] = df_pm10['PM10'].round(3)
df_pm10.head()

Unnamed: 0,date,PM10
0,2024-06-09,8.2
1,2024-06-10,6.1
2,2024-06-11,8.7
3,2024-06-12,10.3
4,2024-06-13,10.4


In [92]:
# Write the frame to a new CSV file; index=False leaves the row index out
df_pm10.to_csv('PM10_cd.csv', index=False)

### Red Particulate matter (660nm)

In [93]:
# Load the red particulate matter CSV, skipping the first 10 lines of the file, then list the column labels
df_rpm = pd.read_csv('red_particulate_matter.csv', skiprows=10)
df_rpm.columns

Index(['Date', 'Red Particulate matter (660nm)', 'Status',
       'Red Particulate matter (660nm).1', 'Status.1',
       'Red Particulate matter (660nm).2', 'Status.2'],
      dtype='object')

In [None]:
# Keep Date plus the three Red Particulate matter (660nm) measurement columns;
# the Status columns are left out
columns = ['Date'] + [
    'Red Particulate matter (660nm)' + suffix for suffix in ('', '.1', '.2')
]
df_rpm = df_rpm[columns]

In [94]:
# Measurement columns: the unsuffixed header plus its '.1' and '.2' duplicates
rpm_value_cols = [
    'Red Particulate matter (660nm)' + suffix for suffix in ('', '.1', '.2')
]

# Swap the 'No data' placeholder for NaN
df_rpm[rpm_value_cols] = df_rpm[rpm_value_cols].replace('No data', np.nan)

# Coerce the measurement columns to numeric; unparseable entries become NaN
df_rpm[rpm_value_cols] = df_rpm[rpm_value_cols].apply(pd.to_numeric, errors='coerce')

# Row-wise mean over the measurement columns, skipping NaN
df_rpm['red_particulate_matter_660nm'] = df_rpm[rpm_value_cols].mean(axis=1, skipna=True)

df_rpm.head()

Unnamed: 0,Date,Red Particulate matter (660nm),Status,Red Particulate matter (660nm).1,Status.1,Red Particulate matter (660nm).2,Status.2,red_particulate_matter_660nm
0,09-06-2024,0.82,V ugm-3,0.26,V ugm-3,0.15,V ugm-3,0.41
1,10-06-2024,0.62,V ugm-3,0.21,V ugm-3,0.15,V ugm-3,0.326667
2,11-06-2024,0.95,V ugm-3,0.32,V ugm-3,0.17,V ugm-3,0.48
3,12-06-2024,1.37,V ugm-3,0.41,V ugm-3,0.23,V ugm-3,0.67
4,13-06-2024,1.4,V ugm-3,0.28,V ugm-3,0.33,V ugm-3,0.67


In [95]:
# Keep only the Date column and the row-wise mean column 'red_particulate_matter_660nm'
df_rpm = df_rpm[['Date', 'red_particulate_matter_660nm']]
df_rpm.head()

Unnamed: 0,Date,red_particulate_matter_660nm
0,09-06-2024,0.41
1,10-06-2024,0.326667
2,11-06-2024,0.48
3,12-06-2024,0.67
4,13-06-2024,0.67


In [96]:
# Rename 'Date' to lowercase 'date'; the mean column keeps its name
df_rpm = df_rpm.rename(columns={'Date': 'date'})
df_rpm.head()

Unnamed: 0,date,red_particulate_matter_660nm
0,09-06-2024,0.41
1,10-06-2024,0.326667
2,11-06-2024,0.48
3,12-06-2024,0.67
4,13-06-2024,0.67


In [97]:
# Parse the day-first 'DD-MM-YYYY' strings (e.g. '09-06-2024') into datetime values
df_rpm['date'] = pd.to_datetime(df_rpm['date'], format='%d-%m-%Y')

In [99]:
# Check for missing values: lists every row with a NaN in any column
df_rpm[df_rpm.isna().any(axis=1)]     # Show all rows with any missing values

Unnamed: 0,date,red_particulate_matter_660nm
207,2025-01-02,
217,2025-01-12,
218,2025-01-13,
306,2025-04-11,
319,2025-04-24,
341,2025-05-16,
342,2025-05-18,


In [100]:
# Only needed if the missing-value check reports rows
# Fill helper: a missing 'red_particulate_matter_660nm' value becomes the mean of the preceding days
def fill_with_past_7_days_mean(row, df_rpm, window_days=7):
    """Return the row's red particulate mean, or a trailing-window mean when it is NaN.

    row         -- one row of the frame, providing 'date' and
                   'red_particulate_matter_660nm'
    df_rpm      -- frame with those two columns, used as history
    window_days -- number of days before the row's date to average
                   (inclusive range t-window_days .. t-1)

    The mean skips NaN values; it is NaN when the window holds no values.
    """
    current = row['red_particulate_matter_660nm']
    if not pd.isna(current):
        return current

    day = row['date']
    window_start = day - pd.Timedelta(days=window_days)
    window_end = day - pd.Timedelta(days=1)
    # between() is inclusive on both ends, matching >= start and <= end
    in_window = df_rpm['date'].between(window_start, window_end)
    return df_rpm.loc[in_window, 'red_particulate_matter_660nm'].mean()

# Fill missing red_particulate_matter_660nm values row by row
df_rpm['red_particulate_matter_660nm'] = df_rpm.apply(
    fill_with_past_7_days_mean, axis=1, args=(df_rpm,)
)

In [101]:
# Round the red particulate matter column to three decimal places
df_rpm = df_rpm.round({'red_particulate_matter_660nm': 3})
df_rpm.head()

Unnamed: 0,date,red_particulate_matter_660nm
0,2024-06-09,0.41
1,2024-06-10,0.327
2,2024-06-11,0.48
3,2024-06-12,0.67
4,2024-06-13,0.67


In [102]:
# Write the cleaned red particulate matter series to CSV without the index column
df_rpm.to_csv(path_or_buf='red_particulate_matter_cd.csv', index=False)

### UV Particulate Matter (370nm)

In [None]:
import pandas as pd
import numpy as np

# Load the UV particulate matter export, skipping the first 10 lines of the file
df_UV = pd.read_csv(filepath_or_buffer='UV_particulate_matter_(370nm).csv', skiprows=10)
df_UV.columns

Index(['Date', 'UV Particulate Matter (370nm)', 'Status',
       'UV Particulate Matter (370nm).1', 'Status.1',
       'UV Particulate Matter (370nm).2', 'Status.2'],
      dtype='object')

In [14]:
# Select relevant columns: Date and UV particulate matter values for the three sites
columns = ['Date',
           'UV Particulate Matter (370nm)',
           'UV Particulate Matter (370nm).1',
           'UV Particulate Matter (370nm).2',
           ]
# .copy() makes the subset an independent frame, so the column assignments
# in the following cells do not trigger SettingWithCopyWarning
df_UV = df_UV[columns].copy()

In [15]:
# The three per-site UV particulate matter measurement columns
uv_site_columns = [
    'UV Particulate Matter (370nm)',
    'UV Particulate Matter (370nm).1',
    'UV Particulate Matter (370nm).2',
]

# Convert the UV particulate matter columns to numeric. errors='coerce' turns
# every unparseable entry, including the 'No data' placeholder, into NaN, so a
# separate replace step is not needed.
df_UV[uv_site_columns] = df_UV[uv_site_columns].apply(pd.to_numeric, errors='coerce')

# Calculate the mean UV particulate matter value per row, ignoring NaN
df_UV['UV_particulate_matter_370nm'] = df_UV[uv_site_columns].mean(axis=1, skipna=True)

df_UV.head()

Unnamed: 0,Date,UV Particulate Matter (370nm),UV Particulate Matter (370nm).1,UV Particulate Matter (370nm).2,UV_particulate_matter_370nm
0,09-06-2024,0.89,0.33,0.18,0.466667
1,10-06-2024,0.68,0.25,0.16,0.363333
2,11-06-2024,1.0,0.37,0.18,0.516667
3,12-06-2024,1.48,0.48,0.28,0.746667
4,13-06-2024,1.48,0.3,0.35,0.71


In [16]:
# Keep only the date and the per-row UV particulate matter mean
df_UV = df_UV.loc[:, ['Date', 'UV_particulate_matter_370nm']]
df_UV.head()

Unnamed: 0,Date,UV_particulate_matter_370nm
0,09-06-2024,0.466667
1,10-06-2024,0.363333
2,11-06-2024,0.516667
3,12-06-2024,0.746667
4,13-06-2024,0.71


In [17]:
# Lower-case the name of the date column
df_UV = df_UV.rename({'Date': 'date'}, axis='columns')
df_UV.head()

Unnamed: 0,date,UV_particulate_matter_370nm
0,09-06-2024,0.466667
1,10-06-2024,0.363333
2,11-06-2024,0.516667
3,12-06-2024,0.746667
4,13-06-2024,0.71


In [18]:
# Parse the DD-MM-YYYY strings in the date column into datetime values
df_UV['date'] = df_UV['date'].pipe(pd.to_datetime, format='%d-%m-%Y')

In [19]:
# Display every row that has at least one missing value
df_UV.loc[df_UV.isna().any(axis='columns')]

Unnamed: 0,date,UV_particulate_matter_370nm
207,2025-01-02,
217,2025-01-12,
218,2025-01-13,
306,2025-04-11,
318,2025-04-23,
319,2025-04-24,
341,2025-05-16,
342,2025-05-18,
361,2025-06-06,


In [20]:
# Only needed if there are missing values corresponding to the date
def fill_with_past_7_days_mean(row, df_UV, window_days=7,
                               column='UV_particulate_matter_370nm'):
    """Return ``row[column]``, or the trailing-window mean when it is missing.

    Parameters
    ----------
    row : pandas.Series
        One row of the frame; must provide ``'date'`` and ``column``.
    df_UV : pandas.DataFrame
        Frame searched for earlier observations; needs a ``'date'`` column
        and ``column``.
    window_days : int, default 7
        Length of the look-back window in days.
    column : str, default 'UV_particulate_matter_370nm'
        Name of the value column to read and fill. Exposed as a parameter so
        the same helper can serve other pollutant columns.

    Returns
    -------
    float
        The existing value when it is not missing; otherwise the mean of
        ``column`` over rows dated from ``date - window_days`` to
        ``date - 1 day`` (both inclusive). NaN when that window holds no
        non-missing value.
    """
    current = row[column]
    if not pd.isna(current):
        return current

    # Window covers t-window_days .. t-1, so the row itself is never included
    window_start = row['date'] - pd.Timedelta(days=window_days)
    window_end = row['date'] - pd.Timedelta(days=1)
    in_window = df_UV['date'].between(window_start, window_end)

    # Series.mean skips NaN by default
    return df_UV.loc[in_window, column].mean()

# Fill missing UV_particulate_matter_370nm values row by row
df_UV['UV_particulate_matter_370nm'] = df_UV.apply(
    fill_with_past_7_days_mean, axis=1, args=(df_UV,)
)

In [21]:
# Round the UV particulate matter column to three decimal places
df_UV = df_UV.round({'UV_particulate_matter_370nm': 3})
df_UV.head()

Unnamed: 0,date,UV_particulate_matter_370nm
0,2024-06-09,0.467
1,2024-06-10,0.363
2,2024-06-11,0.517
3,2024-06-12,0.747
4,2024-06-13,0.71


In [22]:
# Write the cleaned UV particulate matter series to CSV without the index column
df_UV.to_csv(path_or_buf='UV_particulate_matter_370nm_cd.csv', index=False)

### Yellow Particulate matter (590nm)

In [23]:
# Load the yellow particulate matter export, skipping the first 10 lines of the file
df_ypm = pd.read_csv(filepath_or_buffer='yellow_particulate_matter.csv', skiprows=10)
df_ypm.columns

Index(['Date', 'Yellow Particulate matter (590nm)', 'Status',
       'Yellow Particulate matter (590nm).1', 'Status.1',
       'Yellow Particulate matter (590nm).2', 'Status.2'],
      dtype='object')

In [None]:
# Select relevant columns: Date and yellow particulate matter values for the three sites
columns = ['Date',
           'Yellow Particulate matter (590nm)',
           'Yellow Particulate matter (590nm).1',
           'Yellow Particulate matter (590nm).2',
           ]
# .copy() makes the subset an independent frame, so the column assignments
# in the following cells do not trigger SettingWithCopyWarning
df_ypm = df_ypm[columns].copy()

In [24]:
# The three per-site yellow particulate matter measurement columns
ypm_site_columns = [
    'Yellow Particulate matter (590nm)',
    'Yellow Particulate matter (590nm).1',
    'Yellow Particulate matter (590nm).2',
]

# Convert the yellow particulate matter columns to numeric. errors='coerce'
# turns every unparseable entry, including the 'No data' placeholder, into NaN,
# so a separate replace step is not needed.
df_ypm[ypm_site_columns] = df_ypm[ypm_site_columns].apply(pd.to_numeric, errors='coerce')

# Calculate the mean yellow particulate matter value per row, ignoring NaN
df_ypm['yellow_particulate_matter_590nm'] = df_ypm[ypm_site_columns].mean(axis=1, skipna=True)

df_ypm.head()

Unnamed: 0,Date,Yellow Particulate matter (590nm),Status,Yellow Particulate matter (590nm).1,Status.1,Yellow Particulate matter (590nm).2,Status.2,yellow_particulate_matter_590nm
0,09-06-2024,0.85,V ugm-3,0.29,V ugm-3,0.15,V ugm-3,0.43
1,10-06-2024,0.63,V ugm-3,0.24,V ugm-3,0.17,V ugm-3,0.346667
2,11-06-2024,0.99,V ugm-3,0.34,V ugm-3,0.18,V ugm-3,0.503333
3,12-06-2024,1.43,V ugm-3,0.44,V ugm-3,0.24,V ugm-3,0.703333
4,13-06-2024,1.45,V ugm-3,0.3,V ugm-3,0.33,V ugm-3,0.693333


In [25]:
# Keep only the date and the per-row yellow particulate matter mean
df_ypm = df_ypm.loc[:, ['Date', 'yellow_particulate_matter_590nm']]
df_ypm.head()

Unnamed: 0,Date,yellow_particulate_matter_590nm
0,09-06-2024,0.43
1,10-06-2024,0.346667
2,11-06-2024,0.503333
3,12-06-2024,0.703333
4,13-06-2024,0.693333


In [26]:
# Lower-case the name of the date column
df_ypm = df_ypm.rename({'Date': 'date'}, axis='columns')
df_ypm.head()

Unnamed: 0,date,yellow_particulate_matter_590nm
0,09-06-2024,0.43
1,10-06-2024,0.346667
2,11-06-2024,0.503333
3,12-06-2024,0.703333
4,13-06-2024,0.693333


In [27]:
# Parse the DD-MM-YYYY strings in the date column into datetime values
df_ypm['date'] = df_ypm['date'].pipe(pd.to_datetime, format='%d-%m-%Y')

In [28]:
# Display every row that has at least one missing value
df_ypm.loc[df_ypm.isna().any(axis='columns')]

Unnamed: 0,date,yellow_particulate_matter_590nm
207,2025-01-02,
217,2025-01-12,
218,2025-01-13,
306,2025-04-11,
319,2025-04-24,
341,2025-05-16,
342,2025-05-18,


In [29]:
# Only needed if there are missing values corresponding to the date
def fill_with_past_7_days_mean(row, df_ypm, window_days=7,
                               column='yellow_particulate_matter_590nm'):
    """Return ``row[column]``, or the trailing-window mean when it is missing.

    Parameters
    ----------
    row : pandas.Series
        One row of the frame; must provide ``'date'`` and ``column``.
    df_ypm : pandas.DataFrame
        Frame searched for earlier observations; needs a ``'date'`` column
        and ``column``.
    window_days : int, default 7
        Length of the look-back window in days.
    column : str, default 'yellow_particulate_matter_590nm'
        Name of the value column to read and fill. Exposed as a parameter so
        the same helper can serve other pollutant columns.

    Returns
    -------
    float
        The existing value when it is not missing; otherwise the mean of
        ``column`` over rows dated from ``date - window_days`` to
        ``date - 1 day`` (both inclusive). NaN when that window holds no
        non-missing value.
    """
    current = row[column]
    if not pd.isna(current):
        return current

    # Window covers t-window_days .. t-1, so the row itself is never included
    window_start = row['date'] - pd.Timedelta(days=window_days)
    window_end = row['date'] - pd.Timedelta(days=1)
    in_window = df_ypm['date'].between(window_start, window_end)

    # Series.mean skips NaN by default
    return df_ypm.loc[in_window, column].mean()

# Fill missing yellow_particulate_matter_590nm values row by row
df_ypm['yellow_particulate_matter_590nm'] = df_ypm.apply(
    fill_with_past_7_days_mean, axis=1, args=(df_ypm,)
)

In [30]:
# Round the yellow particulate matter column to three decimal places
df_ypm = df_ypm.round({'yellow_particulate_matter_590nm': 3})
df_ypm.head()

Unnamed: 0,date,yellow_particulate_matter_590nm
0,2024-06-09,0.43
1,2024-06-10,0.347
2,2024-06-11,0.503
3,2024-06-12,0.703
4,2024-06-13,0.693


In [31]:
# Write the cleaned yellow particulate matter series to CSV without the index column
df_ypm.to_csv(path_or_buf='yellow_particulate_matter_590nm_cd.csv', index=False)