In [1]:
# %pip install pandas
# %pip install matplotlib
# %pip install plotly
# %pip install numpy
# %pip install seaborn
# %pip install scikit-learn
# %pip install nbformat>=4.2.0
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

file_paths = ["POAR_2018.csv", 
              "POAR_2019.csv", 
              "POAR_2020.csv", 
              "POAR_2021.csv", 
              "POAR_2022.csv", 
              "POAR_2023.csv"]

my_dates = ["20 December 2018", "3 January 2019", "19 March 2020", "26 March 2020", "29 June 2020", "10 November 2020", "20 December 2020", "3 January 2021", "29 November 2021", "25 July 2022", "24 July 2023"]

# Dictionary to hold DataFrames
poar_data = {}

# Loop through file_paths, read each file, and store in the dictionary
for year, file_path in zip(range(2018, 2024), file_paths):
    poar_data[year] = pd.read_csv(file_path, skiprows=4)

# poar2018 = pd.read_csv(file_paths[0], skiprows=4)
# poar2019 = pd.read_csv(file_paths[1], skiprows=4)
# poar2020 = pd.read_csv(file_paths[2], skiprows=4)
# poar2021 = pd.read_csv(file_paths[3], skiprows=4)
# poar2022 = pd.read_csv(file_paths[4], skiprows=4)
# poar2023 = pd.read_csv(file_paths[5], skiprows=4)

# Convert 'my_dates' to datetime
my_dates_dt = pd.to_datetime(my_dates, dayfirst=True)

# poarBig = pd.concat([poar2018, poar2019, poar2020, poar2021, poar2022, poar2023])
poarBig = pd.concat(poar_data.values(), ignore_index=True)

# Convert the date column in poarBig to datetime
poarBig['Date'] = pd.to_datetime(poarBig['Date'], dayfirst=True)

# Replace "24:00" with "00:00" in the 'time' column
poarBig['time'] = poarBig['time'].replace('24:00', '00:00') 

# Convert 'time' column to datetime to ensure correct sorting
poarBig['time'] = pd.to_datetime(poarBig['time'], format='%H:%M').dt.time

# Sort the DataFrame by the 'time' column
poarBig.sort_values(by='time', inplace=True)

# Filter rows where the date is in 'my_dates_dt'
filtered_poarBig = poarBig[poarBig['Date'].isin(my_dates_dt)]

In [2]:
pm10 = filtered_poarBig[["Date", "time", "PM<sub>10</sub> particulate matter (Hourly measured)"]]
pm10.rename(columns={"PM<sub>10</sub> particulate matter (Hourly measured)": "PM10"}, inplace=True)
#pm10.head()

pm10_cleaned = pm10.dropna()

# scatter_plot = sns.scatterplot(data=pm10_cleaned, x="Date", y="PM10")
# scatter_plot.set(xlabel="Date", ylabel="PM10")
# scatter_plot.set_title("PM10 over time")
# plt.show()
scatter_plot = px.scatter(pm10_cleaned, x="time", y="PM10", color = "Date", title="PM10 over the years", hover_data=["Date"], labels={"time": "Time", "PM10": "PM10"})
scatter_plot.show()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pm10.rename(columns={"PM<sub>10</sub> particulate matter (Hourly measured)": "PM10"}, inplace=True)


In [3]:
nitric_oxide = filtered_poarBig[["Date", "time", "Nitric oxide"]]
# nitric_oxide.rename(columns={"Nitric oxide": "PM10"}, inplace=True)

nitric_oxide_cleaned = nitric_oxide.dropna()

scatter__no_plot = px.scatter(nitric_oxide_cleaned, x="time", y="Nitric oxide", color = "Date", title = "Nitric oxide over the years", hover_data = ["Date"], labels = {"time": "Time"})
scatter__no_plot.show()

In [4]:
no_no2 = filtered_poarBig[["Date", "time", "Nitrogen oxides as nitrogen dioxide"]]

no_no2_cleaned = no_no2.dropna()

scatter__no_plot = px.scatter(no_no2_cleaned, x="time", y="Nitrogen oxides as nitrogen dioxide", color = "Date", title = "Nitrogen oxides as nitrogen dioxide over the years", hover_data = ["Date"], labels = {"time": "Time", "Nitrogen oxides as nitrogen dioxide": "NO2"})
scatter__no_plot.show()

In [5]:
poar2020 = poar_data[2020]
poar2020['Date'] = pd.to_datetime(poar2020['Date'], dayfirst=True)

poar2020['time'] = poar2020['time'].replace('24:00', '00:00')
# poar2020.head()

# poar2020['time'] = pd.to_datetime(poar2020['time'], format='%H:%M').dt.time

poar2020.sort_values(by='time', inplace=True)
 

avg = poar2020[[ "Date", "PM<sub>10</sub> particulate matter (Hourly measured)", "Nitric oxide", "Nitrogen oxides as nitrogen dioxide"]]
avg.rename(columns={"PM<sub>10</sub> particulate matter (Hourly measured)": "PM10", "Nitrogen oxides as nitrogen dioxide": "NO2"}, inplace=True)
# avg.head()

monthly_avg = avg.resample('ME', on='Date').mean()

# Format the 'Date' index to show only month
monthly_avg.index = monthly_avg.index.strftime('%B')
# monthly_avg.head()

# monthly_avg.plot(kind='line', figsize=(10, 6))
# plt.title("Monthly average of PM10, NO and NO2 in 2020")
# plt.ylabel("Average concentration")
# plt.xlabel("Month")
# plt.show()

fig = px.line(monthly_avg, x=monthly_avg.index, y=monthly_avg.columns, title="Monthly average of PM10, NO, and NO2 in 2020")
fig.update_xaxes(title_text='Month')
fig.update_yaxes(title_text='Average concentration')
fig.show()




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [6]:
poar2021 = poar_data[2021]
poar2022 = poar_data[2022]
poar2023 = poar_data[2023]
years_avg = pd.concat([poar2020, poar2021, poar2022, poar2023])

pm10_yearly = years_avg[["Date", "PM<sub>10</sub> particulate matter (Hourly measured)"]]

pm10_yearly.rename(columns={"PM<sub>10</sub> particulate matter (Hourly measured)": "PM10"}, inplace=True)

pm10_yearly_cleaned = pm10_yearly.dropna()

# Ensure 'Date' is in datetime format
pm10_yearly_cleaned['Date'] = pd.to_datetime(pm10_yearly_cleaned['Date'])

# Convert the 'Date' column to datetime format
pm10_yearly_cleaned['Date'] = pd.to_datetime(pm10_yearly_cleaned['Date'], format='%d-%m-%Y')

# Extract year and month
pm10_yearly_cleaned['Year'] = pm10_yearly_cleaned['Date'].dt.year
pm10_yearly_cleaned['Month'] = pm10_yearly_cleaned['Date'].dt.month

# Group by year and month and calculate the mean PM10
pm10_monthly_avg = pm10_yearly_cleaned.groupby(['Year', 'Month']).agg({'PM10': 'mean'}).reset_index()

# Pivot the table to have years as columns
pm10_pivot = pm10_monthly_avg.pivot(index='Month', columns='Year', values='PM10').reset_index()

# Reshape the pivot table for plotting
pm10_pivot_melted = pd.melt(pm10_pivot, id_vars=['Month'], value_vars=pm10_pivot.columns[1:], var_name='Year', value_name='PM10')

# Create the bar plot using Plotly Express
pm10_avg_bar = px.bar(pm10_pivot_melted, x='Month', y='PM10', color='Year', barmode='group', title='Monthly average of PM10 over the years', labels={'PM10': 'PM10 Levels', 'Month': 'Month'})

# Show the plot
pm10_avg_bar.show()





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a