## Biossido di Zolfo analysis ##

In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [104]:
# Load the sulphur dioxide ("Biossido di Zolfo") sensor readings from the CSV
# file that sits next to the notebook, then display the resulting frame.
csv_path = 'biossido_di_zolfo.csv'
data = pd.read_csv(csv_path)
data

Unnamed: 0,IdSensore,Data,Valore,Stato,UnitaMisura,NomeTipoSensore,Year,Month,Day,Hour,CumulativeDays,Week,CumulativeDay
0,5617,2013-01-01 00:00:00,1.4,VA,µg/m³,Biossido di Zolfo,2013,1,1,0,1,1,1
1,5617,2013-01-01 01:00:00,2.9,VA,µg/m³,Biossido di Zolfo,2013,1,1,1,1,1,1
2,5617,2013-01-01 02:00:00,1.9,VA,µg/m³,Biossido di Zolfo,2013,1,1,2,1,1,1
3,5617,2013-01-01 03:00:00,2.1,VA,µg/m³,Biossido di Zolfo,2013,1,1,3,1,1,1
4,5617,2013-01-01 04:00:00,2.5,VA,µg/m³,Biossido di Zolfo,2013,1,1,4,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
271190,20427,2017-12-31 19:00:00,2.5,VA,µg/m³,Biossido di Zolfo,2017,12,31,19,365,53,365
271191,20427,2017-12-31 20:00:00,3.5,VA,µg/m³,Biossido di Zolfo,2017,12,31,20,365,53,365
271192,20427,2017-12-31 21:00:00,4.2,VA,µg/m³,Biossido di Zolfo,2017,12,31,21,365,53,365
271193,20427,2017-12-31 22:00:00,4.8,VA,µg/m³,Biossido di Zolfo,2017,12,31,22,365,53,365


In [105]:
# Keep only the readings whose 'Hour' lies between 7 and 23, both ends included
# (hours 0 through 6 are discarded).
daytime_mask = data['Hour'].between(7, 23)
data = data[daytime_mask]

# Number of remaining readings for every (year, week, sensor) combination.
data.groupby(['Year', 'Week', 'IdSensore']).size()

Year  Week  IdSensore
2013  1     5617         119
            5619         119
            5630         119
            5631         119
            5646         119
                        ... 
2017  53    5630          17
            5631          17
            5646          17
            10280         17
            20427         17
Length: 1736, dtype: int64

In [106]:
# Distinct sensor identifiers, in order of first appearance in the data.
data.loc[:, 'IdSensore'].unique()

array([ 5617,  5619,  5630,  5631,  5646,  5696, 10006, 10280, 20427],
      dtype=int64)

In [110]:
# Build `means`: one row per sensor, one column per (year, week), holding the
# mean of 'Valore' for that sensor in that week, rounded to 2 decimals.
# Combinations without any valid reading are left as NaN.

# total number of distinct (year, week) pairs in the dataset
weeks = data.groupby(['Year', 'Week']).size().count()

# Column labels: years in order of first appearance and, within each year,
# weeks in order of first appearance.
colarray = ['IdSensore'] + [
    'Week ' + str(week) + ' Year ' + str(year)
    for year in data['Year'].unique()
    for week in data.loc[data['Year'] == year, 'Week'].unique()
]

# A single grouped aggregation computes every sensor/week mean at once, instead
# of filtering the whole frame once per (sensor, year, week) combination.
weekly = data.groupby(['IdSensore', 'Year', 'Week'], as_index=False)['Valore'].mean()
weekly['Label'] = (
    'Week ' + weekly['Week'].astype(str) + ' Year ' + weekly['Year'].astype(str)
)

# Pivot to the wide layout; reindex restores the first-appearance order of
# sensors and weeks and keeps combinations with no data as NaN.
weekly_means = (
    weekly.pivot(index='IdSensore', columns='Label', values='Valore')
    .round(2)
    .reindex(index=data['IdSensore'].unique(), columns=colarray[1:])
)
weekly_means.index.name = 'IdSensore'
weekly_means.columns.name = None

# reset_index turns the sensor ids into a regular column without mixing them
# with the float means, so they keep their original dtype.
means = weekly_means.reset_index()
assert list(means.columns) == colarray
means

Unnamed: 0,IdSensore,Week 1 Year 2013,Week 2 Year 2013,Week 3 Year 2013,Week 4 Year 2013,Week 5 Year 2013,Week 6 Year 2013,Week 7 Year 2013,Week 8 Year 2013,Week 9 Year 2013,...,Week 44 Year 2017,Week 45 Year 2017,Week 46 Year 2017,Week 47 Year 2017,Week 48 Year 2017,Week 49 Year 2017,Week 50 Year 2017,Week 51 Year 2017,Week 52 Year 2017,Week 53 Year 2017
0,5617.0,0.88,1.0,1.48,1.07,1.03,1.24,0.77,1.09,0.71,...,1.2,2.11,2.57,4.75,5.1,6.99,6.28,,,
1,5619.0,6.46,1.86,2.94,3.02,2.71,3.05,6.28,3.27,2.52,...,3.76,3.13,6.27,5.22,5.4,6.83,6.3,7.47,4.32,4.52
2,5630.0,2.95,2.15,2.11,1.95,2.88,0.63,6.9,3.15,2.31,...,3.73,2.71,4.61,3.86,3.34,4.51,3.63,4.64,3.34,2.94
3,5631.0,7.46,6.09,4.1,2.94,3.42,2.69,4.47,6.33,2.56,...,2.6,2.5,2.95,2.81,3.02,4.54,5.03,7.41,4.26,5.09
4,5646.0,5.07,6.4,7.1,6.81,6.92,6.77,7.19,6.82,6.88,...,2.78,2.27,2.67,3.12,5.42,3.16,2.19,1.78,1.72,2.37
5,5696.0,,,,,,,,,,...,,,,,,,,,,
6,10006.0,2.03,2.12,1.96,2.09,,,3.0,2.79,4.1,...,,,,,,,,,,
7,10280.0,7.05,6.37,7.99,8.7,8.32,9.8,11.59,4.83,3.62,...,3.12,1.85,4.4,3.2,2.77,3.85,3.07,4.66,2.63,2.22
8,20427.0,,,,,,,,,,...,,,,,,,,5.4,4.89,4.38


In [112]:
# Count, for every row of `means` (one row per sensor), how many entries are NaN.
is_missing = means.isnull()
missing = is_missing.sum(axis='columns')
missing

0      9
1     34
2      1
3      8
4      8
5    264
6     58
7      5
8    262
dtype: int64

In [111]:
# Write the weekly means table to CSV, leaving out the row index.
means_path = 'means_biossido_di_zolfo.csv'
means.to_csv(means_path, index=False)