In [161]:
import sys
sys.executable
import pandas as pd
import numpy as np
import scipy
import locale

In [162]:
### We have three data sets for weather data
# 1. Airport
# 2. Barcelona station
# 3. Fabra Observatory (at altitude)
# The Fabra observatory is at an altitude of over 400 metres and so pressure and temperature data are lower
# values than experienced in Barcelona city.
# The Barcelona station data set is incomplete.  No pressure readings are recorded at all and we have
# more data missing from the data set.
# The solution applied is to use the Airport data set and include data from Barcelona station to populate missing
# data values, then to use adjusted pressure data from Fabra Observatory to replace missing pressure values.

# We now load the three data sets

rawCsvData=pd.read_csv("data/Weather/HistoricalWeatherDataBCNAirport.csv", decimal=',')
dfWeatherAirport=rawCsvData.copy()

rawCsvData=pd.read_csv("data/Weather/HistoricalWeatherDataBCN.csv", decimal=',')
dfWeatherBcn=rawCsvData.copy()

rawCsvData=pd.read_csv("data/Weather/HistoricalWeatherDataBCNFabra.csv", decimal=',')
dfWeatherFabra=rawCsvData.copy()

In [163]:
dfWeatherAirport.describe

<bound method NDFrame.describe of            fecha  indicativo                nombre  provincia  altitud  tmed  \
0     2017-01-01          76  BARCELONA AEROPUERTO  BARCELONA        4   7.8   
1     2017-01-02          76  BARCELONA AEROPUERTO  BARCELONA        4   8.6   
2     2017-01-03          76  BARCELONA AEROPUERTO  BARCELONA        4   8.1   
3     2017-01-04          76  BARCELONA AEROPUERTO  BARCELONA        4  10.4   
4     2017-01-05          76  BARCELONA AEROPUERTO  BARCELONA        4   8.8   
...          ...         ...                   ...        ...      ...   ...   
1395  2020-10-27          76  BARCELONA AEROPUERTO  BARCELONA        4  15.7   
1396  2020-10-28          76  BARCELONA AEROPUERTO  BARCELONA        4  16.0   
1397  2020-10-29          76  BARCELONA AEROPUERTO  BARCELONA        4  16.5   
1398  2020-10-30          76  BARCELONA AEROPUERTO  BARCELONA        4  16.3   
1399  2020-10-31          76  BARCELONA AEROPUERTO  BARCELONA        4  15.9   

     

In [164]:
dfWeatherAirport.describe()

Unnamed: 0,indicativo,altitud,tmed,tmin,tmax,velmedia,racha,sol,presMax,presMin
count,1400.0,1400.0,1395.0,1395.0,1395.0,1397.0,1394.0,1399.0,1398.0,1398.0
mean,76.0,4.0,17.821577,13.864229,21.78172,4.580601,10.736657,6.949249,1018.809657,1014.141845
std,0.0,0.0,6.020405,6.38548,5.818068,1.620279,3.14103,3.792312,6.138294,7.065691
min,76.0,4.0,3.4,-0.6,5.1,1.4,5.8,0.0,994.5,987.4
25%,76.0,4.0,12.7,8.4,16.9,3.6,8.9,4.2,1015.3,1010.9
50%,76.0,4.0,17.2,13.5,21.4,4.2,9.7,7.4,1018.5,1014.5
75%,76.0,4.0,23.1,19.5,26.9,5.3,12.5,10.2,1022.3,1018.2
max,76.0,4.0,32.1,27.3,37.0,18.9,30.8,13.2,1037.0,1034.0


In [165]:
### We check the data types and fix them
dfWeatherAirport.info()
dfWeatherAirport.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 20 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   fecha        1400 non-null   object 
 1   indicativo   1400 non-null   int64  
 2   nombre       1400 non-null   object 
 3   provincia    1400 non-null   object 
 4   altitud      1400 non-null   int64  
 5   tmed         1395 non-null   float64
 6   prec         1396 non-null   object 
 7   tmin         1395 non-null   float64
 8   horatmin     1395 non-null   object 
 9   tmax         1395 non-null   float64
 10  horatmax     1395 non-null   object 
 11  dir          1394 non-null   object 
 12  velmedia     1397 non-null   float64
 13  racha        1394 non-null   float64
 14  horaracha    1394 non-null   object 
 15  sol          1399 non-null   float64
 16  presMax      1398 non-null   float64
 17  horaPresMax  1397 non-null   object 
 18  presMin      1398 non-null   float64
 19  horaPr

fecha          0
indicativo     0
nombre         0
provincia      0
altitud        0
tmed           5
prec           4
tmin           5
horatmin       5
tmax           5
horatmax       5
dir            6
velmedia       3
racha          6
horaracha      6
sol            1
presMax        2
horaPresMax    3
presMin        2
horaPresMin    3
dtype: int64

In [166]:
# remove columns indicativo, nombre,provincia, altitud, horatmin, horatmax,horaracha, horaPresMax, horaPresMin

dfWeatherAirport=dfWeatherAirport.drop(['indicativo', 'nombre','provincia', 'altitud', 'horatmin', 'horatmax','horaracha', 'horaPresMax', 'horaPresMin'], axis=1)

In [167]:
dfWeatherAirport.info()
dfWeatherAirport.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   fecha     1400 non-null   object 
 1   tmed      1395 non-null   float64
 2   prec      1396 non-null   object 
 3   tmin      1395 non-null   float64
 4   tmax      1395 non-null   float64
 5   dir       1394 non-null   object 
 6   velmedia  1397 non-null   float64
 7   racha     1394 non-null   float64
 8   sol       1399 non-null   float64
 9   presMax   1398 non-null   float64
 10  presMin   1398 non-null   float64
dtypes: float64(8), object(3)
memory usage: 120.4+ KB


fecha       0
tmed        5
prec        4
tmin        5
tmax        5
dir         6
velmedia    3
racha       6
sol         1
presMax     2
presMin     2
dtype: int64

In [168]:
# Convert column fecha to DateTime
dfWeatherAirport["fecha"]=pd.to_datetime(dfWeatherAirport["fecha"],format='%Y-%m-%d')

In [169]:
dfWeatherAirport.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   fecha     1400 non-null   datetime64[ns]
 1   tmed      1395 non-null   float64       
 2   prec      1396 non-null   object        
 3   tmin      1395 non-null   float64       
 4   tmax      1395 non-null   float64       
 5   dir       1394 non-null   object        
 6   velmedia  1397 non-null   float64       
 7   racha     1394 non-null   float64       
 8   sol       1399 non-null   float64       
 9   presMax   1398 non-null   float64       
 10  presMin   1398 non-null   float64       
dtypes: datetime64[ns](1), float64(8), object(2)
memory usage: 120.4+ KB


In [170]:
dfWeatherAirport.head()

Unnamed: 0,fecha,tmed,prec,tmin,tmax,dir,velmedia,racha,sol,presMax,presMin
0,2017-01-01,7.8,0,3.3,12.4,36.0,3.9,12.5,2.7,1029.0,1022.3
1,2017-01-02,8.6,0,2.8,14.3,99.0,5.3,10.3,4.8,1023.1,1019.9
2,2017-01-03,8.1,0,3.7,12.5,99.0,5.0,10.8,4.0,1024.2,1020.6
3,2017-01-04,10.4,0,6.3,14.6,35.0,3.9,8.9,5.2,1023.9,1019.2
4,2017-01-05,8.8,0,4.6,13.1,99.0,3.9,10.3,4.6,1026.3,1019.1


In [171]:
# fix prec precipitation data - replace value for trace with 0.05 and convert commas to decimal points
dfWeatherAirport['prec'] = dfWeatherAirport['prec'].replace('Ip', '0,05')
dfWeatherAirport['prec'] = pd.to_numeric(dfWeatherAirport['prec'].str.replace(',','.')).round(decimals=2)

In [172]:
dfWeatherAirport.head()
dfWeatherAirport.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   fecha     1400 non-null   datetime64[ns]
 1   tmed      1395 non-null   float64       
 2   prec      1396 non-null   float64       
 3   tmin      1395 non-null   float64       
 4   tmax      1395 non-null   float64       
 5   dir       1394 non-null   object        
 6   velmedia  1397 non-null   float64       
 7   racha     1394 non-null   float64       
 8   sol       1399 non-null   float64       
 9   presMax   1398 non-null   float64       
 10  presMin   1398 non-null   float64       
dtypes: datetime64[ns](1), float64(9), object(1)
memory usage: 120.4+ KB


In [173]:
dfWeatherAirport.dir.astype("category").cat.codes.head(20)

0     29
1     36
2     36
3     28
4     36
5     36
6     36
7     28
8     36
9     36
10    17
11    36
12    36
13    29
14     0
15    28
16    11
17    29
18    33
19    32
dtype: int8

In [174]:
# Convert wind direction dir category values to points of compass because 0 is similar to 36
# 0,1,2,34,35,36 -> NN
# 3,4,5,6 -> NE
# 7,8,9,10,11 -> EE
# 12,13,14,15 -> SE
# 16,17,18,19,20 -> SS
# 21,22,23,24 -> SW
# 25,26,27,28,29 -> WW
# 30,31,32,33 -> NW
# 99 -> VAR

# set dir 99 to be null so we can replace it more easily with data from another weather station

dfWeatherAirport['dir'].replace({'99.0': None},inplace =True)

# set up bins for the wind directions  99.0 -> VARiable
directions = np.array('N NNE NE ENE E ESE SE SSE S SSW SW WSW W WNW NW NNW N VAR'.split())
bins = np.arange(1.125, 37.2, 2.25)

dfWeatherAirport['windDir'] = directions[np.digitize(pd.to_numeric(dfWeatherAirport['dir']), bins)]
#dfWeatherAirport.windDir.astype("category").cat.codes

In [175]:
dfWeatherAirport['windDir'].value_counts(dropna=False).head(50)

VAR    467
N      193
WSW    136
SW     126
E       93
ENE     62
ESE     45
NE      43
NNE     40
W       39
SSE     28
SSW     27
NNW     24
SE      21
WNW     21
S       18
NW      17
Name: windDir, dtype: int64

In [176]:
# Now prepare the other weather data sets
dfWeatherBcn.describe

<bound method NDFrame.describe of            fecha indicativo     nombre  provincia  altitud  prec   dir  \
0     2017-01-01      0201D  BARCELONA  BARCELONA        6   0.0  28.0   
1     2017-01-02      0201D  BARCELONA  BARCELONA        6   0.0  22.0   
2     2017-01-03      0201D  BARCELONA  BARCELONA        6   0.0  34.0   
3     2017-01-04      0201D  BARCELONA  BARCELONA        6   0.0  27.0   
4     2017-01-05      0201D  BARCELONA  BARCELONA        6   0.0  34.0   
...          ...        ...        ...        ...      ...   ...   ...   
1395  2020-10-27      0201D  BARCELONA  BARCELONA        6   0.0  29.0   
1396  2020-10-28      0201D  BARCELONA  BARCELONA        6   0.0  22.0   
1397  2020-10-29      0201D  BARCELONA  BARCELONA        6   0.0  21.0   
1398  2020-10-30      0201D  BARCELONA  BARCELONA        6   0.0  20.0   
1399  2020-10-31      0201D  BARCELONA  BARCELONA        6   0.0  23.0   

      velmedia  racha horaracha  tmed  tmin horatmin  tmax horatmax  
0      

In [177]:
dfWeatherBcn.describe()

Unnamed: 0,altitud,prec,velmedia,racha,tmed,tmin,tmax
count,1400.0,1399.0,1399.0,1397.0,1224.0,1224.0,1224.0
mean,6.0,1.628663,3.53624,9.659699,18.053023,14.931373,21.175817
std,0.0,6.857892,1.51211,3.522307,5.707941,6.023844,5.528025
min,6.0,0.0,0.0,3.9,2.7,0.1,4.6
25%,6.0,0.0,2.5,7.2,13.075,9.7,16.2
50%,6.0,0.0,3.3,8.9,17.8,14.6,21.2
75%,6.0,0.0,4.2,11.4,23.2,20.4,26.4
max,6.0,83.9,18.3,29.4,30.0,27.1,35.2


In [178]:
### We check the data types and fix them

In [179]:

dfWeatherBcn=dfWeatherBcn.drop(['indicativo','nombre','provincia','altitud','horatmin','horatmax','horaracha'], axis=1)


In [180]:
dfWeatherBcn["fecha"]=pd.to_datetime(dfWeatherBcn["fecha"],format='%Y-%m-%d')

In [181]:
dfWeatherBcn.info()
dfWeatherBcn.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   fecha     1400 non-null   datetime64[ns]
 1   prec      1399 non-null   float64       
 2   dir       1397 non-null   object        
 3   velmedia  1399 non-null   float64       
 4   racha     1397 non-null   float64       
 5   tmed      1224 non-null   float64       
 6   tmin      1224 non-null   float64       
 7   tmax      1224 non-null   float64       
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 87.6+ KB


fecha         0
prec          1
dir           3
velmedia      1
racha         3
tmed        176
tmin        176
tmax        176
dtype: int64

In [182]:
### So Ip trace value for prec is not in this data set
# define categories for wind direction
dfWeatherBcn['dir'].value_counts(dropna=False).head(100)

20.0    135
21.0    122
22.0    104
8.0     100
9.0      78
99.0     73
10.0     69
26.0     51
30.0     45
32.0     39
34.0     38
35.0     38
24.0     37
12.0     36
23.0     35
27.0     34
13.0     33
11.0     33
19.0     33
14.0     32
29.0     26
33.0     22
18.0     22
7.0      22
25.0     20
31.0     20
28.0     20
36.0     19
1.0      10
15.0     10
6.0       9
17.0      6
2.0       5
3.0       5
4.0       5
5.0       4
88.0      4
16.0      3
NaN       3
Name: dir, dtype: int64

In [183]:
# Here we have a value of 88.0 which should probably be 99.0
# Replace nulls in weatherAirport if they exist in weatherBcn

validWeather = dfWeatherAirport.combine_first(dfWeatherBcn)


In [184]:
validWeather['dir'].value_counts(dropna=False).head(100)

22.0    134
24.0     94
36.0     78
35.0     71
1.0      68
21.0     68
8.0      62
25.0     61
23.0     59
20.0     53
10.0     52
7.0      46
26.0     39
11.0     38
34.0     33
4.0      32
9.0      32
30.0     28
99.0     26
19.0     25
13.0     25
2.0      24
12.0     24
15.0     23
6.0      22
14.0     21
27.0     21
3.0      18
31.0     17
29.0     17
32.0     17
28.0     17
5.0      16
16.0     12
33.0     10
18.0      9
17.0      7
88.0      1
Name: dir, dtype: int64

In [185]:
validWeather.head()
validWeather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   dir       1400 non-null   object        
 1   fecha     1400 non-null   datetime64[ns]
 2   prec      1400 non-null   float64       
 3   presMax   1398 non-null   float64       
 4   presMin   1398 non-null   float64       
 5   racha     1400 non-null   float64       
 6   sol       1399 non-null   float64       
 7   tmax      1397 non-null   float64       
 8   tmed      1397 non-null   float64       
 9   tmin      1397 non-null   float64       
 10  velmedia  1400 non-null   float64       
 11  windDir   1400 non-null   object        
dtypes: datetime64[ns](1), float64(9), object(2)
memory usage: 131.4+ KB


In [186]:
# Now we have 1x 88.0 value and 26x 99.0 values, so we should now combine with the data set from Fabra observatory

dfWeatherFabra.describe

<bound method NDFrame.describe of            fecha indicativo            nombre  provincia  altitud  tmed prec  \
0     2017-01-01      0200E  BARCELONA, FABRA  BARCELONA      408   7.4  0,0   
1     2017-01-02      0200E  BARCELONA, FABRA  BARCELONA      408   9.8  0,0   
2     2017-01-03      0200E  BARCELONA, FABRA  BARCELONA      408   7.8  0,0   
3     2017-01-04      0200E  BARCELONA, FABRA  BARCELONA      408   9.7  0,0   
4     2017-01-05      0200E  BARCELONA, FABRA  BARCELONA      408   8.6  0,0   
...          ...        ...               ...        ...      ...   ...  ...   
1395  2020-10-27      0200E  BARCELONA, FABRA  BARCELONA      408  13.0  0,0   
1396  2020-10-28      0200E  BARCELONA, FABRA  BARCELONA      408  15.7  0,0   
1397  2020-10-29      0200E  BARCELONA, FABRA  BARCELONA      408  17.4  0,0   
1398  2020-10-30      0200E  BARCELONA, FABRA  BARCELONA      408  18.6  0,0   
1399  2020-10-31      0200E  BARCELONA, FABRA  BARCELONA      408  19.6  0,0   

     

In [187]:
dfWeatherFabra.describe()

Unnamed: 0,altitud,tmed,tmin,tmax,dir,velmedia,racha,sol,presMax,presMin
count,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0
mean,408.0,16.718357,12.769714,20.6655,22.348571,3.933,11.119571,7.906214,970.504643,966.489643
std,0.0,6.210202,5.911644,6.669392,10.033678,1.707394,3.398476,3.786018,5.610685,6.661327
min,408.0,-0.4,-2.4,1.7,1.0,0.6,4.2,0.0,947.5,941.0
25%,408.0,11.6,8.0,15.1,18.0,2.8,8.6,5.5,967.8,963.5
50%,408.0,16.1,12.2,20.3,25.0,3.9,10.6,8.8,970.5,967.3
75%,408.0,21.8,17.7,26.1,31.0,5.0,12.8,10.5,973.525,970.4
max,408.0,32.6,27.5,37.7,36.0,15.6,28.9,13.9,986.7,983.8


In [188]:
dfWeatherFabra.info()
dfWeatherFabra.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 20 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   fecha        1400 non-null   object 
 1   indicativo   1400 non-null   object 
 2   nombre       1400 non-null   object 
 3   provincia    1400 non-null   object 
 4   altitud      1400 non-null   int64  
 5   tmed         1400 non-null   float64
 6   prec         1400 non-null   object 
 7   tmin         1400 non-null   float64
 8   horatmin     1400 non-null   object 
 9   tmax         1400 non-null   float64
 10  horatmax     1399 non-null   object 
 11  dir          1400 non-null   int64  
 12  velmedia     1400 non-null   float64
 13  racha        1400 non-null   float64
 14  horaracha    1400 non-null   object 
 15  sol          1400 non-null   float64
 16  presMax      1400 non-null   float64
 17  horaPresMax  1400 non-null   object 
 18  presMin      1400 non-null   float64
 19  horaPr

fecha          0
indicativo     0
nombre         0
provincia      0
altitud        0
tmed           0
prec           0
tmin           0
horatmin       0
tmax           0
horatmax       1
dir            0
velmedia       0
racha          0
horaracha      0
sol            0
presMax        0
horaPresMax    0
presMin        0
horaPresMin    0
dtype: int64

In [189]:
dfWeatherFabra=dfWeatherFabra.drop(['indicativo', 'nombre','provincia', 'altitud', 'horatmin', 'horatmax','horaracha', 'horaPresMax', 'horaPresMin'], axis=1)

In [190]:
dfWeatherFabra["fecha"]=pd.to_datetime(dfWeatherFabra["fecha"],format='%Y-%m-%d')

In [191]:
dfWeatherFabra.info()
dfWeatherFabra.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   fecha     1400 non-null   datetime64[ns]
 1   tmed      1400 non-null   float64       
 2   prec      1400 non-null   object        
 3   tmin      1400 non-null   float64       
 4   tmax      1400 non-null   float64       
 5   dir       1400 non-null   int64         
 6   velmedia  1400 non-null   float64       
 7   racha     1400 non-null   float64       
 8   sol       1400 non-null   float64       
 9   presMax   1400 non-null   float64       
 10  presMin   1400 non-null   float64       
dtypes: datetime64[ns](1), float64(8), int64(1), object(1)
memory usage: 120.4+ KB


fecha       0
tmed        0
prec        0
tmin        0
tmax        0
dir         0
velmedia    0
racha       0
sol         0
presMax     0
presMin     0
dtype: int64

In [192]:
dfWeatherFabra['dir'].value_counts(dropna=False).head(100)

31    123
21    116
33    106
32    104
27     78
20     77
34     76
26     72
22     71
6      62
30     56
7      51
5      38
25     33
1      32
29     31
3      31
19     29
28     28
18     20
2      19
17     17
23     16
4      16
16     14
12     13
8      13
15     10
24     10
14      9
11      9
35      8
13      6
36      5
10      1
Name: dir, dtype: int64

In [193]:
# No values for wind direction 99.0, so we should replace once more.

# set dir 99 to be null in df validWeather so we can replace it more easily with data from another weather station

validWeather['dir'].replace({'99.0': None},inplace =True)
validWeather['dir'].replace({'88.0': None},inplace =True)

dfValidWeather = validWeather.combine_first(dfWeatherFabra)

In [194]:
dfValidWeather['dir'].value_counts(dropna=False).head(100)

22.0    134
24.0     94
36.0     78
35.0     71
21.0     68
1.0      68
8.0      62
25.0     61
23.0     59
20.0     53
10.0     52
7.0      46
26.0     39
11.0     38
34.0     33
4.0      32
9.0      32
30.0     28
19.0     25
13.0     25
12.0     24
2.0      24
15.0     23
6.0      22
27.0     21
14.0     21
3.0      18
31.0     17
29.0     17
28.0     17
32.0     17
5.0      16
16.0     12
33.0     10
18.0      9
17.0      7
21        3
32        2
31        2
29        2
28        2
20        2
14        1
5         1
11        1
12        1
13        1
19        1
15        1
7         1
18        1
26        1
30        1
33        1
34        1
1         1
Name: dir, dtype: int64

In [195]:
# Now convert the dir wind directions to compass points
# set up bins for the wind directions  99.0 -> VARiable
directions = np.array('N NNE NE ENE E ESE SE SSE S SSW SW WSW W WNW NW NNW N VAR'.split())
bins = np.arange(1.125, 37.2, 2.25)

dfValidWeather['windDir'] = directions[np.digitize(pd.to_numeric(dfValidWeather['dir']), bins)]

In [196]:
dfValidWeather.describe()

Unnamed: 0,presMax,presMin,racha,sol,tmax,tmed,tmin,velmedia
count,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0
mean,1018.740929,1014.074357,10.731786,6.950429,21.77,17.812,13.856714,4.5765
std,6.397588,7.282786,3.143198,3.791213,5.819631,6.019372,6.382127,1.621103
min,970.3,966.4,5.6,0.0,5.1,3.4,-0.6,1.4
25%,1015.3,1010.875,8.9,4.2,16.9,12.7,8.4,3.6
50%,1018.5,1014.5,9.7,7.45,21.35,17.2,13.4,4.2
75%,1022.3,1018.2,12.5,10.2,26.9,23.1,19.5,5.3
max,1037.0,1034.0,30.8,13.2,37.0,32.1,27.3,18.9


In [197]:
dfValidWeather.isna().sum()

dir         0
fecha       0
prec        0
presMax     0
presMin     0
racha       0
sol         0
tmax        0
tmed        0
tmin        0
velmedia    0
windDir     0
dtype: int64

In [None]:
# confirmed that we now have a complete data set with no null values

In [199]:
dfValidWeather.head(100)

Unnamed: 0,dir,fecha,prec,presMax,presMin,racha,sol,tmax,tmed,tmin,velmedia,windDir
0,36.0,2017-01-01,0,1029.0,1022.3,12.5,2.7,12.4,7.8,3.3,3.9,N
1,22.0,2017-01-02,0,1023.1,1019.9,10.3,4.8,14.3,8.6,2.8,5.3,SW
2,34.0,2017-01-03,0,1024.2,1020.6,10.8,4.0,12.5,8.1,3.7,5.0,NNW
3,35.0,2017-01-04,0,1023.9,1019.2,8.9,5.2,14.6,10.4,6.3,3.9,N
4,34.0,2017-01-05,0,1026.3,1019.1,10.3,4.6,13.1,8.8,4.6,3.9,NNW
...,...,...,...,...,...,...,...,...,...,...,...,...
95,35.0,2017-04-06,0,1019.7,1015.2,9.2,10.9,20.6,15.1,9.6,3.9,N
96,19.0,2017-04-07,0,1021.2,1018.9,8.3,11.4,19.3,15.0,10.6,3.6,S
97,24.0,2017-04-08,0,1023.0,1019.8,9.7,11.8,19.1,14.8,10.6,5.0,WSW
98,19.0,2017-04-09,0,1025.0,1022.2,9.2,11.7,18.8,14.3,9.8,5.3,S


In [200]:
dfValidWeather.describe

<bound method NDFrame.describe of        dir      fecha prec  presMax  presMin  racha  sol  tmax  tmed  tmin  \
0     36.0 2017-01-01    0   1029.0   1022.3   12.5  2.7  12.4   7.8   3.3   
1     22.0 2017-01-02    0   1023.1   1019.9   10.3  4.8  14.3   8.6   2.8   
2     34.0 2017-01-03    0   1024.2   1020.6   10.8  4.0  12.5   8.1   3.7   
3     35.0 2017-01-04    0   1023.9   1019.2    8.9  5.2  14.6  10.4   6.3   
4     34.0 2017-01-05    0   1026.3   1019.1   10.3  4.6  13.1   8.8   4.6   
...    ...        ...  ...      ...      ...    ...  ...   ...   ...   ...   
1395  36.0 2020-10-27    0   1017.0   1014.4    8.9  5.8  19.8  15.7  11.6   
1396  22.0 2020-10-28    0   1023.2   1016.5    9.7  3.0  20.5  16.0  11.4   
1397  21.0 2020-10-29    0   1026.7   1023.2    8.3  5.7  21.2  16.5  11.8   
1398  24.0 2020-10-30    0   1025.7   1023.3    8.3  7.1  20.9  16.3  11.7   
1399  23.0 2020-10-31    0   1024.2   1021.1   11.4  7.6  20.8  15.9  11.0   

      velmedia windDir  
0   