# Working with weather data



In [1]:
#Import the required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
#Load the dataset

df = pd.read_csv('https://cli.fusio.net/cli/climate_data/webdata/hly4935.csv', skiprows=23, low_memory=False)
#skiprows=23 is used to skip the first 23 rows of the dataset
#low_memory=False is used to suppress the warning message due to mixed data types in the dataset columns

#Display the first 5 rows of the dataset
print(df.head())


                date  ind rain  ind.1  temp  ind.2  wetb  dewpt  vappr  rhum  \
0  10-apr-1996 14:00    0  0.0      0  11.5      0   8.1    3.9    0.0     0   
1  31-jul-1996 08:00    0  0.0      0  11.5      0  11.1   10.7    0.0     0   
2  31-jul-1996 09:00    0  0.0      0  11.6      0  10.7    9.8    0.0     0   
3  31-jul-1996 10:00    0  0.0      0  12.9      0  11.3    9.8    0.0     0   
4  31-jul-1996 11:00    0  0.0      0  14.5      0  10.8    7.0    0.0     0   

   ... ind.3  wdsp ind.4  wddir  ww   w  sun    vis clht clamt  
0  ...     0     0     0      0  25  81  0.0  35000   32     5  
1  ...     0     0     0      0  25  82  0.0  40000   45     5  
2  ...     0     0     0      0  80  81  0.0   8000   32     7  
3  ...     0     0     0      0  25  82  0.0  28000   35     6  
4  ...     0     0     0      0   2  11  0.0  40000   40     6  

[5 rows x 21 columns]


In [3]:
#Analyze the dataset
#Check the number of rows and columns in the dataset
print(df.shape)


(246930, 21)


In [4]:
#Check the data types of the columns in the dataset
print(df.dtypes)

date      object
ind        int64
rain      object
ind.1      int64
temp     float64
ind.2      int64
wetb     float64
dewpt    float64
vappr    float64
rhum       int64
msl       object
ind.3      int64
wdsp      object
ind.4      int64
wddir     object
ww        object
w         object
sun      float64
vis       object
clht      object
clamt     object
dtype: object


In [13]:
print(df.isna().sum())

date     0
ind      0
rain     0
ind.1    0
temp     0
ind.2    0
wetb     0
dewpt    0
vappr    0
rhum     0
msl      0
ind.3    0
wdsp     0
ind.4    0
wddir    0
ww       0
w        0
sun      0
vis      0
clht     0
clamt    0
dtype: int64


In [6]:
# Cleaning the dataset
# Convert the date column to a datetime data 
df['date'] = pd.to_datetime(df['date'], format='%d-%b-%Y %H:%M') #format='%d-%b-%Y %H:%M' is used to specify the format of the date column
print(df.dtypes)

date     datetime64[ns]
ind               int64
rain             object
ind.1             int64
temp            float64
ind.2             int64
wetb            float64
dewpt           float64
vappr           float64
rhum              int64
msl              object
ind.3             int64
wdsp             object
ind.4             int64
wddir            object
ww               object
w                object
sun             float64
vis              object
clht             object
clamt            object
dtype: object


In [7]:
#Now deal with the mixed data types in the dataset
#Check the unique values in the 'rain' column
print(df['rain'].unique())

#Okay, we can see that the 'rain' column contains the string '---' which is causing the column to be of object data type
#Replace the '---' values with 0.0

df['rain'] = df['rain'].replace(' ', 0.0)

#Convert the 'rain' column to float data type
df['rain'] = df['rain'].astype(float)


['0.0' '0.1' ' ' '1.0' '3.1' '0.7' '0.3' '0.6' '1.5' '3.3' '0.2' '0.4'
 '2.3' '1.2' '0.5' '1.9' '0.8' '0.9' '1.1' '4.8' '1.3' '13.4' '1.7' '3.4'
 '1.6' '2.8' '1.4' '2.7' '4.0' '2.1' '4.7' '2.0' '2.2' '3.7' '2.4' '4.1'
 '3.2' '2.6' '1.8' '4.4' '6.5' '6.1' '3.5' '3.6' '2.9' '2.5' '5.2' '4.2'
 '4.6' '4.3' '3.9' '5.5' '7.8' '5.7' '3.8' '3.0' '5.4' '7.3' '7.5' '8.0'
 '4.9' '6.6' '6.7' '5.1' '8.5' '9.9' '5.0' '7.0' '8.2' '6.2' '12.9' '13.2'
 '8.9' '6.3' '6.0' '6.9' '8.1' '14.2' '5.9' '11.4' '4.5' '11.6' '5.6'
 '9.1' '8.6' '5.8' '8.4' '16.0' '6.4' '16.5' '7.6' '6.8' '5.3' '7.4' '9.0'
 '11.7' '7.2' '8.7' '9.4' '10.9' '12.4' '18.7' '13.6' '7.7' '15.2' '16.9'
 '11.1' '8.3' '8.8' '7.9' '9.2' '13.8' '9.5' '12.7' '11.9' '10.0']


In [8]:
#Rain column is now done, let's check the unique values in the 'wetb' column
print(df['wetb'].unique())

#We can see that the 'wetb' column contains integer values and float values
#Convert the 'wetb' column to float data type
df['wetb'] = df['wetb'].astype(float)



[ 8.1 11.1 10.7 11.3 10.8 10.9 10.4 10.6 11.  10.2 10.1  8.8  9.   8.5
  8.6  9.3  9.1  9.2  9.5  9.7 12.  11.6 11.7 11.8 11.2 10.5  9.6  9.4
 10.  11.4 11.5 12.1 11.9 12.6 12.8 12.4 12.7 13.  13.2 13.7 13.4 13.6
 13.8  9.9 12.9 13.1 13.5 13.9  0.  13.3 12.2 10.3  8.9  8.7  9.8 12.5
 14.1 14.2 12.3 14.  15.5 14.6 15.3 15.  15.1 14.9 14.7 15.4 15.6 15.7
 14.4 14.5 14.3 14.8 15.2 15.9 16.  16.1 15.8 16.3 16.6 17.  17.6 17.7
 17.4 16.8 16.5 16.4  8.4  7.8  7.7  8.2  7.9  8.   7.4  7.5 17.3 17.2
 16.9 17.5 17.9 16.7  7.6  8.3  7.2  7.   7.1  6.7  6.1  6.   6.4  6.8
  5.9  6.2  5.5  5.7  7.3  6.9  6.3  5.8  6.5  6.6  5.3  5.4  5.6  5.2
  5.1  5.   4.9  4.6  4.4  4.5  4.3  4.1  4.   4.8  4.7  3.8  3.9  3.7
  4.2  3.6  3.4  3.5  3.2  2.6  1.4  1.2  1.1  1.7  1.3  2.   2.8  2.9
  3.1  2.2  2.5  2.4  2.3  1.8  0.2  0.5  0.3  0.6  0.7  0.1 -0.4  1.
  0.8  0.4 -0.1 -0.3 -0.5 -1.  -0.9  3.3  2.7  1.5  3.   1.9  2.1 -0.8
 -0.7 -0.2  0.9 -1.1 -1.5 -1.7 -1.3 -1.8  1.6 -0.6 -2.  -1.2 -1.6 -1.4
 -2.1 -

In [9]:
#Check the unique values in the 'dewpt' column
print(df['dewpt'].unique())

#We can see that the 'dewpt' column contains integer values and float values
#Convert the 'dewpt' column to float data type
df['dewpt'] = df['dewpt'].astype(float)


[  3.9  10.7   9.8   7.    7.3   6.7   8.6   7.5   9.6   8.5   9.4   9.5
   7.9   8.3   7.1   7.2   8.2   8.1   8.8   9.2   8.9   9.1   9.3  10.
  10.8  10.6  10.4  10.3   9.9  10.1   8.7  10.2  10.9  11.2  11.7   9.7
  11.1  11.3  11.4  11.5  11.    9.   11.6  10.5  11.8   0.   12.   12.5
  12.8  12.9  13.1  13.2  13.   12.1   7.8   8.    6.8   6.9   7.4   7.7
   7.6  12.4  13.4  13.8  14.   12.2  11.9  13.5  12.3  12.6  12.7  13.3
  14.1  14.2  14.6  14.7  14.8  15.5  15.1  14.3  14.4  13.9  13.6  13.7
  15.3  15.9  15.6  15.4  15.2  15.   15.8  16.3  16.6  16.9  17.2  17.1
  16.7  14.9  14.5   8.4   6.4   6.5   6.3   4.7   5.6   5.4  16.   16.5
  16.4  16.8  17.   15.7  16.1   5.7   6.    5.5   4.5   5.1   4.8   6.1
   6.6   6.2   5.8   5.9   5.2   4.6   4.4   4.9   4.2   4.3   5.3   4.1
   5.    4.    3.8   3.7   3.6   2.8   3.4   0.7   2.9   3.    3.3   3.2
   3.5   3.1   0.6   1.6   1.7   2.5   2.4   2.3   1.9   1.1   1.3   0.5
   0.2   0.3   1.2   0.9   1.5   2.6   2.    1.4   2

In [10]:
#Check the unique values in the 'vappr' column
print(df['vappr'].unique())

#We can see that the 'vappr' column contains integer values and float values
#Convert the 'vappr' column to float data type
df['vappr'] = df['vappr'].astype(float)

[ 0.  10.6 10.  10.9 10.1 10.8 11.3 11.7 11.4 11.5 11.8 12.3 11.9 12.1
 13.  12.8 12.6 12.5 12.2 12.9 12.4 11.2 11.1 13.1 13.3 13.7 12.  13.2
 13.4 13.5 13.8 12.7 10.4  6.1 14.  14.4 14.5 14.8 14.9 15.1 15.2 15.
 14.1 11.6  9.9 10.2 10.3 10.5 10.7 15.4 15.8 16.  14.2 13.9 13.6 15.5
 14.3 14.7 15.3 14.6 16.1 16.2 16.6 16.7 16.8 17.6 17.1 16.3 16.4 15.9
 15.6 15.7 17.4 18.  17.7 17.3 17.5 17.2 17.  17.9 18.5 18.9 19.3 19.6
 19.5 19.2 19.  18.1 16.9 16.5 11.   9.6  8.5  9.1  9.   9.8 18.2 17.8
 18.8 18.7 19.1 19.4 18.3 18.6  9.2  9.3  8.4  8.8  8.6  9.4  9.7  9.5
  8.3  8.7  8.9  8.2  8.1  8.   7.9  7.5  7.8  6.4  7.6  7.4  7.7  6.8
  6.9  7.3  7.2  7.   6.6  6.7  6.3  6.2  6.5  6.   5.9  5.8  7.1  5.6
  5.4  5.3  5.5  5.7  5.1  5.   5.2  4.6  4.8  4.7  4.9  4.5  4.4  4.3
 18.4 19.7 20.4 21.4 21.  20.7 20.9 19.9 20.3 21.5 19.8 20.  21.3 20.8
 20.6 20.5 20.2  3.6  4.   4.1  3.   3.9  3.7  4.2  3.8  3.5  3.4 20.1
 22.4 22.3 22.6 23.  22.9 22.8 21.8 21.7 21.2 21.6  3.2  3.3  2.7  2.6
  3.1 2

In [11]:
#Check the unique values in the 'rhum' column
print(df['rhum'].unique())

#We can see that the 'rhum' column contains integer values and a zero value
#We should leave the zero value as it is because it is a valid value
#We should convert the 'rhum' column to an integer data type
df['rhum'] = df['rhum'].astype(int)


[  0  92  82  87  81  88  85  89  83  78  79  74  70  73  80  76  95  96
  97  91  99 100  94  84  71  69  68  72  77  93  98  67  61  58  60  65
  90  59  55  54  47  51  52  56  86  66  62  64  63  75  46  57  48  53
  50  49  43  45  44  42  36  38  41  40  34  33  39  37  35  31  30  32
  29  27  26  25  28  22]


In [12]:
#Check the unique values in the 'msl' column
print(df['msl'].unique())

#We can see that the 'msl' column contains missing values represented by '---'
#Replacing these with a zero value makes no sense because the mean sea level pressure cannot be zero
#We can replace the missing values with the mean of the column
#But first, we need to convert the 'msl' column to a float data type
#Convert the 'msl' column to float data type
df['msl'] = df['msl'].astype(float)

#Calculate the mean of the 'msl' column
mean_msl = df['msl'].mean()

#Replace the zero values with the mean of the column
df['msl'] = df['msl'].replace('0.0', mean_msl)



['1016.7' '1011.4' '1011.2' '1011.0' '1010.9' '1011.3' '1011.6' '1011.8'
 '1011.1' '1011.5' '1012.2' '1012.6' '1013.1' '1013.5' '1013.8' '1014.2'
 '1015.0' '1015.5' '1015.6' '1016.2' '1017.0' '1017.6' '1018.5' '1019.2'
 '1019.6' '1020.3' '1020.7' '1021.0' '1020.9' '1021.4' '1021.6' '1021.9'
 '1022.6' '1023.2' '1023.8' '1024.3' '1024.6' '1024.8' '1025.4' '1025.7'
 '1026.2' '1026.5' '1026.7' '1027.0' '1027.2' '1027.6' '1028.2' '1028.3'
 '1028.5' '1028.6' '1028.7' '1028.8' '1028.9' '1028.4' '1027.9' '1026.9'
 '1026.8' '1026.6' '1026.3' '1026.0' '1025.3' '1022.8' '1022.3' '1022.1'
 '1021.5' '1020.4' '1018.3' '1017.7' '1016.4' '1014.1' '1013.2' '1012.1'
 '1010.7' '1010.2' '1009.9' ' ' '1009.3' '1008.9' '1008.6' '1007.9'
 '1007.6' '1007.4' '1007.0' '1006.9' '1007.1' '1008.3' '1009.0' '1009.2'
 '1009.5' '1010.0' '1010.3' '1009.8' '1010.8' '1010.5' '1010.1' '1010.4'
 '1010.6' '1011.7' '1012.4' '1013.4' '1014.4' '1014.9' '1016.1' '1016.5'
 '1015.4' '1015.7' '1015.2' '1014.7' '1014.3' '1009.4' '

ValueError: could not convert string to float: ' '

In [None]:
#Check the unique values in the 'wdsp' column
print(df['wdsp'].unique())

#We can see that the 'wdsp' column contains missing values represented by ' '
#Replacing these with a zero value makes no sense because the wind speed cannot be zero
#We can look at the time of year and the location of the weather station to estimate the wind speed
#But for now, we can replace the missing values with the mean of the column

['0' '6' '8' '9' '7' '11' '13' '12' '14' '10' ' ' '4' '5' '3' '2' '15'
 '16' '17' '18' '19' '1' '20' '23' '21' '22' '24' '25' '28' '26' '29' '31'
 '32' '35' '33' '27' '30' '39' '37' '36' '38' '34' '42' '49' '50' '48'
 '43' '40']


### End