In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from geopy.geocoders import Nominatim

In [None]:
rain = pd.read_csv('data/rain_data_aus.csv')
rain.tail(3)

In [None]:
rain['location'].unique()

In [None]:
rain.isna().sum()

In [None]:
rain.describe()

In [None]:
sns.pairplot(rain)

In [None]:
def season_group(data):
    """
    Function to create a new column of seasons groups
    Selecting DataSet rows and classifing the season by the month
    Summer = 1
    Autumn = 2
    Winter = 3
    Spring = 4
    """
    seasons= []
    data['date'] = pd.to_datetime(data['date'])
    
    for x in range(len(data['date'])):
        if ((data['date'][x].month) == 12) | ((data['date'][x].month) == 1) | ((data['date'][x].month) == 2):
            seasons.append(1)
        elif ((data['date'][x].month) == 3) | ((data['date'][x].month) == 4) | ((data['date'][x].month) == 5):
            seasons.append(2)
        elif ((data['date'][x].month) == 6) | ((data['date'][x].month) == 7) | ((data['date'][x].month) == 8):
            seasons.append(3)
        elif ((data['date'][x].month) == 9) | ((data['date'][x].month) == 10) | ((data['date'][x].month) == 11):
            seasons.append(4)
    data['season'] = seasons

In [None]:
def wind_direction_group(data):
    """
    Function to create a new column of wind direction groups
    Selecting DataSet rows and classifing the direction in smaller groups
    N = 1
    S = 2
    E = 3
    W = 4
    """
    directions= []
    data['winddir3pm'] = data['winddir3pm'].astype('string')
    data['winddir9am'] = data['winddir9am'].astype('string')
    data['windgustdir'] = data['windgustdir'].astype('string')
    
    for x in range(len(data['date'])):
        if (data['winddir3pm'][x].upper().startswith('N')) | (data['winddir9am'][x].upper().startswith('N')) | (data['windgustdir'][x].upper().startswith('N')):
            directions.append(1)
        elif (data['winddir3pm'][x].upper().startswith('S')) | (data['winddir9am'][x].upper().startswith('S')) | (data['windgustdir'][x].upper().startswith('S')):
            directions.append(2)
        elif (data['winddir3pm'][x].upper().startswith('E')) | (data['winddir9am'][x].upper().startswith('E')) | (data['windgustdir'][x].upper().startswith('E')):
            directions.append(3)
        elif (data['winddir3pm'][x].upper().startswith('W')) | (data['winddir9am'][x].upper().startswith('W')) | (data['windgustdir'][x].upper().startswith('W')):
            directions.append(4)
            
    data['wind_dir_group'] = directions

In [None]:
wind_05 = pd.read_csv('data/wind_table_05.csv')
wind_05

In [None]:
wind_05.isna().sum()

In [None]:
wind_05.describe()

In [None]:
wind_05.corr()

In [None]:
wind_05['date']

In [None]:
wind_05.sample(3)

In [None]:
wind_05[['windgustspeed','windspeed9am','windspeed3pm']].isna().sum()

In [None]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
wind_05[['windgustspeed','windspeed9am','windspeed3pm']] = imputer.fit_transform(wind_05[['windgustspeed','windspeed9am','windspeed3pm']])

In [None]:
wind_05[['windgustspeed','windspeed9am','windspeed3pm']].isna().sum()

In [None]:
wind_05[['windgustdir','winddir9am','winddir3pm']].isna().sum()

In [None]:
print(f"{wind_05['windgustdir'].unique()}")
print(f"{wind_05['winddir9am'].unique()}")
print(f"{wind_05['winddir3pm'].unique()}")

In [None]:
imputer = SimpleImputer(strategy='most_frequent')
wind_05[['windgustdir','winddir9am','winddir3pm']] = imputer.fit_transform(wind_05[['windgustdir','winddir9am','winddir3pm']])

In [None]:
wind_direction_group(wind_05)

In [None]:
season_group(wind_05)

In [None]:
wind_05.sample(3)

In [11]:
wind_06 = pd.read_csv('data/wind_table_06.csv')
wind_06

Unnamed: 0,date,location,windgustdir,windgustspeed,winddir9am,winddir3pm,windspeed9am,windspeed3pm
0,2015-01-22,PearceRAAF,SW,37.0,SSE,SW,19.0,24.0
1,2015-01-22,PerthAirport,W,37.0,ESE,WSW,15.0,24.0
2,2015-01-22,Perth,SW,35.0,SE,SW,13.0,20.0
3,2015-01-22,SalmonGums,S,41.0,S,S,20.0,22.0
4,2015-01-22,Walpole,S,35.0,SW,SSW,9.0,19.0
...,...,...,...,...,...,...,...,...
19995,2016-03-21,Ballarat,SE,48.0,ESE,SE,24.0,30.0
19996,2016-03-21,Bendigo,SE,33.0,SE,ESE,15.0,13.0
19997,2016-03-21,Sale,SE,35.0,W,SE,9.0,26.0
19998,2016-03-21,MelbourneAirport,S,43.0,W,S,7.0,28.0


In [None]:
wind_06.isna().sum()

In [None]:
wind_06['date']

In [None]:
season_group(wind_06)

In [None]:
wind_06.sample(5)

In [None]:
wind_06.dtypes

In [None]:
list(zip(wind_06['winddir3pm'], wind_06['winddir9am'], wind_06['windgustdir']))

In [None]:
wind_06['winddir3pm'].unique()

In [None]:
len(wind_06['winddir9am'].value_counts())

In [None]:
wind_06['winddir9am'].isna().sum()

In [None]:
print(f"{wind_06['windgustdir'].unique()}")
print(f"{wind_06['winddir9am'].unique()}")
print(f"{wind_06['winddir3pm'].unique()}")

In [None]:
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
wind_06[['winddir3pm','winddir9am','windgustdir']] = imputer.fit_transform(wind_06[['winddir3pm','winddir9am','windgustdir']])

In [None]:
print(f"{wind_06['windgustdir'].isna().sum()}")
print(f"{wind_06['winddir9am'].isna().sum()}")
print(f"{wind_06['winddir3pm'].isna().sum()}")

In [None]:
wind_direction_group(wind_06)

In [None]:
wind_06.sample(5)

In [None]:
print(f"{wind_06['windgustspeed'].isna().sum()}")
print(f"{wind_06['windspeed9am'].isna().sum()}")
print(f"{wind_06['windspeed3pm'].isna().sum()}")

In [None]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
wind_06[['windgustspeed','windspeed9am','windspeed3pm']] = imputer.fit_transform(wind_06[['windgustspeed','windspeed9am','windspeed3pm']])

In [None]:
print(f"{wind_06['windgustspeed'].isna().sum()}")
print(f"{wind_06['windspeed9am'].isna().sum()}")
print(f"{wind_06['windspeed3pm'].isna().sum()}")

In [None]:
wind_06.sample(5)

In [None]:
! pip install geopy

In [None]:
wind_06['location'].unique()

In [None]:
lista = [x for x in wind_06['location']]
lista

In [None]:
wind_06['location'].str.replace( r"([A-Z]+)", r" \1").str.strip()

In [None]:

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode('Pearce R A A F', 'Australia')
res = [location.latitude, location.longitude]
res

In [8]:
geolocator = Nominatim(user_agent="my-application")

wind_06['location'] = wind_06['location'].str.replace(r"([A-Z]+)", r" \1").str.strip()
cities = [x for x in wind_06['location'].unique()]
lat = []
long = []
          
for city in cities:
    location = geolocator.geocode(city, 'Australia')
    lat.append(location.latitude)
    long.append(location.longitude)

In [9]:
lat

[-31.6739604,
 -31.9431218,
 -31.9527121,
 -32.9815347,
 42.1417653,
 -42.8825088,
 -41.4340813,
 -23.6983884,
 -12.46044,
 -14.4642313,
 -25.3455545,
 -36.0804766,
 -35.8348792,
 -31.4983333,
 54.9738474,
 -33.2816667,
 -29.0328038,
 54.6639133,
 37.5385087,
 -33.8548157,
 -33.9498935,
 -35.115,
 -34.4243941,
 -35.2975906,
 -35.4209771,
 -35.5297196,
 -37.5622632,
 -36.7588767,
 44.980656,
 -37.667111000000006,
 -34.1847265,
 -35.4325283,
 45.5202471,
 -37.7110022,
 50.5657372,
 -27.4689682,
 -16.9206657,
 -28.0023731,
 -19.2569391,
 -34.9281805,
 -37.8246698,
 -34.4693354,
 -31.1999142,
 42.6511674,
 -34.0263348,
 47.9015941,
 -32.815,
 -30.2962407]

In [10]:
wind_06.head()

Unnamed: 0,date,location,windgustdir,windgustspeed,winddir9am,winddir3pm,windspeed9am,windspeed3pm
0,2015-01-22,Pearce RAAF,SW,37.0,SSE,SW,19.0,24.0
1,2015-01-22,Perth Airport,W,37.0,ESE,WSW,15.0,24.0
2,2015-01-22,Perth,SW,35.0,SE,SW,13.0,20.0
3,2015-01-22,Salmon Gums,S,41.0,S,S,20.0,22.0
4,2015-01-22,Walpole,S,35.0,SW,SSW,9.0,19.0


In [1]:
## clear jupyter memory
import gc

In [2]:
gc.collect()

271