In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, learning_curve, cross_val_score, KFold
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer


In [18]:
rain = pd.read_csv('data/data_backup.csv')

In [19]:
rain.sample(3)

Unnamed: 0,date,location,mintemp,maxtemp,rainfall,evaporation,sunshine,humidity9am,humidity3pm,pressure9am,...,wind_dir9am,wind_dir3pm,wind_speed9am,wind_speed3pm,month,season,loc_rainto_mean,wind_rainto_mean,location_lat,location_lon
101563,2013-08-06,Nuriootpa,11.7,13.2,0.8,4.8,1.5,92.0,95.0,1013.2,...,WSW,SW,24.0,24.0,8,Winter,0.1972,0.2666,-34.469335,138.993901
130513,2010-09-13,Launceston,9.1,14.0,1.8,,,57.0,35.0,1007.1,...,WNW,WNW,11.0,19.0,9,Spring,0.2308,0.2666,-41.434081,147.13735
39316,2010-05-19,Williamtown,10.2,19.9,7.2,1.2,2.8,90.0,70.0,1021.5,...,W,SSW,13.0,20.0,5,Autumn,0.2742,0.2254,-32.815,151.842778


In [20]:
rain.shape

(142193, 35)

In [21]:
rain.isna().sum()

date                    0
location                0
mintemp               637
maxtemp               322
rainfall             1406
evaporation         60843
sunshine            67816
humidity9am          1774
humidity3pm          3610
pressure9am         14014
pressure3pm         13981
cloud9am            53657
cloud3pm            57094
temp9am               904
temp3pm              2726
raintoday            1406
amountOfRain            0
raintomorrow            0
temp                  322
humidity             3610
precipitation3pm        0
precipitation9am        0
modelo_vigente          0
wind_gustdir         9330
wind_gustspeed       9270
wind_dir9am         10013
wind_dir3pm          3778
wind_speed9am        1348
wind_speed3pm        2630
month                   0
season                  0
loc_rainto_mean         0
wind_rainto_mean     9330
location_lat            0
location_lon            0
dtype: int64

In [22]:
rain.loc[rain['evaporation'].isna()].head()

Unnamed: 0,date,location,mintemp,maxtemp,rainfall,evaporation,sunshine,humidity9am,humidity3pm,pressure9am,...,wind_dir9am,wind_dir3pm,wind_speed9am,wind_speed3pm,month,season,loc_rainto_mean,wind_rainto_mean,location_lat,location_lon
0,2008-12-01,Albury,13.4,22.9,0.6,,,71.0,22.0,1007.7,...,W,WNW,20.0,24.0,12,Summer,0.2052,0.2666,-36.080477,146.91628
1,2008-12-02,Albury,7.4,25.1,0.0,,,44.0,25.0,1010.6,...,NNW,WSW,4.0,22.0,12,Summer,0.2052,0.282,-36.080477,146.91628
2,2008-12-03,Albury,12.9,25.7,0.0,,,38.0,30.0,1007.6,...,W,WSW,19.0,26.0,12,Summer,0.2052,0.2342,-36.080477,146.91628
3,2008-12-04,Albury,9.2,28.0,0.0,,,45.0,16.0,1017.6,...,SE,E,11.0,9.0,12,Summer,0.2052,0.1868,-36.080477,146.91628
4,2008-12-05,Albury,17.5,32.3,1.0,,,82.0,33.0,1010.8,...,ENE,NW,7.0,20.0,12,Summer,0.2052,0.2666,-36.080477,146.91628


In [23]:
rain['evaporation'].fillna(0, inplace=True)

In [24]:
rain.loc[rain['sunshine'].isna()].head()

Unnamed: 0,date,location,mintemp,maxtemp,rainfall,evaporation,sunshine,humidity9am,humidity3pm,pressure9am,...,wind_dir9am,wind_dir3pm,wind_speed9am,wind_speed3pm,month,season,loc_rainto_mean,wind_rainto_mean,location_lat,location_lon
0,2008-12-01,Albury,13.4,22.9,0.6,0.0,,71.0,22.0,1007.7,...,W,WNW,20.0,24.0,12,Summer,0.2052,0.2666,-36.080477,146.91628
1,2008-12-02,Albury,7.4,25.1,0.0,0.0,,44.0,25.0,1010.6,...,NNW,WSW,4.0,22.0,12,Summer,0.2052,0.282,-36.080477,146.91628
2,2008-12-03,Albury,12.9,25.7,0.0,0.0,,38.0,30.0,1007.6,...,W,WSW,19.0,26.0,12,Summer,0.2052,0.2342,-36.080477,146.91628
3,2008-12-04,Albury,9.2,28.0,0.0,0.0,,45.0,16.0,1017.6,...,SE,E,11.0,9.0,12,Summer,0.2052,0.1868,-36.080477,146.91628
4,2008-12-05,Albury,17.5,32.3,1.0,0.0,,82.0,33.0,1010.8,...,ENE,NW,7.0,20.0,12,Summer,0.2052,0.2666,-36.080477,146.91628


In [25]:
rain['sunshine'].fillna(0, inplace=True)

In [26]:
rain.loc[rain['cloud9am'].isna()].head().T

Unnamed: 0,1,2,3,5,7
date,2008-12-02,2008-12-03,2008-12-04,2008-12-06,2008-12-08
location,Albury,Albury,Albury,Albury,Albury
mintemp,7.4,12.9,9.2,14.6,7.7
maxtemp,25.1,25.7,28,29.7,26.7
rainfall,0,0,0,0.2,0
evaporation,0,0,0,0,0
sunshine,0,0,0,0,0
humidity9am,44,38,45,55,48
humidity3pm,25,30,16,23,19
pressure9am,1010.6,1007.6,1017.6,1009.2,1013.4


In [27]:
rain['cloud9am'].fillna(0, inplace=True)

In [28]:
rain['cloud3pm'].fillna(0, inplace=True)

In [29]:
rain.isna().sum()

date                    0
location                0
mintemp               637
maxtemp               322
rainfall             1406
evaporation             0
sunshine                0
humidity9am          1774
humidity3pm          3610
pressure9am         14014
pressure3pm         13981
cloud9am                0
cloud3pm                0
temp9am               904
temp3pm              2726
raintoday            1406
amountOfRain            0
raintomorrow            0
temp                  322
humidity             3610
precipitation3pm        0
precipitation9am        0
modelo_vigente          0
wind_gustdir         9330
wind_gustspeed       9270
wind_dir9am         10013
wind_dir3pm          3778
wind_speed9am        1348
wind_speed3pm        2630
month                   0
season                  0
loc_rainto_mean         0
wind_rainto_mean     9330
location_lat            0
location_lon            0
dtype: int64

In [30]:
rain['mintemp'] = rain['mintemp'].astype('float')

In [31]:
rain['mintemp'].fillna(rain['mintemp'].mean(), inplace=True)

In [32]:
rain['mintemp'].dtype

dtype('float64')

In [33]:
rain['maxtemp'] = rain['maxtemp'].astype('float')

In [34]:
rain['maxtemp'].fillna(rain['maxtemp'].mean(), inplace=True)

In [35]:
rain['humidity'] = rain['humidity'].astype('float')

In [36]:
rain['humidity'].fillna(rain['humidity'].mean(), inplace=True)

In [37]:
rain['humidity9am'] = rain['humidity9am'].astype('float')

In [38]:
rain['humidity9am'].fillna(rain['humidity9am'].mean(), inplace=True)

In [39]:
rain['humidity3pm'] = rain['humidity3pm'].astype('float')

In [40]:
rain['humidity3pm'].fillna(rain['humidity3pm'].mean(), inplace=True)

In [41]:
rain['rainfall'].fillna(0, inplace=True)

In [42]:
rain.isna().sum()

date                    0
location                0
mintemp                 0
maxtemp                 0
rainfall                0
evaporation             0
sunshine                0
humidity9am             0
humidity3pm             0
pressure9am         14014
pressure3pm         13981
cloud9am                0
cloud3pm                0
temp9am               904
temp3pm              2726
raintoday            1406
amountOfRain            0
raintomorrow            0
temp                  322
humidity                0
precipitation3pm        0
precipitation9am        0
modelo_vigente          0
wind_gustdir         9330
wind_gustspeed       9270
wind_dir9am         10013
wind_dir3pm          3778
wind_speed9am        1348
wind_speed3pm        2630
month                   0
season                  0
loc_rainto_mean         0
wind_rainto_mean     9330
location_lat            0
location_lon            0
dtype: int64

In [44]:
rain['raintoday'].fillna(0, inplace=True)

In [45]:
rain.loc[rain['pressure9am'].isna()].head()

Unnamed: 0,date,location,mintemp,maxtemp,rainfall,evaporation,sunshine,humidity9am,humidity3pm,pressure9am,...,wind_dir9am,wind_dir3pm,wind_speed9am,wind_speed3pm,month,season,loc_rainto_mean,wind_rainto_mean,location_lat,location_lon
3350,2009-12-12,Badgerys Creek,12.1864,32.6,0.0,0.0,0.0,68.84381,51.482606,,...,,,,,12,Summer,0.1991,0.1952,-35.834879,149.995766
3376,2010-01-09,Badgerys Creek,12.1864,38.6,0.0,0.0,0.0,68.84381,23.0,,...,,WSW,,17.0,1,Summer,0.1991,0.1488,-35.834879,149.995766
3395,2010-01-30,Badgerys Creek,19.6,28.2,0.0,0.0,0.0,68.84381,51.482606,,...,,,,,1,Summer,0.1991,0.1868,-35.834879,149.995766
3399,2010-02-04,Badgerys Creek,21.3,27.5,0.0,0.0,0.0,68.84381,69.0,,...,,ENE,,24.0,2,Summer,0.1991,0.1612,-35.834879,149.995766
3406,2010-02-13,Badgerys Creek,20.9,25.0,0.0,0.0,0.0,68.84381,77.0,,...,,NNE,,7.0,2,Summer,0.1991,0.2254,-35.834879,149.995766


In [49]:
rain['pressure9am'] = rain['pressure9am'].astype('float')

In [50]:
rain['pressure9am'].fillna(rain['pressure9am'].mean(), inplace=True)

In [51]:
rain['pressure3pm'] = rain['pressure3pm'].astype('float')

In [52]:
rain['pressure3pm'].fillna(rain['pressure3pm'].mean(), inplace=True)

In [56]:
rain['temp9am'] = rain['temp9am'].astype('float')
rain['temp9am'].fillna(rain['temp9am'].mean(), inplace=True)

rain['temp3pm'] = rain['temp3pm'].astype('float')
rain['temp3pm'].fillna(rain['temp3pm'].mean(), inplace=True)

rain['temp'] = rain['temp'].astype('float')
rain['temp'].fillna(rain['temp'].mean(), inplace=True)

rain['wind_gustspeed'] = rain['wind_gustspeed'].astype('float')
rain['wind_gustspeed'].fillna(rain['wind_gustspeed'].mean(), inplace=True)

rain['wind_speed9am'] = rain['wind_speed9am'].astype('float')
rain['wind_speed9am'].fillna(rain['wind_speed9am'].mean(), inplace=True)

rain['wind_speed3pm'] = rain['wind_speed3pm'].astype('float')
rain['wind_speed3pm'].fillna(rain['wind_speed3pm'].mean(), inplace=True)

In [57]:
rain.isna().sum()

date                    0
location                0
mintemp                 0
maxtemp                 0
rainfall                0
evaporation             0
sunshine                0
humidity9am             0
humidity3pm             0
pressure9am             0
pressure3pm             0
cloud9am                0
cloud3pm                0
temp9am                 0
temp3pm                 0
raintoday               0
amountOfRain            0
raintomorrow            0
temp                    0
humidity                0
precipitation3pm        0
precipitation9am        0
modelo_vigente          0
wind_gustdir         9330
wind_gustspeed          0
wind_dir9am         10013
wind_dir3pm          3778
wind_speed9am           0
wind_speed3pm           0
month                   0
season                  0
loc_rainto_mean         0
wind_rainto_mean     9330
location_lat            0
location_lon            0
dtype: int64

In [66]:
def season_group(data):
    """
    Function to create a new column of seasons groups
    Selecting DataSet rows and classifing the season by the month
    Summer = 1
    Autumn = 2
    Winter = 3
    Spring = 4
    """
    seasons= []
    data['date'] = pd.to_datetime(data['date'])
    
    for x in range(len(data['date'])):
        if ((data['date'][x].month) == 12) | ((data['date'][x].month) == 1) | ((data['date'][x].month) == 2):
            seasons.append(1)
        elif ((data['date'][x].month) == 3) | ((data['date'][x].month) == 4) | ((data['date'][x].month) == 5):
            seasons.append(2)
        elif ((data['date'][x].month) == 6) | ((data['date'][x].month) == 7) | ((data['date'][x].month) == 8):
            seasons.append(3)
        elif ((data['date'][x].month) == 9) | ((data['date'][x].month) == 10) | ((data['date'][x].month) == 11):
            seasons.append(4)
    data['season'] = seasons

In [67]:
season_group(rain)

In [73]:
rain.head().T

Unnamed: 0,0,1,2,3,4
date,2008-12-01 00:00:00,2008-12-02 00:00:00,2008-12-03 00:00:00,2008-12-04 00:00:00,2008-12-05 00:00:00
location,Albury,Albury,Albury,Albury,Albury
mintemp,13.4,7.4,12.9,9.2,17.5
maxtemp,22.9,25.1,25.7,28,32.3
rainfall,0.6,0,0,0,1
evaporation,0,0,0,0,0
sunshine,0,0,0,0,0
humidity9am,71,44,38,45,82
humidity3pm,22,25,30,16,33
pressure9am,1007.7,1010.6,1007.6,1017.6,1010.8


In [74]:
X = rain.drop(['raintomorrow', 'wind_gustdir', 'wind_dir9am', 'wind_dir3pm', 'location', 'date'], axis=1)
y = rain.raintomorrow

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [79]:
forest = RandomForestClassifier(max_depth=2, n_estimators=100, n_jobs=-1)

results = cross_val_score(forest, X_train, y_train, cv=KFold(10, shuffle=True), scoring='roc_auc')
(np.mean(results), np.std(results))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').



(nan, nan)

In [72]:
tree.fit(X_train, y_train)
pd.DataFrame(zip(tree.feature_importances_, X_train.columns), columns=['importance','variable']).sort_values(by='importance')

TypeError: invalid type promotion