In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, roc_auc_score, plot_roc_curve
import warnings
warnings.simplefilter("ignore")
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
from sklearn.linear_model import LogisticRegression

In [3]:
# Source

rain = pd.read_csv('rain_data_aus.csv')
wind_table_01 = pd.read_csv('wind_table_01.csv')

In [4]:
rain.columns

Index(['date', 'location', 'mintemp', 'maxtemp', 'rainfall', 'evaporation',
       'sunshine', 'humidity9am', 'humidity3pm', 'pressure9am', 'pressure3pm',
       'cloud9am', 'cloud3pm', 'temp9am', 'temp3pm', 'raintoday',
       'amountOfRain', 'raintomorrow', 'temp', 'humidity', 'precipitation3pm',
       'precipitation9am', 'modelo_vigente'],
      dtype='object')

In [5]:

'''
# Training

X = rain[['rainfall', 'humidity3pm', 'pressure3pm', 'temp3pm', 'maxtemp']]
y = rain['raintomorrow']

'''


"\n# Training\n\nX = rain[['rainfall', 'humidity3pm', 'pressure3pm', 'temp3pm', 'maxtemp']]\ny = rain['raintomorrow']\n\n"

In [6]:


'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y)
'''


'\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y)\n'

In [7]:

'''
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.15, stratify=y_train)
'''


'\nX_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.15, stratify=y_train)\n'

In [8]:

'''
# Classification

cat_columns = list(X_train.select_dtypes('object').columns)
num_columns = list(X_train.select_dtypes(exclude='object').columns)
'''


"\n# Classification\n\ncat_columns = list(X_train.select_dtypes('object').columns)\nnum_columns = list(X_train.select_dtypes(exclude='object').columns)\n"

In [9]:
# Categorical pipeline components

categorical_imputer = SimpleImputer(strategy='most_frequent')
onehotencoder = OneHotEncoder(drop='first',  handle_unknown='error')

In [10]:
# Categorical pipeline

categorical_pipeline = Pipeline(steps=[('missing', categorical_imputer),
                                       ('encode', onehotencoder)])

In [11]:
# Numerical pipeline components

numerical_imputer_knn = KNNImputer()

In [12]:
# Numerical pipeline

numerical_pipeline = Pipeline(steps=[('missing', numerical_imputer_knn)])

In [13]:
# Columns transform
'''
dataprep = ColumnTransformer(transformers=[('numerical_transform', numerical_pipeline, num_columns),
                                           ('categ_transform', categorical_pipeline, cat_columns)])'''

"\ndataprep = ColumnTransformer(transformers=[('numerical_transform', numerical_pipeline, num_columns),\n                                           ('categ_transform', categorical_pipeline, cat_columns)])"

In [14]:
# Model

scaler = StandardScaler()
model_lgbm = LGBMClassifier()

'''

pipeline = Pipeline(steps=[('data', dataprep),
                           ('scaler', StandardScaler()),
                           ('model', model_lgbm)])
                           
'''

"\n\npipeline = Pipeline(steps=[('data', dataprep),\n                           ('scaler', StandardScaler()),\n                           ('model', model_lgbm)])\n                           \n"

In [15]:
cities = rain['location'].unique()

In [16]:
cities = list(rain['location'].unique())

In [45]:
prediction_r = []
score_r =[]

for i in cities:
    
    X = rain[(rain['location'] == i)][[ 'humidity3pm', 'humidity9am', 'pressure3pm', 'pressure9am', 'temp3pm', 'temp9am']]
    y = rain[(rain['location'] == i)]['raintomorrow']
                                      
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.20, stratify=y_train)
                                          
    cat_columns = list(X_train.select_dtypes('object').columns)
    num_columns = list(X_train.select_dtypes(exclude='object').columns)
                                      
    dataprep = ColumnTransformer(transformers=[('numerical_transform', numerical_pipeline, num_columns),
                                               ('categ_transform', categorical_pipeline, cat_columns)])
    
    pipeline = Pipeline(steps=[('data', dataprep),
                               ('scaler', StandardScaler()),
                               ('model', model_lgbm)])
                                      
    pipeline.fit(X_train, y_train)
                                      
    prediction_v = pipeline.predict(X_valid)
    prediction_r.append(prediction_v)
    
    score_v = pipeline.score(X_valid, y_valid)
    print(i)
    print(X.isna().mean())
    print(score_v)
    score_r.append(score_v)
    
print(sum(score_r)/len(score_r))   

                                      
    

Albury
humidity3pm    0.001328
humidity9am    0.001328
pressure3pm    0.001328
pressure9am    0.000000
temp3pm        0.001328
temp9am        0.001328
dtype: float64
0.8515625
BadgerysCreek
humidity3pm    0.010587
humidity9am    0.009904
pressure3pm    0.050546
pressure9am    0.048839
temp3pm        0.006831
temp9am        0.005464
dtype: float64
0.8293172690763052
Cobar
humidity3pm    0.038487
humidity9am    0.038153
pressure3pm    0.008367
pressure9am    0.009036
temp3pm        0.002008
temp9am        0.002008
dtype: float64
0.889763779527559
CoffsHarbour
humidity3pm    0.004064
humidity9am    0.002370
pressure3pm    0.090078
pressure9am    0.089739
temp3pm        0.001355
temp9am        0.001016
dtype: float64
0.8127490039840638
Moree
humidity3pm    0.001752
humidity9am    0.001402
pressure3pm    0.000350
pressure9am    0.000000
temp3pm        0.000000
temp9am        0.000000
dtype: float64
0.8969072164948454
Newcastle
humidity3pm    0.292724
humidity9am    0.060914
pressure3pm    1

AliceSprings
humidity3pm    0.00165
humidity9am    0.00099
pressure3pm    0.00099
pressure9am    0.00033
temp3pm        0.00066
temp9am        0.00000
dtype: float64
0.939922480620155
Darwin
humidity3pm    0.000627
humidity9am    0.000313
pressure3pm    0.000627
pressure9am    0.000313
temp3pm        0.000627
temp9am        0.000000
dtype: float64
0.8324125230202578
Katherine
humidity3pm    0.527261
humidity9am    0.049391
pressure3pm    0.003207
pressure9am    0.000641
temp3pm        0.455420
temp9am        0.024375
dtype: float64
0.8452830188679246
Uluru
humidity3pm    0.000657
humidity9am    0.001972
pressure3pm    0.000657
pressure9am    0.001972
temp3pm        0.000657
temp9am        0.001972
dtype: float64
0.9382239382239382
0.8394840283041541


In [18]:
print(sum(score_r)/len(score_r))

0.8356873478951429


In [46]:
trial_r = rain[rain['location'] == 'NorfolkIsland'][['date', 'humidity3pm', 'humidity9am', 'pressure3pm', 'pressure9am', 'temp3pm', 'temp9am', 'raintomorrow']]

In [47]:
wind_table_02 = pd.read_csv('wind_table_02.csv')
wind_table_03 = pd.read_csv('wind_table_03.csv')
wind_table_04 = pd.read_csv('wind_table_04.csv')
wind_table_05 = pd.read_csv('wind_table_05.csv')
wind_table_06 = pd.read_csv('wind_table_06.csv')
wind_table_07 = pd.read_csv('wind_table_07.csv')
wind_table_08 = pd.read_csv('wind_table_08.csv')

In [48]:
trial_w = wind_table_01[wind_table_01['location'] == 'NorfolkIsland']

In [49]:
trial_w = trial_w.append(wind_table_02[wind_table_02['location'] == 'NorfolkIsland'])
trial_w = trial_w.append(wind_table_03[wind_table_03['location'] == 'NorfolkIsland'])
trial_w = trial_w.append(wind_table_04[wind_table_04['location'] == 'NorfolkIsland'])
trial_w = trial_w.append(wind_table_05[wind_table_05['location'] == 'NorfolkIsland'])
trial_w = trial_w.append(wind_table_06[wind_table_06['location'] == 'NorfolkIsland'])
trial_w = trial_w.append(wind_table_07[wind_table_07['location'] == 'NorfolkIsland'])
trial_w = trial_w.append(wind_table_08[wind_table_08['location'] == 'NorfolkIsland'])

In [50]:
trial_c = trial_r.merge(trial_w, left_on='date', right_on='date')

In [51]:
trial_c.drop('location', axis = 1, inplace = True)

In [52]:
trial_c.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3382 entries, 0 to 3381
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            3382 non-null   object 
 1   humidity3pm     3380 non-null   float64
 2   humidity9am     3376 non-null   float64
 3   pressure3pm     3380 non-null   float64
 4   pressure9am     3373 non-null   float64
 5   temp3pm         3380 non-null   float64
 6   temp9am         3376 non-null   float64
 7   raintomorrow    3382 non-null   object 
 8   wind_gustdir    829 non-null    object 
 9   wind_gustspeed  830 non-null    float64
 10  wind_dir9am     821 non-null    object 
 11  wind_dir3pm     831 non-null    object 
 12  wind_speed9am   828 non-null    float64
 13  wind_speed3pm   832 non-null    float64
 14  windgustdir     2500 non-null   object 
 15  windgustspeed   2500 non-null   float64
 16  winddir9am      2527 non-null   object 
 17  winddir3pm      2546 non-null   o

In [53]:
X = trial_c[['humidity3pm', 'humidity9am', 'pressure3pm', 'pressure9am', 'temp3pm', 'temp9am', 'wind_speed9am', 'windspeed9am',
            'wind_speed3pm', 'windspeed3pm']]
y = trial_c['raintomorrow']
                                      
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.20, stratify=y_train)
                                          
cat_columns = list(X_train.select_dtypes('object').columns)
num_columns = list(X_train.select_dtypes(exclude='object').columns)
                                      
dataprep = ColumnTransformer(transformers=[('numerical_transform', numerical_pipeline, num_columns),
                                            ('categ_transform', categorical_pipeline, cat_columns)])
    
pipeline = Pipeline(steps=[('data', dataprep),
                            ('scaler', StandardScaler()),
                            ('model', model_lgbm)])
                                      
pipeline.fit(X_train, y_train)
                                      
prediction_v = pipeline.predict(X_valid)
    
score_v = pipeline.score(X_valid, y_valid)

print(X.isna().mean())
print(score_v)
    

humidity3pm      0.000591
humidity9am      0.001774
pressure3pm      0.000591
pressure9am      0.002661
temp3pm          0.000591
temp9am          0.001774
wind_speed9am    0.755174
windspeed9am     0.247487
wind_speed3pm    0.753992
windspeed3pm     0.246600
dtype: float64
0.8104347826086956


In [61]:
prediction_r = []
score_r =[]

for i in cities:
    
    trial_r = rain[rain['location'] == i][['date', 'humidity3pm', 'humidity9am', 'pressure3pm', 'pressure9am', 'temp3pm', 'temp9am', 'raintomorrow']]
    
    trial_w = wind_table_01[wind_table_01['location'] == i]
    trial_w = trial_w.append(wind_table_02[wind_table_02['location'] == i])
    trial_w = trial_w.append(wind_table_03[wind_table_03['location'] == i])
    trial_w = trial_w.append(wind_table_04[wind_table_04['location'] == i])
    trial_w = trial_w.append(wind_table_05[wind_table_05['location'] == i])
    trial_w = trial_w.append(wind_table_06[wind_table_06['location'] == i])
    trial_w = trial_w.append(wind_table_07[wind_table_07['location'] == i])
    trial_w = trial_w.append(wind_table_08[wind_table_08['location'] == i])
    
    trial_c = trial_r.merge(trial_w, left_on='date', right_on='date')
    
    
    X = trial_c[['humidity3pm', 'humidity9am', 'pressure3pm', 'pressure9am', 'temp3pm', 'temp9am', 'wind_speed9am', 'windspeed9am',
            'wind_speed3pm', 'windspeed3pm']]
    y = trial_c['raintomorrow']
                                      
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.20, stratify=y_train)
                                          
    cat_columns = list(X_train.select_dtypes('object').columns)
    num_columns = list(X_train.select_dtypes(exclude='object').columns)
                                      
    dataprep = ColumnTransformer(transformers=[('numerical_transform', numerical_pipeline, num_columns),
                                                ('categ_transform', categorical_pipeline, cat_columns)])
    
    pipeline = Pipeline(steps=[('data', dataprep),
                                ('scaler', StandardScaler()),
                                ('model', model_lgbm)])
                                      
    pipeline.fit(X_train, y_train)
                                      
    prediction_v = pipeline.predict(X_valid)
    
    score_v = pipeline.score(X_valid, y_valid)

    print(i)
    print(trial_c.shape)
    print(score_v)
    score_r.append(score_v)
    
print(sum(score_r)/len(score_r))


Albury
(3467, 21)
0.8661016949152542
BadgerysCreek
(3371, 21)
0.8900523560209425
Cobar
(3437, 21)
0.9145299145299145
CoffsHarbour
(3414, 21)
0.8175559380378657
Moree
(3315, 21)
0.9060283687943262
Newcastle
(3402, 21)
0.7996545768566494
NorahHead
(3390, 21)
0.8318890814558059
NorfolkIsland
(3382, 21)
0.8069565217391305
Penrith
(3414, 21)
0.8691910499139415
Richmond
(3411, 21)
0.8896551724137931
Sydney
(3798, 21)
0.848297213622291
SydneyAirport
(3463, 21)
0.8353140916808149
WaggaWagga
(3404, 21)
0.9205526770293609
Williamtown
(2986, 21)
0.8307086614173228
Wollongong
(3434, 21)
0.8561643835616438
Canberra
(3877, 21)
0.8679817905918058
Tuggeranong
(3455, 21)
0.8724489795918368
MountGinini
(3367, 21)
0.8202443280977313
Ballarat
(3489, 21)
0.851602023608769
Bendigo
(3491, 21)
0.8855218855218855
Sale
(3457, 21)
0.8622448979591837
MelbourneAirport
(3470, 21)
0.8661016949152542
Melbourne
(2857, 21)
0.8148148148148148
Mildura
(3468, 21)
0.9338983050847458
Nhil
(2031, 21)
0.9335260115606936
Portl

In [59]:
rain[rain['location'] == 'Uluru']

Unnamed: 0,date,location,mintemp,maxtemp,rainfall,evaporation,sunshine,humidity9am,humidity3pm,pressure9am,...,temp9am,temp3pm,raintoday,amountOfRain,raintomorrow,temp,humidity,precipitation3pm,precipitation9am,modelo_vigente
140672,2013-03-01,Uluru,19.7,30.0,0.8,,,76.0,54.0,1010.6,...,21.7,28.4,No,0.0,No,38.000000,66.8,10,5.880475,0.209739
140673,2013-03-02,Uluru,21.6,33.1,0.0,,,44.0,33.0,1010.5,...,24.6,31.3,No,0.0,No,41.720000,41.6,9,18.827567,0.065101
140674,2013-03-03,Uluru,21.3,36.1,0.0,,,39.0,27.0,1006.9,...,27.6,34.5,No,0.0,No,45.320000,34.4,8,9.866962,0.113213
140675,2013-03-04,Uluru,22.9,37.7,0.0,,,35.0,22.0,1006.0,...,28.7,35.4,No,0.0,No,47.240000,28.4,12,4.934891,0.137201
140676,2013-03-05,Uluru,24.0,39.0,0.0,,,33.0,21.0,1006.9,...,29.9,37.3,No,0.0,No,2.189369,27.2,7,5.248826,0.067973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,2017-06-20,Uluru,3.5,21.8,0.0,,,59.0,27.0,1024.7,...,9.4,20.9,No,0.0,No,28.160000,34.4,12,5.848681,0.002556
142189,2017-06-21,Uluru,2.8,23.4,0.0,,,51.0,24.0,1024.6,...,10.1,22.4,No,0.0,No,30.080000,30.8,10,6.653879,0.002053
142190,2017-06-22,Uluru,3.6,25.3,0.0,,,56.0,21.0,1023.5,...,10.9,24.5,No,0.0,No,32.360000,27.2,9,19.715976,0.023350
142191,2017-06-23,Uluru,5.4,26.9,0.0,,,53.0,24.0,1021.0,...,12.5,26.1,No,0.0,No,34.280000,30.8,12,0.985551,0.007195


In [64]:
rain[rain['location'] == 'NorfolkIsland']['raintomorrow'].value_counts()

No     2045
Yes     919
Name: raintomorrow, dtype: int64