In [0]:
"""Importing all the relevant libraries we need."""

import pandas as pd # For data frames
import numpy as np # For math and efficient arrays
import matplotlib.pyplot as plt # For plotting and visualizing data

# So plots appear in this notebook
%matplotlib inline

In [0]:
# Load the weatherAUS dataset straight from the hosted CSV, then summarise the
# numeric columns (count, mean, spread, quartiles) as a first look at the data.
data_url = 'https://raw.githubusercontent.com/Mvalentino92/MachineLearning/master/weatherAUS.csv'
rain_data = pd.read_csv(data_url)
rain_data.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RISK_MM
count,141556.0,141871.0,140787.0,81350.0,74377.0,132923.0,140845.0,139563.0,140419.0,138583.0,128179.0,128212.0,88536.0,85099.0,141289.0,139467.0,142193.0
mean,12.1864,23.226784,2.349974,5.469824,7.624853,39.984292,14.001988,18.637576,68.84381,51.482606,1017.653758,1015.258204,4.437189,4.503167,16.987509,21.687235,2.360682
std,6.403283,7.117618,8.465173,4.188537,3.781525,13.588801,8.893337,8.803345,19.051293,20.797772,7.105476,7.036677,2.887016,2.720633,6.492838,6.937594,8.477969
min,-8.5,-4.8,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,980.5,977.1,0.0,0.0,-7.2,-5.4,0.0
25%,7.6,17.9,0.0,2.6,4.9,31.0,7.0,13.0,57.0,37.0,1012.9,1010.4,1.0,2.0,12.3,16.6,0.0
50%,12.0,22.6,0.0,4.8,8.5,39.0,13.0,19.0,70.0,52.0,1017.6,1015.2,5.0,5.0,16.7,21.1,0.0
75%,16.8,28.2,0.8,7.4,10.6,48.0,19.0,24.0,83.0,66.0,1022.4,1020.0,7.0,7.0,21.6,26.4,0.8
max,33.9,48.1,371.0,145.0,14.5,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.7,371.0


In [0]:
# Report the overall size of the raw table.
n_rows, n_cols = rain_data.shape
print('There are',n_rows,'rows and',n_cols,'columns')

There are 142193 rows and 24 columns


In [0]:
# Columns excluded from the feature matrix: Date is treated as uninformative,
# and RISK_MM is removed as well.
# NOTE(review): RISK_MM looks like it is derived from the next day's rainfall,
# which would leak the target - confirm against the dataset documentation.
dropped_features = ['Date','RISK_MM']

# The target column is held separately as the label and never enters the features.
label = rain_data['RainTomorrow']
features = rain_data.drop(columns=dropped_features + ['RainTomorrow'])

In [0]:
# Partition the feature columns by dtype: anything stored as 'object' is
# treated as categorical, every other column as numerical.
cols = features.columns
features_dtypes = features.dtypes
cat_bool = features_dtypes == 'object'
num_bool = ~cat_bool  # the two masks are exact complements of each other

# Boolean column selection; each result keeps all rows of `features`.
num_features = features.loc[:, num_bool]
cat_features = features.loc[:, cat_bool]

In [0]:
# Sanity check on the split: list the dtypes that landed in each frame.
# (Heading text is kept exactly as originally written.)
for heading, frame in (('Numerical types:\n', num_features),
                       ('\n\nCatergorical types:\n', cat_features)):
    print(heading,frame.dtypes)

Numerical types:
 MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustSpeed    float64
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
dtype: object


Catergorical types:
 Location       object
WindGustDir    object
WindDir9am     object
WindDir3pm     object
RainToday      object
dtype: object


In [0]:
# Percentage of missing values per column: categorical columns first, then a
# blank separator line, then the numerical columns. Every percentage is taken
# relative to the total number of rows in `features`.
total_rows = features.shape[0]

for position, frame in enumerate((cat_features, num_features)):
    if position:
        print()  # blank line between the categorical and numerical listings
    for column in frame:
        missing_pct = frame[column].isna().sum()/total_rows*100
        print(column,'has',missing_pct,'percent nan values')

Location has 0.0 percent nan values
WindGustDir has 6.561504434114197 percent nan values
WindDir9am has 7.041837502549352 percent nan values
WindDir3pm has 2.6569521706413113 percent nan values
RainToday has 0.9887969168665124 percent nan values

MinTemp has 0.44798267143952236 percent nan values
MaxTemp has 0.2264527789694289 percent nan values
Rainfall has 0.9887969168665124 percent nan values
Evaporation has 42.78902618272348 percent nan values
Sunshine has 47.692924405561456 percent nan values
WindGustSpeed has 6.5193082641198945 percent nan values
WindSpeed9am has 0.948007285872019 percent nan values
WindSpeed3pm has 1.8495987847503041 percent nan values
Humidity9am has 1.247600092831574 percent nan values
Humidity3pm has 2.5388028946572616 percent nan values
Pressure9am has 9.85561877166949 percent nan values
Pressure3pm has 9.832410878172624 percent nan values
Cloud9am has 37.73533155640573 percent nan values
Cloud3pm has 40.15246882757942 percent nan values
Temp9am has 0.635755

In [0]:
# These four columns are each missing in roughly 38-48% of the rows (see the
# percentages printed above), so they are removed rather than imputed.
drop_nans = ['Sunshine','Cloud9am','Cloud3pm','Evaporation']
num_features = num_features.drop(columns=drop_nans)

# Keep the running list of every excluded column up to date (extends in place).
dropped_features += drop_nans

# Show which numerical columns remain.
num_features.columns

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed', 'WindSpeed9am',
       'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am',
       'Pressure3pm', 'Temp9am', 'Temp3pm'],
      dtype='object')

In [0]:
# Flag every row that has a missing value in at least one categorical column
# (True = row would be discarded), then report what share of the data that is.
master_bool = cat_features.isna().any(axis=1).values
print('Would lose',master_bool.sum()/total_rows*100,'percent of data')

Would lose 12.998530166745198 percent of data


In [0]:
# Losing roughly 13% of the rows is acceptable, so remove them everywhere
# (categorical features, numerical features and labels).
# master_bool currently marks the rows to discard; invert it so that True means
# "keep", then apply the same mask to all three pieces to keep them row-aligned.
# NOTE: this cell reassigns master_bool from itself, so running it a second
# time without re-running the cells above would flip the mask back.
master_bool = ~master_bool
cat_features = cat_features.loc[master_bool]
num_features = num_features.loc[master_bool]
label = label.loc[master_bool]

In [0]:
# One-hot encode the categorical columns now that rows with missing
# categorical values have been removed.
from sklearn.preprocessing import OneHotEncoder
# Use the encoder's default sparse output and densify it explicitly. The
# `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so passing it raises a TypeError on current releases; calling
# .toarray() yields the same dense matrix on old and new versions alike.
onehot = OneHotEncoder()
cat_X = onehot.fit_transform(cat_features).toarray()

# Now split the encoded categorical matrix into train and test portions.
from sklearn.model_selection import train_test_split

# A single fixed seed (together with the identical test_size) is reused by every
# train_test_split call in this notebook so that the separately split
# categorical, numerical and label pieces are partitioned the same way.
seed = 23
train_cat_X, test_cat_X = train_test_split(cat_X,test_size=0.20,random_state=seed)

In [0]:
from sklearn.impute import SimpleImputer

# Split the numerical columns with the same seed and test_size used for the
# categorical matrix, so both pieces are partitioned identically.
train_num_features, test_num_features = train_test_split(
    num_features, test_size=0.20, random_state=seed
)

# Learn the per-column means from the training portion only, then use those
# training means to fill the gaps in both the train and the test portion.
imputer = SimpleImputer(strategy='mean').fit(train_num_features)

# transform() hands back plain arrays, so from here on we work with matrices.
train_num_X, test_num_X = (
    imputer.transform(part) for part in (train_num_features, test_num_features)
)

In [0]:
# Encode the target as integers: 1 where the label is 'Yes', 0 otherwise.
y = label.eq('Yes').astype(int)

# Split with the same seed and test_size as the feature matrices so the labels
# line up with their rows.
train_y, test_y = train_test_split(y, random_state=seed, test_size=0.20)

In [0]:
# Assemble the final design matrices column-wise: the imputed numerical columns
# first, followed by the one-hot categorical columns, for train and test alike.
train_X = np.concatenate((train_num_X, train_cat_X), axis=1)
test_X = np.concatenate((test_num_X, test_cat_X), axis=1)

In [0]:
# Sanity check: features and labels must agree on the number of rows per split.
shape_report = (
    ('Train features shape:', train_X),
    ('Test features shape:', test_X),
    ('Train labels shape:', train_y),
    ('Test labels shape:', test_y),
)
for caption, data in shape_report:
    print(caption,data.shape)

Train features shape: (98968, 109)
Test features shape: (24742, 109)
Train labels shape: (98968,)
Test labels shape: (24742,)


In [0]:
# Standardise every column using statistics learned from the training set only,
# then apply that same fitted transformation to both train and test.
from sklearn.preprocessing import MinMaxScaler  # imported but not used in this cell
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(train_X)
train_X, test_X = scaler.transform(train_X), scaler.transform(test_X)

In [0]:
# First model: a plain logistic regression with otherwise default settings,
# fitted on the scaled training data and scored on the held-out test set.
from sklearn.linear_model import LogisticRegression
lreg = LogisticRegression(
    random_state=seed, max_iter=1000, solver='newton-cg'
).fit(train_X, train_y)
lreg.score(test_X, test_y)

0.8568426157950044

In [0]:
# Reference point: accuracy obtained by guessing 0/1 uniformly at random for
# every test row. The generator is seeded with the notebook-wide `seed` so the
# figure is reproducible from one run to the next.
# NOTE(review): a coin flip scores about 50% regardless of class balance; if the
# classes are imbalanced, always predicting the majority class is the more
# informative baseline - consider reporting that alongside this number.
rand_y = np.random.RandomState(seed).randint(0,2,size=test_y.size)
np.sum(rand_y == test_y)/test_y.size

0.5054563091100154