In [225]:
#Load the csv file as data frame.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing

df = pd.read_csv('./weatherAUS.csv')
print('Size of weather data frame is :',df.shape)


bad_columns = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']

sdf = df[bad_columns].copy()


for i in range(6):
    le = preprocessing.LabelEncoder()
    le.fit(sdf[bad_columns[i]].astype(str))
    df[bad_columns[i]] = le.transform(sdf[bad_columns[i]].astype(str))

import seaborn as sns
import matplotlib.pyplot as plt
#sns.heatmap(df.corr().abs())
#plt.show()

numRain = df.loc[df['RainTomorrow'] == 1].shape[0]

df = pd.concat([df.loc[df['RainTomorrow'] == 1], df.loc[df['RainTomorrow'] == 0].sample(n=numRain)])
print(df.loc[df['RainTomorrow'] == 1].shape)
print(df.loc[df['RainTomorrow'] == 0].shape)

Size of weather data frame is : (142193, 24)
(31877, 24)
(31877, 24)


In [226]:
# We see there are some columns with null values. 
# Before we start pre-processing, let's find out which of the columns have maximum null values
df.count().sort_values()

Sunshine         33531
Evaporation      36287
Cloud3pm         39032
Cloud9am         40306
Pressure9am      57519
Pressure3pm      57545
WindGustSpeed    59346
Humidity3pm      62030
Temp3pm          62444
WindSpeed3pm     62519
Humidity9am      62834
Rainfall         62856
WindSpeed9am     63104
Temp9am          63252
MinTemp          63418
MaxTemp          63617
RainToday        63754
Date             63754
WindDir3pm       63754
WindDir9am       63754
WindGustDir      63754
Location         63754
RISK_MM          63754
RainTomorrow     63754
dtype: int64

In [227]:
# Features selection: We suppose that wind doesnt contribute to precipitation, so as the location and RISK_MM, 
# since we only want to predict wether it rains tommorrow in australian. The date is not important since we already 
# have RainTomorrow as target variable
#df = df.drop(columns=['WindDir9am','WindDir3pm','WindSpeed9am','WindSpeed3pm','WindGustDir','WindGustSpeed',
#                      'Location','RISK_MM','Date'],axis=1)
df = df.drop(columns=['Date', 'RISK_MM'],axis=1)
df.shape

(63754, 22)

In [228]:
#Let us get rid of all null values in df
df = df.dropna(how='any')
df.shape

(25916, 22)

In [229]:
#its time to remove the outliers in our data - we are using Z-score to detect and remove the outliers.
from scipy import stats
z = np.abs(stats.zscore(df._get_numeric_data()))
print(z)
df= df[(z < 3).all(axis=1)]
print(df.shape)
df[0:5]

[[1.22201422 1.17956147 1.99893258 ... 1.77934597 0.66002303 1.01783144]
 [1.22201422 1.24101008 1.02908447 ... 0.79305896 0.66002303 1.01783144]
 [1.22201422 1.0105778  1.7849955  ... 1.67781642 1.51509865 1.01783144]
 ...
 [0.58713737 0.08884867 1.49974606 ... 1.54727844 0.66002303 0.98248095]
 [1.22201422 1.49345299 0.86782432 ... 0.68637156 0.66002303 0.98248095]
 [1.14265462 0.68797261 0.78662245 ... 0.8945885  0.66002303 0.98248095]]
(24369, 22)


Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
5957,10,21.4,37.5,0.0,14.8,6.9,5,43.0,1,5,...,34.0,29.0,1013.1,1009.6,7.0,6.0,26.2,34.1,0,1
5960,10,21.8,30.7,0.0,8.0,5.9,14,56.0,3,3,...,71.0,63.0,1008.6,1006.2,7.0,7.0,24.4,27.3,0,1
5972,10,23.6,40.4,0.6,11.8,12.2,15,54.0,0,11,...,42.0,17.0,1008.4,1005.0,1.0,2.0,29.9,38.7,0,1
5982,10,16.8,23.3,0.6,8.0,2.3,0,63.0,0,0,...,66.0,53.0,1013.8,1012.2,7.0,7.0,18.5,22.4,0,1
5983,10,16.1,19.1,26.0,6.6,0.0,2,54.0,2,2,...,81.0,93.0,1014.3,1013.2,7.0,7.0,17.1,17.0,1,1


In [230]:
# Standardize data - using MinMaxScaler
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
scaler.fit(df)
df = pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)
df[0:5]

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
5957,0.155556,0.762178,0.828784,0.0,0.925,0.492857,0.3125,0.447368,0.0625,0.3125,...,0.223529,0.282828,0.430233,0.400932,0.875,0.75,0.720548,0.783505,0.0,1.0
5960,0.155556,0.773639,0.66005,0.0,0.5,0.421429,0.875,0.618421,0.1875,0.1875,...,0.658824,0.626263,0.325581,0.321678,0.875,0.875,0.671233,0.608247,0.0,1.0
5972,0.155556,0.825215,0.900744,0.020134,0.7375,0.871429,0.9375,0.592105,0.0,0.6875,...,0.317647,0.161616,0.32093,0.293706,0.125,0.25,0.821918,0.902062,0.0,1.0
5982,0.155556,0.630372,0.476427,0.020134,0.5,0.164286,0.0,0.710526,0.0,0.0,...,0.6,0.525253,0.446512,0.461538,0.875,0.875,0.509589,0.481959,0.0,1.0
5983,0.155556,0.610315,0.372208,0.872483,0.4125,0.0,0.125,0.592105,0.125,0.125,...,0.776471,0.929293,0.45814,0.484848,0.875,0.875,0.471233,0.342784,1.0,1.0


In [231]:
#now that we are done with the pre-processing part, let's see which are the important features for RainTomorrow!
#Using SelectKBest to get the top features!
from sklearn.feature_selection import SelectKBest,chi2,f_classif
X = df.loc[:,df.columns!='RainTomorrow']
y = df[['RainTomorrow']]
selector = SelectKBest(f_classif, k=5)
selector.fit(X, y)         # Run score function on (X, y) and get the appropriate features.
X_new = selector.transform(X) # Reduce X to the selected features. (numpy.ndarray)
print (selector.get_support(indices=True))
print(X.columns[selector.get_support(indices=True)]) #get_support Get a mask, or integer index, of the features selected

[ 5 13 16 17 20]
Index(['Sunshine', 'Humidity3pm', 'Cloud9am', 'Cloud3pm', 'RainToday'], dtype='object')


  y = column_or_1d(y, warn=True)


In [232]:
df = df[['Sunshine','RainTomorrow']] # rearrange columns
X = df[['Sunshine']] # Trainingsets
y = df[['RainTomorrow']] # Target Variable

In [236]:
#Random Forest Classifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time
t0=time.time()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)
clf_rf = RandomForestClassifier(n_estimators=101, max_depth=4,random_state=0)
clf_rf.fit(X_train,y_train)
y_pred = clf_rf.predict(X_test)
score = accuracy_score(y_test,y_pred)
print('Accuracy :',score)
print('Time taken :' , time.time()-t0)

  if __name__ == '__main__':


Accuracy : 0.7313310356146397
Time taken : 0.5311212539672852


In [133]:
#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

t0=time.time()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)
clf_dt = DecisionTreeClassifier(random_state=0)
clf_dt.fit(X_train,y_train)
y_pred = clf_dt.predict(X_test)
score = accuracy_score(y_test,y_pred)
print('Accuracy :',score)
print('Time taken :' , time.time()-t0)

Accuracy : 0.7798934850806157
Time taken : 0.1249690055847168


In [96]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

names = [ 'LogisticReg', 'RandomForest', 'DecisionTrree', 'Bernoulli', 'SVC']
models = [ LogisticRegression(), RandomForestClassifier(), DecisionTreeClassifier(), BernoulliNB(), SVC() ]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

for name, model in zip(names, models):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test,y_pred)
    print(name + ' Accuracy :',score)
    

  y = column_or_1d(y, warn=True)
  


LogisticReg Accuracy : 0.8374553148026556
RandomForest Accuracy : 0.8151309549865032
DecisionTrree Accuracy : 0.7771941343838914
Bernoulli Accuracy : 0.8022178448967681


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


SVC Accuracy : 0.8370905376814766


In [120]:
model = LogisticRegression(penalty='l2', dual=True, 
                           tol=0.1, C=1, 
                           intercept_scaling=1,
                           max_iter=100)


model.fit(X_train,y_train)
y_pred = model.predict(X_test)
score = accuracy_score(y_test,y_pred)
print(' Accuracy :',score)
    

 Accuracy : 0.8374553148026556


  y = column_or_1d(y, warn=True)
