In [1]:
#Load the csv file as data frame.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

df = pd.read_csv('./weatherAUS.csv')
print('Size of weather data frame is :',df.shape)
#Let us see how our data looks like!
df[0:5]

Size of weather data frame is : (142193, 24)


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


In [2]:
# We see there are some columns with null values. 
# Before we start pre-processing, let's find out which of the columns have maximum null values
df.count().sort_values()

Sunshine          74377
Evaporation       81350
Cloud3pm          85099
Cloud9am          88536
Pressure9am      128179
Pressure3pm      128212
WindDir9am       132180
WindGustDir      132863
WindGustSpeed    132923
WindDir3pm       138415
Humidity3pm      138583
Temp3pm          139467
WindSpeed3pm     139563
Humidity9am      140419
RainToday        140787
Rainfall         140787
WindSpeed9am     140845
Temp9am          141289
MinTemp          141556
MaxTemp          141871
Date             142193
Location         142193
RISK_MM          142193
RainTomorrow     142193
dtype: int64

In [3]:
# Features selection: We suppose that wind doesnt contribute to precipitation, so as the location and RISK_MM, 
# since we only want to predict wether it rains tommorrow in australian. The date is not important since we already 
# have RainTomorrow as target variable
df = df.drop(columns=['WindDir9am','WindDir3pm','WindSpeed9am','WindSpeed3pm','WindGustDir','WindGustSpeed',
                      'Location','RISK_MM','Date'],axis=1)
df.shape

(142193, 15)

In [4]:
#Let us get rid of all null values in df
df = df.dropna(how='any')
df.shape

(61918, 15)

In [5]:
#its time to remove the outliers in our data - we are using Z-score to detect and remove the outliers.
from scipy import stats
z = np.abs(stats.zscore(df._get_numeric_data()))
print(z)
df= df[(z < 3).all(axis=1)]
print(df.shape)
df[0:5]

[[0.71673382 1.63191398 0.30770326 ... 0.2477695  1.32044026 1.62157427]
 [0.79521797 0.71755555 0.30770326 ... 1.26131885 0.35282645 0.67377743]
 [0.95218629 1.980241   0.30770326 ... 0.62504159 1.6429782  1.84371415]
 ...
 [1.15624509 1.28358696 0.30770326 ... 1.63859093 1.04397917 1.42905304]
 [0.96788312 1.1384507  0.30770326 ... 1.26131885 1.04397917 0.9995826 ]
 [1.07776093 1.12393707 0.30770326 ... 0.2477695  1.13613287 1.26615046]]
(59544, 15)


Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
5939,17.9,35.2,0.0,12.0,12.3,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,No,No
5940,18.4,28.9,0.0,14.8,13.0,30.0,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0,No,No
5942,19.4,37.6,0.0,10.8,10.6,42.0,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9,No,No
5943,21.9,38.4,0.0,11.4,12.2,37.0,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6,No,No
5944,24.2,41.0,0.0,11.2,8.4,19.0,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6,No,No


In [6]:
# Change categorical cloumns yes/no to 1/0 for RainToday and RainTomorrow
df['RainToday'].replace({'No': 0, 'Yes': 1},inplace = True)
df['RainTomorrow'].replace({'No': 0, 'Yes': 1},inplace = True)

In [7]:
# Standardize data - using MinMaxScaler
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
scaler.fit(df)
df = pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)
df[0:5]

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
5939,0.662857,0.767901,0.0,0.740741,0.848276,0.090909,0.121212,0.233173,0.245783,0.222222,0.555556,0.720317,0.763496,0.0,0.0
5940,0.677143,0.612346,0.0,0.91358,0.896552,0.204545,0.070707,0.391827,0.431325,0.111111,0.111111,0.55409,0.598972,0.0,0.0
5942,0.705714,0.82716,0.0,0.666667,0.731034,0.340909,0.212121,0.377404,0.361446,0.111111,0.666667,0.775726,0.802057,0.0,0.0
5943,0.777143,0.846914,0.0,0.703704,0.841379,0.284091,0.212121,0.387019,0.359036,0.111111,0.555556,0.78628,0.820051,0.0,0.0
5944,0.842857,0.911111,0.0,0.691358,0.57931,0.079545,0.141414,0.338942,0.318072,0.111111,0.666667,0.905013,0.871465,0.0,0.0


In [8]:
#now that we are done with the pre-processing part, let's see which are the important features for RainTomorrow!
#Using SelectKBest to get the top features!
from sklearn.feature_selection import SelectKBest,chi2,f_classif
X = df.loc[:,df.columns!='RainTomorrow']
y = df[['RainTomorrow']]
selector = SelectKBest(f_classif, k=5)
selector.fit(X, y)         # Run score function on (X, y) and get the appropriate features.
X_new = selector.transform(X) # Reduce X to the selected features. (numpy.ndarray)
print (selector.get_support(indices=True))
print(X.columns[selector.get_support(indices=True)]) #get_support Get a mask, or integer index, of the features selected

[ 4  6  9 10 13]
Index(['Sunshine', 'Humidity3pm', 'Cloud9am', 'Cloud3pm', 'RainToday'], dtype='object')


  y = column_or_1d(y, warn=True)


In [9]:
df = df[['Sunshine', 'Humidity3pm', 'Cloud9am', 'Cloud3pm', 'RainToday','RainTomorrow']] # rearrange columns
X = df[['Sunshine', 'Humidity3pm', 'Cloud9am', 'Cloud3pm', 'RainToday']] # Trainingsets
y = df[['RainTomorrow']] # Target Variable

In [11]:
#Random Forest Classifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time
t0=time.time()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=4,random_state=0)
clf_rf.fit(X_train,y_train)
y_pred = clf_rf.predict(X_test)
score = accuracy_score(y_test,y_pred)
print('Accuracy :',score)
print('Time taken :' , time.time()-t0)

  if __name__ == '__main__':


Accuracy : 0.836490662367325
Time taken : 1.1460011005401611


In [12]:
#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

t0=time.time()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)
clf_dt = DecisionTreeClassifier(random_state=0)
clf_dt.fit(X_train,y_train)
y_pred = clf_dt.predict(X_test)
score = accuracy_score(y_test,y_pred)
print('Accuracy :',score)
print('Time taken :' , time.time()-t0)

Accuracy : 0.7715302969232837
Time taken : 0.09802865982055664
