In [114]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Step 1: Import the required libraries

In [115]:
# For linear algebra
import numpy as np
# For data processing
import pandas as pd

Step 2: Load the data set

In [116]:
#Load the data set
df = pd.read_csv('/content/gdrive/MyDrive/Artificial In Lab/Predict Rain Project/weatherAUS.csv')
#Display the shape of the data set
print('Size of weather data frame is :',df.shape)
#Display data
print(df[0:5])

Size of weather data frame is : (36881, 24)
        Date      Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0  5/18/2009        Hobart      5.1     14.3       0.0          1.8       8.9   
1   7/3/2009    Launceston      1.1     14.5       0.4          NaN       NaN   
2  2/18/2010   Williamtown     19.7     26.2       0.0          7.2       7.2   
3   3/4/2010  PerthAirport     16.6     28.0       0.0          9.0      11.3   
4   9/9/2010     GoldCoast     14.6     25.3       0.0          NaN       NaN   

  WindGustDir  WindGustSpeed WindDir9am  ... Humidity3pm  Pressure9am  \
0          NW           30.0        WSW  ...        47.0       1023.1   
1         SSW           50.0          E  ...        46.0       1001.5   
2         SSE           41.0        SSE  ...        50.0       1020.9   
3          SW           54.0        SSE  ...        41.0       1018.3   
4         NNW           43.0        WNW  ...        67.0       1020.3   

   Pressure3pm  Cloud9am  Clou

Step 3: Data Preprocessing

In [117]:
# Checking for null values
print(df.count().sort_values())

Sunshine         23317
Cloud3pm         23899
Evaporation      24035
Cloud9am         24381
Pressure9am      33309
Pressure3pm      33329
WindGustDir      33513
WindGustSpeed    33520
WindDir9am       34072
WindDir3pm       35919
WindSpeed9am     36219
WindSpeed3pm     36235
RainToday        36255
Rainfall         36255
RainTomorrow     36261
RISK_MM          36261
Humidity9am      36311
Humidity3pm      36370
Temp9am          36394
Temp3pm          36437
MinTemp          36543
MaxTemp          36639
Location         36881
Date             36881
dtype: int64


In [118]:
df = df.drop(columns=['Sunshine','Evaporation','Cloud3pm','Cloud9am','Location','RISK_MM','Date'],axis=1)
print(df.shape)

(36881, 17)


In [119]:
#Removing null values
df = df.dropna(how='any')
print(df.shape)

(28816, 17)


In [120]:
from scipy import stats
z = np.abs(stats.zscore(df._get_numeric_data()))
print(z)
df= df[(z < 3).all(axis=1)]
print(df.shape)

[[1.21193972e+00 1.31984217e+00 2.82689584e-01 ... 1.07104053e+00
  1.30495504e+00 1.27084599e+00]
 [1.85004136e+00 1.29075486e+00 2.39210196e-01 ... 1.75452445e+00
  2.53486404e+00 1.21152731e+00]
 [1.11713127e+00 4.10852679e-01 2.82689584e-01 ... 1.02822894e+00
  8.39501668e-01 3.75247359e-01]
 ...
 [1.24475160e+00 1.21075366e+00 2.82689584e-01 ... 8.12669455e-01
  1.31254359e+00 1.27985722e+00]
 [1.02050923e+00 1.09132924e-02 2.82689584e-01 ... 7.42818340e-01
  1.53886368e-01 1.43791085e-01]
 [5.25980455e-01 4.25396333e-01 2.82689584e-01 ... 7.50767762e-04
  1.06582176e-01 5.08714388e-01]]
(27472, 17)


In [121]:
#Change yes and no to 1 and 0 respectvely for RainToday and RainTomorrow variable
df['RainToday'].replace({'No': 0, 'Yes': '1'},inplace = True)
df['RainTomorrow'].replace({'No': 0, 'Yes': '1'},inplace = True)

In [122]:
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
scaler(df)
df = pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)
df.iloc[4:10]

TypeError: ignored

Step 4: Exploratory Data Analysis (EDA)

In [103]:
#Using SelectKBest to get the top features!
from sklearn.feature_selection import SelectKBest, chi2
X = df.loc[:,df.columns!='RainTomorrow']
y = df[['RainTomorrow']]
selector = SelectKBest(chi2, k=3)
selector.fit(X, y)
X_new = selector.transform(X)
print(X.columns[selector.get_support(indices=True)])

ValueError: ignored

The output gives us the three most significant predictor variables:

1. Humidity3pm

2. Rainfall

3. RainToday

The main aim of this demo is to make you understand how Machine Learning works, therefore, to simplify the computations we will assign only one of these significant variables as the input.

In [30]:
#The important features are put in a data frame
df = df[['Humidity3pm','Rainfall','RainToday','RainTomorrow']]
 
#To simplify computations we will use only one feature (Humidity3pm) to build the model
 
X = df[['Humidity3pm']]
y = df[['RainTomorrow']]

Step 5: Building a Machine Learning Model

Logistic Regression

In [31]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time
 
#Calculating the accuracy and the time taken by the classifier
t0=time.time()
#Data Splicing
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)
clf_logreg = LogisticRegression(random_state=0)
#Building the model using the training data set
clf_logreg.fit(X_train,y_train)
 
#Evaluating the model using testing data set
y_pred = clf_logreg.predict(X_test)
score = accuracy_score(y_test,y_pred)
 
#Printing the accuracy and the time taken by the classifier
print('Accuracy using Logistic Regression:',score)
print('Time taken using Logistic Regression:' , time.time()-t0)

Accuracy using Logistic Regression: 0.8245486313337216
Time taken using Logistic Regression: 0.05646228790283203


  y = column_or_1d(y, warn=True)


Random Forest Classifier

In [32]:
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
 
#Calculating the accuracy and the time taken by the classifier
t0=time.time()
#Data Splicing
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=4,random_state=0)
#Building the model using the training data set
clf_rf.fit(X_train,y_train)
 
#Evaluating the model using testing data set
y_pred = clf_rf.predict(X_test)
score = accuracy_score(y_test,y_pred)
 
#Printing the accuracy and the time taken by the classifier
print('Accuracy using Random Forest Classifier:',score)
print('Time taken using Random Forest Classifier:' , time.time()-t0)

  # This is added back by InteractiveShellApp.init_path()


Accuracy using Random Forest Classifier: 0.8287711124053582
Time taken using Random Forest Classifier: 0.6551532745361328


Decision Tree Classifier

In [24]:
#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
 
#Calculating the accuracy and the time taken by the classifier
t0=time.time()
#Data Splicing
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)
clf_dt = DecisionTreeClassifier(random_state=0)
#Building the model using the training data set
clf_dt.fit(X_train,y_train)
 
#Evaluating the model using testing data set
y_pred = clf_dt.predict(X_test)
score = accuracy_score(y_test,y_pred)
 
#Printing the accuracy and the time taken by the classifier
print('Accuracy using Decision Tree Classifier:',score)
print('Time taken using Decision Tree Classifier:' , time.time()-t0)

Accuracy using Decision Tree Classifier: 0.8249854397204426
Time taken using Decision Tree Classifier: 0.06140637397766113


Support Vector Machine

In [33]:
#Support Vector Machine
from sklearn import svm
from sklearn.model_selection import train_test_split
 
#Calculating the accuracy and the time taken by the classifier
t0=time.time()
#Data Splicing
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)
clf_svc = svm.SVC(kernel='linear')
 
#Building the model using the training data set
clf_svc.fit(X_train,y_train)
 
#Evaluating the model using testing data set
y_pred = clf_svc.predict(X_test)
score = accuracy_score(y_test,y_pred)
 
#Printing the accuracy and the time taken by the classifier
print('Accuracy using Support Vector Machine:',score)
print('Time taken using Support Vector Machine:' , time.time()-t0)

  y = column_or_1d(y, warn=True)


Accuracy using Support Vector Machine: 0.8283343040186372
Time taken using Support Vector Machine: 6.792050838470459


All the classification models give us an accuracy score of approximately 83-84 % except for Support Vector Machines. Considering the size of our data set, the accuracy is pretty good.