#Importing Libraries


In [2]:
import numpy as np
import pandas as pd

#Importing Dataset

In [3]:
dataset=pd.read_csv("/content/weatherAUS.csv")
X=dataset.iloc[:,[1,2,3,4,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21]].values
Y=dataset.iloc[:,-1].values

In [4]:
print(X)

[['Albury' 13.4 22.9 ... 16.9 21.8 'No']
 ['Albury' 7.4 25.1 ... 17.2 24.3 'No']
 ['Albury' 12.9 25.7 ... 21.0 23.2 'No']
 ...
 ['Uluru' 5.4 26.9 ... 12.5 26.1 'No']
 ['Uluru' 7.8 27.0 ... 15.1 26.0 'No']
 ['Uluru' 14.9 nan ... 15.0 20.9 'No']]


In [5]:
print(Y)

['No' 'No' 'No' ... 'No' 'No' nan]


In [6]:
Y=Y.reshape(-1,1) #to change 1D to 2D List

In [7]:
print(Y)

[['No']
 ['No']
 ['No']
 ...
 ['No']
 ['No']
 [nan]]


#Dealing with Invalid Data

In [8]:
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(missing_values=np.nan,strategy='most_frequent')
X=imputer.fit_transform(X)
Y=imputer.fit_transform(Y)

In [9]:
print(X)

[['Albury' 13.4 22.9 ... 16.9 21.8 'No']
 ['Albury' 7.4 25.1 ... 17.2 24.3 'No']
 ['Albury' 12.9 25.7 ... 21.0 23.2 'No']
 ...
 ['Uluru' 5.4 26.9 ... 12.5 26.1 'No']
 ['Uluru' 7.8 27.0 ... 15.1 26.0 'No']
 ['Uluru' 14.9 20.0 ... 15.0 20.9 'No']]


In [10]:
print(Y)

[['No']
 ['No']
 ['No']
 ...
 ['No']
 ['No']
 ['No']]


#Convert String data to Numerical data / Encoding Dataset

In [11]:
from sklearn.preprocessing import LabelEncoder
level1=LabelEncoder()  #for Location
X[:,0]=level1.fit_transform(X[:,0])
level2=LabelEncoder()  #for wind direction
X[:,4]=level2.fit_transform(X[:,4])  
level3=LabelEncoder()  #for wind direction at 9 am
X[:,6]=level3.fit_transform(X[:,6])
level4=LabelEncoder()  #for wind direction at 3 pm
X[:,7]=level4.fit_transform(X[:,7])  
level5=LabelEncoder()  #for rain today
X[:,-1]=level5.fit_transform(X[:,-1])
level6=LabelEncoder()  #for rain tomorrow
Y=level6.fit_transform(Y)  

  y = column_or_1d(y, warn=True)


In [12]:
print(X)

[[2 13.4 22.9 ... 16.9 21.8 0]
 [2 7.4 25.1 ... 17.2 24.3 0]
 [2 12.9 25.7 ... 21.0 23.2 0]
 ...
 [41 5.4 26.9 ... 12.5 26.1 0]
 [41 7.8 27.0 ... 15.1 26.0 0]
 [41 14.9 20.0 ... 15.0 20.9 0]]


In [13]:
print(Y) #no to 0 and yes to 1

[0 0 0 ... 0 0 0]


#Feature Scaling - for setting values to a specific range

In [14]:
from sklearn.preprocessing import StandardScaler  #range is -3 to +3
sc = StandardScaler()
X=sc.fit_transform(X)

In [15]:
print(X)

[[-1.53166617  0.19132753 -0.04135977 ... -0.01407077  0.02310362
  -0.52979545]
 [-1.53166617 -0.75105231  0.26874452 ...  0.03244663  0.387799
  -0.52979545]
 [-1.53166617  0.11279588  0.35331842 ...  0.62166712  0.22733303
  -0.52979545]
 ...
 [ 1.20928479 -1.06517892  0.52246622 ... -0.69632607  0.65037966
  -0.52979545]
 [ 1.20928479 -0.68822699  0.53656187 ... -0.29317521  0.63579185
  -0.52979545]
 [ 1.20928479  0.42692249 -0.45013361 ... -0.30868102 -0.10818671
  -0.52979545]]


#Splitting Dataset into Training Set and Test set 

In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=0)  #20% for testing data and 80% for training data
#X_train = independent variable of training data
#Y_train = dependent variable of training data
#X_test = independent variable of testing data
#Y_test = dependent variable of testing data

In [17]:
print(X_train)

[[ 0.22535368  1.03946939  0.07140543 ...  0.68369032  0.08145488
  -0.52979545]
 [ 1.42012717 -0.45263203  0.11369237 ... -0.41722163  0.22733303
  -0.52979545]
 [ 0.50647685 -0.20133073 -0.14002932 ... -0.06058818 -0.02065982
   1.88752093]
 ...
 [ 1.0687232   0.75675544  0.93124006 ...  1.10234698  1.07342629
  -0.52979545]
 [ 0.57675765 -0.04426743 -0.16822062 ...  0.01694083 -0.28324049
   1.88752093]
 [ 1.63096955 -0.0285611  -0.91529006 ... -0.35519842 -0.76463838
  -0.52979545]]


In [18]:
print(Y_train)

[1 0 0 ... 0 0 0]


#Training Model

In [19]:
from sklearn.ensemble import RandomForestClassifier  #here it has classification problem so using this class for discrete values
classifier=RandomForestClassifier(n_estimators=100,random_state=0)
classifier.fit(X_train,Y_train)

In [20]:
classifier.score(X_train,Y_train) #99% data is trained

0.9999312525780283

In [23]:
print(Y_test)

[1 1 0 ... 1 0 0]


In [21]:
y_pred=classifier.predict(X_test)

In [22]:
print(y_pred) 

[0 0 0 ... 0 0 0]


In [25]:
y_pred=level6.inverse_transform(y_pred)  #we want it in form of yes or no so we have to reverse the process

In [26]:
print(y_pred)

['No' 'No' 'No' ... 'No' 'No' 'No']


In [27]:
print(Y_test)

[[1]
 [1]
 [0]
 ...
 [1]
 [0]
 [0]]


In [28]:
Y_test = level6.inverse_transform(Y_test)

  y = column_or_1d(y, warn=True)


In [29]:
print(Y_test)

['Yes' 'Yes' 'No' ... 'Yes' 'No' 'No']


In [30]:
Y_test=Y_test.reshape(-1,1)
y_pred=y_pred.reshape(-1,1)

In [33]:
df = np.concatenate((Y_test,y_pred),axis=1) #axis = 1 means it concatenates vertically
dataframe = pd.DataFrame(df,columns=['Rain on Tomorrow','Prediction of Rain'])

In [35]:
print(df) #1 is Y_test and 2nd is y_pred

[['Yes' 'No']
 ['Yes' 'No']
 ['No' 'No']
 ...
 ['Yes' 'No']
 ['No' 'No']
 ['No' 'No']]


In [34]:
print(dataframe) #as in output some predictions are correct(3,5) and some are wrong(1,2)

      Rain on Tomorrow Prediction of Rain
0                  Yes                 No
1                  Yes                 No
2                   No                 No
3                   No                Yes
4                   No                 No
...                ...                ...
29087               No                Yes
29088               No                 No
29089              Yes                 No
29090               No                 No
29091               No                 No

[29092 rows x 2 columns]


#Calculating Accuracy

In [36]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test,y_pred) #to increase accuracy change n_estimators to a higher value(150,200)

0.8521930427608965

In [37]:
dataframe.to_csv('prediction.csv')