**Importing libraries**

In [100]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

**import dataset**

In [101]:
dataset=pd.read_csv('weatherAUS.csv')
X=dataset.iloc[:,[1,2,3,4,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21]].values
# here :  represent all the rows ,  numbers represents the columns
Y=dataset.iloc[:,-1].values 
# here this command returns one dimensional array
# Y=dataset.iloc[:,22:].values
# here this command returns two dimensional array

In [102]:
X

array([['Albury', 13.4, 22.9, ..., 16.9, 21.8, 'No'],
       ['Albury', 7.4, 25.1, ..., 17.2, 24.3, 'No'],
       ['Albury', 12.9, 25.7, ..., 21.0, 23.2, 'No'],
       ...,
       ['Uluru', 5.4, 26.9, ..., 12.5, 26.1, 'No'],
       ['Uluru', 7.8, 27.0, ..., 15.1, 26.0, 'No'],
       ['Uluru', 14.9, nan, ..., 15.0, 20.9, 'No']], dtype=object)

In [103]:
Y

array(['No', 'No', 'No', ..., 'No', 'No', nan], dtype=object)

In [104]:
Y=Y.reshape(-1,1)
# Here this command convert one dimensional array to two dimensional array.Because train test splitting algorithm only accept 2 dimensional lists
Y

array([['No'],
       ['No'],
       ['No'],
       ...,
       ['No'],
       ['No'],
       [nan]], dtype=object)

**Cleaning and preprocessing the dataset**

In [105]:
imputer =SimpleImputer(missing_values=np.nan,strategy='most_frequent')
# here for all the missing values we substitute most most_frequent(mode) value of each column
X=imputer.fit_transform(X)
Y=imputer.fit_transform(Y)

**Label encoding**

In [106]:
encoder=LabelEncoder()

X[:,0]=encoder.fit_transform(X[:,0])
X[:,4]=encoder.fit_transform(X[:,4])
X[:,6]=encoder.fit_transform(X[:,6])
X[:,7]=encoder.fit_transform(X[:,7])
X[:,-1]=encoder.fit_transform(X[:,-1])
encoder2=LabelEncoder()
Y=encoder2.fit_transform(Y)

  y = column_or_1d(y, warn=True)


**Feature scaling** - feature scaling done for minimize the data range/scale.So we can analyze the data within range(-3 to +3)

In [107]:
sc=StandardScaler()
X=sc.fit_transform(X)
X

array([[-1.53166617,  0.19132753, -0.04135977, ..., -0.01407077,
         0.02310362, -0.52979545],
       [-1.53166617, -0.75105231,  0.26874452, ...,  0.03244663,
         0.387799  , -0.52979545],
       [-1.53166617,  0.11279588,  0.35331842, ...,  0.62166712,
         0.22733303, -0.52979545],
       ...,
       [ 1.20928479, -1.06517892,  0.52246622, ..., -0.69632607,
         0.65037966, -0.52979545],
       [ 1.20928479, -0.68822699,  0.53656187, ..., -0.29317521,
         0.63579185, -0.52979545],
       [ 1.20928479,  0.42692249, -0.45013361, ..., -0.30868102,
        -0.10818671, -0.52979545]])

**Splitting data to training and testing**

In [108]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=0) 

In [109]:
X_train

array([[ 0.22535368,  1.03946939,  0.07140543, ...,  0.68369032,
         0.08145488, -0.52979545],
       [ 1.42012717, -0.45263203,  0.11369237, ..., -0.41722163,
         0.22733303, -0.52979545],
       [ 0.50647685, -0.20133073, -0.14002932, ..., -0.06058818,
        -0.02065982,  1.88752093],
       ...,
       [ 1.0687232 ,  0.75675544,  0.93124006, ...,  1.10234698,
         1.07342629, -0.52979545],
       [ 0.57675765, -0.04426743, -0.16822062, ...,  0.01694083,
        -0.28324049,  1.88752093],
       [ 1.63096955, -0.0285611 , -0.91529006, ..., -0.35519842,
        -0.76463838, -0.52979545]])

**Lets train the model**  - Here we must try with several classificationtraining algorithms like SVM,decision tree,Random forest,etc.the best results were produced by random forest algorithm.So lets use it

In [110]:


model=RandomForestClassifier(n_estimators=100,random_state=0)
model.fit(X_train,Y_train)
model.score(X_train,Y_train)

0.9999312525780283

**Lets evaluate**

In [111]:
predictions=model.predict(X_test)
predictions
# score=accuracy_score(Y_test,predictions)
# score
print(predictions)




[0 0 0 ... 0 0 0]


In [112]:
print(Y_test)

[1 1 0 ... 1 0 0]


In [113]:
# To graphically compare yes no we will have to convert Y_test and predictions from 1/0 to yes/no again using label encoder function


predictions=encoder2.inverse_transform(predictions)
Y_test=encoder2.inverse_transform(Y_test)


In [114]:
print(predictions)

['No' 'No' 'No' ... 'No' 'No' 'No']


In [115]:
print(Y_test)

['Yes' 'Yes' 'No' ... 'Yes' 'No' 'No']


In [116]:
predictions=predictions.reshape(-1,1)
Y_test=Y_test.reshape(-1,1)
# here we convert 1D array to 2D array

In [117]:
print(predictions)

array([['No'],
       ['No'],
       ['No'],
       ...,
       ['No'],
       ['No'],
       ['No']], dtype=object)

In [118]:
print(Y_test)

[['Yes']
 ['Yes']
 ['No']
 ...
 ['Yes']
 ['No']
 ['No']]


In [120]:
df=np.concatenate((Y_test,predictions),axis=1)
print(df)

[['Yes' 'No']
 ['Yes' 'No']
 ['No' 'No']
 ...
 ['Yes' 'No']
 ['No' 'No']
 ['No' 'No']]


In [123]:
dataframe=pd.DataFrame(df,columns=['Rain on tommorow','Prediction on tommorow'])
print(dataframe)

      Rain on tommorow Prediction on tommorow
0                  Yes                     No
1                  Yes                     No
2                   No                     No
3                   No                    Yes
4                   No                     No
...                ...                    ...
29087               No                    Yes
29088               No                     No
29089              Yes                     No
29090               No                     No
29091               No                     No

[29092 rows x 2 columns]


In [124]:
score=accuracy_score(Y_test,predictions)
score

0.8521930427608965

**Download the results**

In [126]:
dataframe.to_csv('result.csv')