In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('AusDataForRainPred.csv')
vifdf = pd.read_csv('AusDataForRainPred.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [4]:
df.shape

(145460, 23)

In [5]:
df.isnull().mean()*100

Date              0.000000
Location          0.000000
MinTemp           1.020899
MaxTemp           0.866905
Rainfall          2.241853
Evaporation      43.166506
Sunshine         48.009762
WindGustDir       7.098859
WindGustSpeed     7.055548
WindDir9am        7.263853
WindDir3pm        2.906641
WindSpeed9am      1.214767
WindSpeed3pm      2.105046
Humidity9am       1.824557
Humidity3pm       3.098446
Pressure9am      10.356799
Pressure3pm      10.331363
Cloud9am         38.421559
Cloud3pm         40.807095
Temp9am           1.214767
Temp3pm           2.481094
RainToday         2.241853
RainTomorrow      2.245978
dtype: float64

 Removing all the null rows in the dataset

In [6]:
df = df.dropna()
vifdf = vifdf.dropna()

 We will convert the categorical columns into numerical columns by using map function and label encoder function

In [7]:
df['RainToday'] = df['RainToday'].map({'No':0,'Yes':1})
vifdf['RainToday'] = vifdf['RainToday'].map({'No':0,'Yes':1})

In [8]:
cat_cols = [i for i in df.columns if df[i].dtype=='O']

In [9]:
encoder = LabelEncoder()
for i in cat_cols:
    df[i] = encoder.fit_transform(df[i])
    vifdf[i] = encoder.fit_transform(vifdf[i])

In [10]:
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
6049,407,4,17.9,35.2,0.0,12.0,12.3,11,48.0,1,...,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,0,0
6050,408,4,18.4,28.9,0.0,14.8,13.0,8,37.0,10,...,30.0,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0,0,0
6052,410,4,19.4,37.6,0.0,10.8,10.6,5,46.0,5,...,42.0,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9,0,0
6053,411,4,21.9,38.4,0.0,11.4,12.2,14,31.0,14,...,37.0,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6,0,0
6054,412,4,24.2,41.0,0.0,11.2,8.4,14,35.0,7,...,19.0,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6,0,0


 Date column has no way relaated in predicting the rainfall so we can drop that column

In [11]:
df.drop('Date',inplace=True,axis=1)

Now we are entering the model phase, first we separate the data into dependent and independent variables or basically input columns and target column and then we divide the data for training and testing the model. 

In [12]:
input_col = df.drop('RainTomorrow',axis=1)
target_col = df['RainTomorrow']

x_train,x_test,y_train,y_test = train_test_split(input_col,target_col,random_state=100)

We build a classification model using Logistic regression library from sklearn on the training data. 

In [13]:
%time

reg = LogisticRegression()

#training the model
reg.fit(x_train,y_train)

#accuracy
print(str.format('Accuracy : {:.2f}%',reg.score(x_test,y_test)*100))


Wall time: 0 ns
Accuracy : 85.00%


 we will do feature selection. For this we use variance inflation factor from stats models library for numerical columns 

In [14]:
# importing variance inflation factor (IVF)
from statsmodels.stats.outliers_influence import variance_inflation_factor

cols = [cname for cname in vifdf.columns if vifdf[cname].dtype in ['int64', 'float64']]
data = vifdf[cols]

vif_data = pd.DataFrame()
vif_data['feature'] = data.columns

#calculating VIF for each feature
vif_data['VIF'] = [variance_inflation_factor(data.values,i) for i in range(len(data.columns))]

vif_data

Unnamed: 0,feature,VIF
0,MinTemp,58.284148
1,MaxTemp,609.736465
2,Rainfall,1.627144
3,Evaporation,7.220027
4,Sunshine,17.318912
5,WindGustSpeed,26.969533
6,WindSpeed9am,8.403752
7,WindSpeed3pm,13.776204
8,Humidity9am,61.146908
9,Humidity3pm,47.805246


From the results of variance inflation factor we can conclude that coloumns such as Min Temp,Max Temp,Humidity9am,Humidity3pm,pressure9am,pressure3pm and temp9am,temp3pm are having highest VIF values

Higher the vif more is the collinearity so the columns have greater corelation so we create new features with the the set of collinear features and drop the the original features

In [15]:
vifdf['Range'] = vifdf['MaxTemp'] - vifdf['MinTemp']
vifdf['Windspeed'] = vifdf['WindSpeed3pm'] - vifdf['WindSpeed9am']
vifdf['Humidity'] = vifdf['Humidity3pm'] - vifdf['Humidity9am']
vifdf['Pressure'] = vifdf['Pressure3pm'] - vifdf['Pressure9am']
vifdf['Temp'] = vifdf['Temp3pm'] - vifdf['Temp9am']
vifdf['cloud'] = vifdf['Cloud3pm'] - vifdf['Cloud9am']

vifdf.drop(columns=['MaxTemp','MinTemp','WindSpeed3pm','WindSpeed9am','Humidity3pm','Humidity9am','Pressure3pm','Pressure9am','Temp3pm','Temp9am','Cloud3pm','Cloud9am'],inplace=True,axis=1)

Rechecking with variation inflation factor as according to stats we basically drop columns with more VIF (generally around VIF > 5) since we dont need data to be dropped without a reason, we executed the above code for decreaing VIF

In [16]:
vif_data = pd.DataFrame()
vif_data['feature'] = data.columns

#calculating VIF for each feature
vif_data['VIF'] = [variance_inflation_factor(data.values,i) for i in range(len(data.columns))]

vif_data

Unnamed: 0,feature,VIF
0,MinTemp,58.284148
1,MaxTemp,609.736465
2,Rainfall,1.627144
3,Evaporation,7.220027
4,Sunshine,17.318912
5,WindGustSpeed,26.969533
6,WindSpeed9am,8.403752
7,WindSpeed3pm,13.776204
8,Humidity9am,61.146908
9,Humidity3pm,47.805246


Now we remove the columns with VIF greater than 5


In [17]:
#separating dependent and independent columns 1
inp_cols = vifdf.drop(columns=['Date','Sunshine','WindGustDir','WindGustSpeed','WindDir3pm','Range','Temp'])
tar_col = vifdf['RainTomorrow']

#divide the dataset into tarining and testing data
x_train,x_test,y_train,y_test = train_test_split(inp_cols,tar_col,random_state=100)


In [18]:
%time
#build a logistic regression model
log_reg = LogisticRegression()

#training the model
log_reg.fit(x_train,y_train)

print(str.format('Accuracy : {:.2f}%',log_reg.score(x_test,y_test)*100))

Wall time: 0 ns
Accuracy : 100.00%
