# Import Libraries 

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sb 
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LogisticRegression, LinearRegression
from sklearn.metrics import confusion_matrix
from collections import Counter
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from imageio import imread, imsave
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.autograd import Variable
import torch.utils.data as data
import torch.optim as optim

# Load Data

In [3]:
y_train = pd.read_csv("train_targets.csv")
del y_train['Id']
X_test = pd.read_csv("test_features.csv")
X_train = pd.read_csv("train_features.csv")
sample_submission = pd.read_csv("sample_submission.csv")


# EDA

In [4]:
X_train.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,24.0,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,22.0,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,26.0,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,9.0,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No


In [5]:
X_train.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
count,116503.0,116786.0,115778.0,66767.0,61668.0,108080.0,115797.0,114525.0,115439.0,114354.0,107166.0,107204.0,74211.0,71397.0,116215.0,115073.0
mean,11.976341,22.800344,2.390686,5.314173,7.479067,39.988444,14.029552,18.863122,70.057693,52.527319,1018.049026,1015.722416,4.558435,4.604297,16.634384,21.318172
std,6.22098,6.881268,8.55374,4.239007,3.812131,13.814264,9.119511,9.102227,18.344061,20.506504,6.998878,6.89073,2.873498,2.705032,6.254076,6.706825
min,-8.5,-4.8,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,980.5,979.0,0.0,0.0,-7.2,-5.4
25%,7.5,17.8,0.0,2.6,4.6,30.0,7.0,13.0,58.0,38.0,1013.4,1011.1,1.0,2.0,12.1,16.5
50%,11.9,22.4,0.0,4.4,8.3,39.0,13.0,19.0,71.0,53.0,1018.0,1015.7,5.0,5.0,16.5,20.9
75%,16.6,27.5,0.8,7.2,10.5,48.0,19.0,24.0,84.0,66.0,1022.7,1020.3,7.0,7.0,21.1,25.8
max,33.9,48.1,371.0,145.0,14.5,135.0,130.0,87.0,100.0,100.0,1041.0,1038.2,9.0,9.0,40.2,46.7


In [6]:
X_test.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,2015-09-27,PerthAirport,11.5,25.9,0.0,5.8,11.1,E,69.0,E,...,19.0,54.0,30.0,1023.0,1018.9,1.0,2.0,17.3,25.4,No
1,2015-09-28,PerthAirport,12.9,28.0,0.0,5.2,11.1,ENE,41.0,ENE,...,20.0,54.0,39.0,1017.3,1014.2,1.0,2.0,18.9,24.4,No
2,2015-09-29,PerthAirport,11.5,22.8,0.0,5.0,9.8,WSW,35.0,NNW,...,19.0,71.0,38.0,1018.2,1018.0,6.0,2.0,17.5,21.6,No
3,2015-09-30,PerthAirport,9.2,24.5,0.0,5.2,10.8,WSW,31.0,NNE,...,19.0,69.0,50.0,1024.7,1023.1,1.0,1.0,18.8,23.4,No
4,2015-10-01,PerthAirport,10.9,33.5,0.0,4.4,11.2,N,39.0,NE,...,19.0,44.0,25.0,1022.2,1019.4,0.0,0.0,23.1,27.6,No


# Feature Engineering

In [7]:
full_data = pd.concat([X_train, X_test]).reset_index(drop=True)
assert full_data.shape[0] == X_train.shape[0] + X_test.shape[0]

In [8]:
full_data.columns

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday'],
      dtype='object')

In [9]:
full_data = full_data.drop(columns=['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])

# Data Cleaning

In [10]:
full_data.isna().sum()

MinTemp            637
MaxTemp            322
Rainfall          1406
Evaporation      60843
Sunshine         67816
WindGustSpeed     9270
WindSpeed9am      1348
WindSpeed3pm      2630
Humidity9am       1774
Humidity3pm       3610
Pressure9am      14014
Pressure3pm      13981
Cloud9am         53657
Cloud3pm         57094
Temp9am            904
Temp3pm           2726
RainToday         1406
dtype: int64

In [11]:
numer_data = full_data.select_dtypes(include = ['float64']).columns
categor_data = full_data.select_dtypes(include = ['object']).columns

In [12]:
for c in numer_data:
    full_data[c] = full_data[c].fillna(full_data[c].mean())
for c in categor_data:
    full_data[c] = full_data[c].fillna(max(full_data[c].value_counts()))
full_data.head()


Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,13.4,22.9,0.6,5.469824,7.624853,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,4.503167,16.9,21.8,No
1,7.4,25.1,0.0,5.469824,7.624853,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,4.437189,4.503167,17.2,24.3,No
2,12.9,25.7,0.0,5.469824,7.624853,46.0,19.0,26.0,38.0,30.0,1007.6,1008.7,4.437189,2.0,21.0,23.2,No
3,9.2,28.0,0.0,5.469824,7.624853,24.0,11.0,9.0,45.0,16.0,1017.6,1012.8,4.437189,4.503167,18.1,26.5,No
4,17.5,32.3,1.0,5.469824,7.624853,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No


# One-Hot Econding

In [13]:
full_data = pd.get_dummies(full_data, columns = ['RainToday'])
full_data.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday_109332,RainToday_No,RainToday_Yes
0,13.4,22.9,0.6,5.469824,7.624853,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,4.503167,16.9,21.8,0,1,0
1,7.4,25.1,0.0,5.469824,7.624853,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,4.437189,4.503167,17.2,24.3,0,1,0
2,12.9,25.7,0.0,5.469824,7.624853,46.0,19.0,26.0,38.0,30.0,1007.6,1008.7,4.437189,2.0,21.0,23.2,0,1,0
3,9.2,28.0,0.0,5.469824,7.624853,24.0,11.0,9.0,45.0,16.0,1017.6,1012.8,4.437189,4.503167,18.1,26.5,0,1,0
4,17.5,32.3,1.0,5.469824,7.624853,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0,1,0


In [14]:
X_train = full_data[:X_train.shape[0]]
X_test = full_data[X_train.shape[0]:]
X_test

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday_109332,RainToday_No,RainToday_Yes
117069,11.5,25.9,0.0,5.800000,11.100000,69.0,46.0,19.0,54.0,30.0,1023.0,1018.9,1.000000,2.000000,17.3,25.4,0,1,0
117070,12.9,28.0,0.0,5.200000,11.100000,41.0,20.0,20.0,54.0,39.0,1017.3,1014.2,1.000000,2.000000,18.9,24.4,0,1,0
117071,11.5,22.8,0.0,5.000000,9.800000,35.0,6.0,19.0,71.0,38.0,1018.2,1018.0,6.000000,2.000000,17.5,21.6,0,1,0
117072,9.2,24.5,0.0,5.200000,10.800000,31.0,13.0,19.0,69.0,50.0,1024.7,1023.1,1.000000,1.000000,18.8,23.4,0,1,0
117073,10.9,33.5,0.0,4.400000,11.200000,39.0,17.0,19.0,44.0,25.0,1022.2,1019.4,0.000000,0.000000,23.1,27.6,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,3.5,21.8,0.0,5.469824,7.624853,31.0,15.0,13.0,59.0,27.0,1024.7,1021.2,4.437189,4.503167,9.4,20.9,0,1,0
142189,2.8,23.4,0.0,5.469824,7.624853,31.0,13.0,11.0,51.0,24.0,1024.6,1020.3,4.437189,4.503167,10.1,22.4,0,1,0
142190,3.6,25.3,0.0,5.469824,7.624853,22.0,13.0,9.0,56.0,21.0,1023.5,1019.1,4.437189,4.503167,10.9,24.5,0,1,0
142191,5.4,26.9,0.0,5.469824,7.624853,37.0,9.0,9.0,53.0,24.0,1021.0,1016.8,4.437189,4.503167,12.5,26.1,0,1,0


# Baseline Model

In [15]:
from sklearn.metrics import mean_absolute_error 
def evaluate(y_pred, y_true):
    return mean_absolute_error(y_pred, y_true)

In [15]:
##Random Forest
from sklearn.ensemble import RandomForestClassifier 

clf = RandomForestClassifier(max_depth = 100000, random_state = 0, verbose = 1)

clf.fit(X_train.to_numpy(), np.ravel(y_train.to_numpy()))
predictions = clf.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   55.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.6s finished


## NN

In [None]:
!pip3 install xgboost
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
model = xgb.XGBRFClassifier(n_estimators=100000, subsample=0.9, colsample_bynode=0.2)
model.fit(X_train.to_numpy(), np.ravel(y_train.to_numpy()))
predictions = model.predict(X_test)





## Cross-Validation

In [29]:
batch_size = 64
n_epochs = 10
batch_no = len(X_train) // batch_size

train_loss = 0
train_loss_min = np.Inf
for epoch in range(n_epochs):
    for i in range(batch_no):
        
        start = i*batch_size
        end = start+batch_size

        dfx, dfy = OurData(X_train)[start:end]
        
        x_var = Variable(torch.FloatTensor(dfx))
        y_var = Variable(torch.LongTensor(dfy)) 
        
       
        optimizer.zero_grad()
        output = model(x_var)
        loss = criterion(output,y_var)
        
        loss.backward()
        
        optimizer.step()
       

        values, labels = torch.max(output, 1)
        
        num_right = np.sum(labels.data.numpy() == dfy)
        train_loss += loss.item()*batch_size
    
    train_loss = train_loss / len(train)
    if train_loss <= train_loss_min:
        print("Training loss decreased ({:6f} ===> {:6f}). Saving the model...".format(train_loss_min,train_loss))
        torch.save(model.state_dict(), "model.pt")
        train_loss_min = train_loss
    

    if epoch % 5 == 0:
        print('')
        print("Epoch: {} \tTrain Loss: {} \tTrain Accuracy: {}".format(epoch+1, train_loss,num_right / len(dfy) ))
print('Training Ended! ')

ValueError: too many values to unpack (expected 2)

In [None]:
val_folds_x = {}
val_folds_y = {}
i=0
l=0
rows = np.arange(X_train.shape[0]//5, X_train.shape[0], (X_train.shape[0]//5))
y_train=y_train.iloc[:rows[-1]]
for r in rows:
    val_folds_x[i] = X_train.iloc[l:r, :]
    val_folds_y[i] = y_train.iloc[l:r, :]
    i+=1 
    l=r
rows

In [None]:
pred_list = []
for v in val_folds_x:
    clf = RandomForestClassifier(max_depth=10, random_state=0, verbose=1)
    clf.fit(val_folds_x[v].to_numpy(), np.ravel(val_folds_y[v].to_numpy()))
    pred_list.append(clf.predict(X_test))
pred_list

In [None]:
pred_list[1] == pred_list[4]

## Submission

In [17]:
assert predictions.shape[0] == 25124
sample_submission['RainTomorrow'] = predictions
sample_submission.to_csv('submission.csv', index = False)