In [16]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")
%matplotlib inline


from math import sqrt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

### Data

In [17]:
df_aus = pd.read_csv("australia_cleaned.csv")
df_aus.drop(columns=['Unnamed: 0'],inplace=True)
df_aus.head()

Unnamed: 0,Name,Date time,Maximum Temperature,Minimum Temperature,Temperature,Precipitation,Wind Speed,Visibility,Cloud Cover,Relative Humidity,Clear,Overcast,Partially cloudy,Rain,"Rain, Overcast","Rain, Partially cloudy",Snow,"Snow, Partially cloudy"
0,australia,01/01/1970,62.3,55.1,59.9,0.0,19.5,16.560013,97.5,82.74,0,1,0,0,0,0,0,0
1,australia,01/02/1970,53.9,50.3,51.9,0.0,18.3,16.560013,93.8,78.11,0,1,0,0,0,0,0,0
2,australia,01/03/1970,56.0,48.8,52.3,0.0,15.0,16.560013,100.0,85.87,0,1,0,0,0,0,0,0
3,australia,01/04/1970,64.1,51.2,56.3,0.0,16.1,16.560013,91.3,83.15,0,1,0,0,0,0,0,0
4,australia,01/05/1970,72.2,50.3,61.4,0.0,11.4,16.560013,75.0,71.98,0,0,1,0,0,0,0,0


In [18]:
df_aus.shape

(18570, 18)

## K-NN Regression for Temperature

### Create Train and Test set

In [4]:
#considering only relative humidity and windspeed
X_temp=df_aus.drop(columns=['Maximum Temperature','Minimum Temperature','Temperature','Precipitation',
                            'Name','Date time','Cloud Cover','Visibility'],axis=1)
Y_temp=df_aus.Temperature

In [5]:
from sklearn.model_selection import train_test_split
train_X_temp,test_X_temp,train_y_temp,test_y_temp = train_test_split(X_temp,Y_temp,test_size=0.35,shuffle=False)

In [6]:
train_X_temp.head()

Unnamed: 0,Wind Speed,Relative Humidity,Clear,Overcast,Partially cloudy,Rain,"Rain, Overcast","Rain, Partially cloudy",Snow,"Snow, Partially cloudy"
0,19.5,82.74,0,1,0,0,0,0,0,0
1,18.3,78.11,0,1,0,0,0,0,0,0
2,15.0,85.87,0,1,0,0,0,0,0,0
3,16.1,83.15,0,1,0,0,0,0,0,0
4,11.4,71.98,0,0,1,0,0,0,0,0


### Preprocessing – Scaling the features

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
x_scaled= MinMaxScaler()
train_X_temp[['Wind Speed','Relative Humidity']] = x_scaled.fit_transform(train_X_temp[['Wind Speed','Relative Humidity']])

x_test_scaled= MinMaxScaler()
test_X_temp[['Wind Speed','Relative Humidity']]=x_test_scaled.fit_transform(test_X_temp[['Wind Speed','Relative Humidity']])

In [8]:
train_X_temp.shape

(12070, 10)

In [9]:
test_X_temp.shape

(6500, 10)

## Implementing RandomizedSearchCV

### For Decision Tree

In [10]:
#Note: Max_depth can be done
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
dc= DecisionTreeRegressor(random_state=1234)
params = {'max_features': ['auto', 'sqrt', 'log2'],
          'min_samples_split': [2,3,4,5,6,7,8,9,10,11,12,13,14,15], 
          'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10,11],
          'max_depth':[i for i in range(1,15)]}
#Making models with hyper parameters sets
dc = RandomizedSearchCV(dc, param_distributions=params,cv=8, n_jobs=-1)
#Learning
dc.fit(train_X_temp,train_y_temp)
#The best hyper parameters set
print("Best Hyper Parameters:",dc.best_params_)

Best Hyper Parameters: {'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 7}


### For Knn

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
knn = KNeighborsRegressor()
params = {'n_neighbors':[i for i in range(5,20)],
         'leaf_size':[1,2,3,5],
          'weights':['uniform', 'distance'],
          'algorithm':['auto', 'ball_tree','kd_tree','brute'],
          'n_jobs':[-1]}


knn = RandomizedSearchCV(knn,param_distributions=params,cv=8)
knn.fit(train_X_temp,train_y_temp)
knn.best_params_

{'weights': 'uniform',
 'n_neighbors': 19,
 'n_jobs': -1,
 'leaf_size': 3,
 'algorithm': 'ball_tree'}

### For SVM

In [18]:
from sklearn import svm
#making the instance
svm_r=svm.SVR()
#Hyper Parameters Set
params = {'C': [6,7,8,9,10,11,12], 
          'kernel': ['linear','rbf']}
#Making models with hyper parameters sets
svm_r = RandomizedSearchCV(svm_r, param_distributions=params,cv=10, n_jobs=-1)
#Learning
svm_r.fit(train_X_temp,train_y_temp)
#The best hyper parameters set
print("Best Hyper Parameters:\n",svm_r.best_params_)

Best Hyper Parameters:
 {'kernel': 'rbf', 'C': 12}
