In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")
%matplotlib inline


from math import sqrt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

### Data

In [3]:
df_aus = pd.read_csv("Usa_cleaned.csv.")
df_aus.drop(columns=['Unnamed: 0'],inplace=True)
df_aus.head()

Unnamed: 0,Name,Date time,Maximum Temperature,Minimum Temperature,Temperature,Precipitation,Wind Speed,Visibility,Cloud Cover,Relative Humidity,Conditions,Clear,Overcast,Partially cloudy,Rain,"Rain, Overcast","Rain, Partially cloudy",Snow,"Snow, Overcast","Snow, Partially cloudy"
0,United States,01/01/1970,35.0,28.1,32.0,0.0,13.9,12.8,77.9,61.54,Overcast,0,1,0,0,0,0,0,0,0
1,United States,01/02/1970,41.1,25.1,33.2,0.0,17.2,10.0,37.5,59.52,Partially cloudy,0,0,1,0,0,0,0,0,0
2,United States,01/03/1970,38.0,26.9,33.6,0.05,17.2,10.5,73.3,67.34,"Rain, Partially cloudy",0,0,0,0,0,1,0,0,0
3,United States,01/04/1970,33.2,21.8,27.0,0.0,17.2,9.5,0.0,51.89,Clear,1,0,0,0,0,0,0,0,0
4,United States,01/05/1970,44.0,20.0,30.2,0.0,15.0,9.3,47.1,65.6,Partially cloudy,0,0,1,0,0,0,0,0,0


In [4]:
df_aus.shape

(18545, 20)

## K-NN Regression for Temperature

### Create Train and Test set

In [8]:
#considering only Visibility,relative humidity and windspeed
X_temp=df_aus.drop(columns=['Maximum Temperature','Minimum Temperature','Temperature','Precipitation',
                            'Name','Date time','Cloud Cover','Conditions'],axis=1)
Y_temp=df_aus.Temperature

In [9]:
from sklearn.model_selection import train_test_split
train_X_temp,test_X_temp,train_y_temp,test_y_temp = train_test_split(X_temp,Y_temp,test_size=0.35,shuffle=False)

In [10]:
train_X_temp.head()

Unnamed: 0,Wind Speed,Visibility,Relative Humidity,Clear,Overcast,Partially cloudy,Rain,"Rain, Overcast","Rain, Partially cloudy",Snow,"Snow, Overcast","Snow, Partially cloudy"
0,13.9,12.8,61.54,0,1,0,0,0,0,0,0,0
1,17.2,10.0,59.52,0,0,1,0,0,0,0,0,0
2,17.2,10.5,67.34,0,0,0,0,0,1,0,0,0
3,17.2,9.5,51.89,1,0,0,0,0,0,0,0,0
4,15.0,9.3,65.6,0,0,1,0,0,0,0,0,0


### Preprocessing – Scaling the features

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
x_scaled= MinMaxScaler()
train_X_temp[['Wind Speed','Relative Humidity','Visibility']] = x_scaled.fit_transform(train_X_temp[['Wind Speed','Relative Humidity','Visibility']])

x_test_scaled= MinMaxScaler()
test_X_temp[['Wind Speed','Relative Humidity','Visibility']]=x_test_scaled.fit_transform(test_X_temp[['Wind Speed','Relative Humidity','Visibility']])

In [12]:
train_X_temp.shape

(12054, 12)

In [13]:
test_X_temp.shape

(6491, 12)

## Implementing RandomizedSearchCV

### For Decision Tree

In [14]:
#Note: Max_depth can be done
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
dc= DecisionTreeRegressor(random_state=1234)
params = {'max_features': ['auto', 'sqrt', 'log2'],
          'min_samples_split': [2,3,4,5,6,7,8,9,10,11,12,13,14,15], 
          'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10,11],
          'max_depth':[i for i in range(1,15)]}
#Making models with hyper parameters sets
dc = RandomizedSearchCV(dc, param_distributions=params,cv=8, n_jobs=-1)
#Learning
dc.fit(train_X_temp,train_y_temp)
#The best hyper parameters set
print("Best Hyper Parameters:",dc.best_params_)

Best Hyper Parameters: {'min_samples_split': 9, 'min_samples_leaf': 10, 'max_features': 'auto', 'max_depth': 9}


### For Knn

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
knn = KNeighborsRegressor()
params = {'n_neighbors':[i for i in range(5,20)],
         'leaf_size':[1,2,3,5],
          'weights':['uniform', 'distance'],
          'algorithm':['auto', 'ball_tree','kd_tree','brute'],
          'n_jobs':[-1]}


knn = RandomizedSearchCV(knn,param_distributions=params,cv=8)
knn.fit(train_X_temp,train_y_temp)
knn.best_params_

{'weights': 'uniform',
 'n_neighbors': 18,
 'n_jobs': -1,
 'leaf_size': 1,
 'algorithm': 'brute'}

### For SVM

In [16]:
from sklearn import svm
#making the instance
svm_r=svm.SVR()
#Hyper Parameters Set
params = {'C': [6,7,8,9,10,11,12], 
          'kernel': ['linear','rbf']}
#Making models with hyper parameters sets
svm_r = RandomizedSearchCV(svm_r, param_distributions=params,cv=10, n_jobs=-1)
#Learning
svm_r.fit(train_X_temp,train_y_temp)
#The best hyper parameters set
print("Best Hyper Parameters:\n",svm_r.best_params_)

Best Hyper Parameters:
 {'kernel': 'rbf', 'C': 12}
