In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")
%matplotlib inline


from math import sqrt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

### Data

In [3]:
df_russia = pd.read_csv("russia_cleaned.csv")
df_russia.drop(columns=['Unnamed: 0'],inplace=True)
df_russia.head()

Unnamed: 0,Name,Date time,Maximum Temperature,Minimum Temperature,Temperature,Precipitation,Wind Speed,Visibility,Cloud Cover,Relative Humidity,Conditions,Clear,Overcast,Partially cloudy,Rain,"Rain, Overcast","Rain, Partially cloudy",Snow,"Snow, Overcast","Snow, Partially cloudy"
0,russia,01/01/1970,10.1,-2.1,4.7,0.0,13.9,6.2,48.3,84.54,Partially cloudy,0,0,1,0,0,0,0,0,0
1,russia,01/02/1970,19.1,-3.9,10.4,0.0,18.3,3.1,87.5,89.14,Overcast,0,1,0,0,0,0,0,0,0
2,russia,01/03/1970,28.1,20.9,25.5,0.0,18.3,3.7,92.5,91.09,Overcast,0,1,0,0,0,0,0,0,0
3,russia,01/04/1970,26.9,20.9,22.7,0.0,16.1,3.7,97.5,89.49,Overcast,0,1,0,0,0,0,0,0,0
4,russia,01/05/1970,32.1,28.1,29.5,0.0,16.1,2.7,100.0,92.43,Overcast,0,1,0,0,0,0,0,0,0


In [4]:
df_russia.shape

(18019, 20)

## K-NN Regression for Temperature

### Create Train and Test set

In [5]:
#Considering Visibilty, Cloud Cover and Humidity
X_temp=df_russia.drop(columns=['Maximum Temperature','Minimum Temperature','Temperature','Precipitation','Conditions',
                            'Name','Date time','Wind Speed'],axis=1)
Y_temp=df_russia.Temperature

In [6]:
from sklearn.model_selection import train_test_split
train_X_temp,test_X_temp,train_y_temp,test_y_temp = train_test_split(X_temp,Y_temp,test_size=0.4)

In [7]:
train_X_temp.head()

Unnamed: 0,Visibility,Cloud Cover,Relative Humidity,Clear,Overcast,Partially cloudy,Rain,"Rain, Overcast","Rain, Partially cloudy",Snow,"Snow, Overcast","Snow, Partially cloudy"
5212,6.3,77.3,71.27,0,1,0,0,0,0,0,0,0
7976,6.2,0.0,57.11,1,0,0,0,0,0,0,0,0
9846,6.4,46.8,62.03,0,0,1,0,0,0,0,0,0
6708,4.2,96.7,91.68,0,1,0,0,0,0,0,0,0
5510,6.0,9.7,67.77,1,0,0,0,0,0,0,0,0


### Preprocessing – Scaling the features

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
x_scaled= MinMaxScaler()
train_X_temp[['Cloud Cover','Relative Humidity','Visibility']] = x_scaled.fit_transform(train_X_temp[['Cloud Cover','Relative Humidity','Visibility']])

x_test_scaled= MinMaxScaler()
test_X_temp[['Cloud Cover','Relative Humidity','Visibility']]=x_test_scaled.fit_transform(test_X_temp[['Cloud Cover','Relative Humidity','Visibility']])

In [10]:
train_X_temp.shape

(10811, 12)

In [11]:
test_X_temp.shape

(7208, 12)

## Implementing RandomizedSearchCV

### For Decision Tree

In [12]:
#Note: Max_depth can be done
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
dc= DecisionTreeRegressor(random_state=1234)
params = {'max_features': ['auto', 'sqrt', 'log2'],
          'min_samples_split': [2,3,4,5,6,7,8,9,10,11,12,13,14,15], 
          'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10,11],
          'max_depth':[i for i in range(1,15)]}
#Making models with hyper parameters sets
dc = RandomizedSearchCV(dc, param_distributions=params,cv=10, n_jobs=-1)
#Learning
dc.fit(train_X_temp,train_y_temp)
#The best hyper parameters set
print("Best Hyper Parameters:",dc.best_params_)

Best Hyper Parameters: {'min_samples_split': 11, 'min_samples_leaf': 7, 'max_features': 'auto', 'max_depth': 8}


### For Knn

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
knn = KNeighborsRegressor()
params = {'n_neighbors':[i for i in range(5,20)],
         'leaf_size':[1,2,3,5],
          'weights':['uniform', 'distance'],
          'algorithm':['auto', 'ball_tree','kd_tree','brute'],
          'n_jobs':[-1]}


knn = RandomizedSearchCV(knn,param_distributions=params,cv=10)
knn.fit(train_X_temp,train_y_temp)
knn.best_params_

{'weights': 'uniform',
 'n_neighbors': 19,
 'n_jobs': -1,
 'leaf_size': 2,
 'algorithm': 'brute'}

### For SVM

In [14]:
from sklearn import svm
#making the instance
svm_r=svm.SVR()
#Hyper Parameters Set
params = {'C': [6,7,8,9,10,11,12], 
          'kernel': ['linear','rbf']}
#Making models with hyper parameters sets
svm_r = RandomizedSearchCV(svm_r, param_distributions=params,cv=10, n_jobs=-1)
#Learning
svm_r.fit(train_X_temp,train_y_temp)
#The best hyper parameters set
print("Best Hyper Parameters:\n",svm_r.best_params_)

Best Hyper Parameters:
 {'kernel': 'rbf', 'C': 12}


In [None]:
#Todo: Tuning For XGBoost