<a href="https://colab.research.google.com/github/Mayank-004/Google-Trend-Analysis/blob/main/KNN_2D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 2D KNN Prediction

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
from patsy import dmatrices
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score,accuracy_score
import seaborn as sns
import random
import time
random.seed(786)
from sklearn.metrics import mean_squared_error

In [8]:
# Load the lagged Google-Trends series.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is its replacement and keeps the old behaviour of
# skipping malformed rows while emitting a warning for each one.
dataset = pd.read_csv('multiTimeline_2D.csv', sep=',', on_bad_lines='warn')


In [10]:
# Preview the first five rows (head() defaults to n=5).
dataset.head()

Unnamed: 0,x_t0,x_t1,x_t2
0,69,66,61
1,66,61,57
2,61,57,51
3,57,51,49
4,51,49,47


In [None]:
# Count missing values per column, both as a total and as a percentage of rows.
missing = dataset.isna().sum().sort_values(ascending=False)
percent_missing = (
    missing.div(dataset.isnull().count())
    .mul(100)
    .sort_values(ascending=False)
)
missing_df = pd.concat(
    [missing, percent_missing],
    axis=1,
    keys=['Total', 'Percent'],
    sort=False,
)
# Display only the columns that have at least one missing value.
missing_df.loc[missing_df['Total'] >= 1]

In [None]:
# Discard every row that contains a missing value; the index is not reset.
dataset = dataset.dropna()

In [3]:
# Features are every column except the last; the target is the last column.
# NOTE(review): the preview of `dataset` shows an 'Unnamed: 0' row-index
# column, which this slice keeps as a feature - confirm that is intended.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [4]:
# Hold out the final row as the single test observation; train on all the
# rows before it.
X_train, X_test = X.iloc[:-1], X.iloc[-1:]
y_train, y_test = y.iloc[:-1], y.iloc[-1:]

In [5]:

## defining a reusable function to evaluate model performance

def eval_result(model_name,y_train,y_predtrain,y_test,y_predtest,n_neighbors):
    """Print the train and test RMSE for one value of k.

    Parameters
    ----------
    model_name
        Not used by this function; kept so existing calls keep working.
    y_train, y_predtrain
        True and predicted targets for the training rows.
    y_test, y_predtest
        True and predicted targets for the test rows.
    n_neighbors
        The k value; only used to label the printed lines.

    Returns
    -------
    None
        The two RMSE values are printed, not returned.
    """
    # The `squared=False` flag of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6. Taking the square root of the MSE
    # gives the same RMSE and works on every scikit-learn version.
    rmse_train = np.sqrt(mean_squared_error(y_train, y_predtrain))
    print( f"RMSE Train error for k= {n_neighbors} is {rmse_train}")
    rmse_test = np.sqrt(mean_squared_error(y_test, y_predtest))
    print( f"RMSE Test error for k= {n_neighbors} is {rmse_test}")

In [6]:
# reusable function to calculate run time for model training

def run_model(model, X=None, y=None):
    """Fit `model` and print how long the fit took.

    Parameters
    ----------
    model
        Any object exposing ``fit(X, y)``.
    X, y
        Training features and targets. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, which is what existing
        one-argument calls rely on.

    Returns
    -------
    None
        The model is fitted through its ``fit`` method; the elapsed time in
        seconds is printed.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    # perf_counter is the clock intended for measuring short intervals;
    # time.time() follows the wall clock and can jump if it is adjusted.
    t0 = time.perf_counter()
    model.fit(X, y)
    time_taken = time.perf_counter() - t0
    print(f'Time taken: {time_taken}')

# Model Building

In [None]:
# p = 2 selects the Euclidean distance.
# NOTE(review): with weights='distance' a training row that is its own
# zero-distance neighbour dominates its own prediction, so the train RMSE is
# likely not a meaningful measure of fit - confirm before relying on it.
for n_neighbors in range(1, 4):
    model1 = KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights='distance',
        algorithm='auto',
        p=2,
    )
    run_model(model1)  # fits the model and prints the fit time

    y_hat_train = model1.predict(X_train)
    y_hat_test = model1.predict(X_test)
    print(f'Prediction for K= {n_neighbors} is {y_hat_test}')
    print(f'Actual for K= {n_neighbors} is {y_test.values}')

    # The test prediction is handed to eval_result as a one-column DataFrame.
    y_hat_test = pd.DataFrame(y_hat_test)
    eval_result(model1, y_train, y_hat_train, y_test, y_hat_test, n_neighbors)

Time taken: 0.0019812583923339844
Prediction for K= 1 is [89.]
Actual for K= 1 is [88]
RMSE Train error for k= 1 is 0.8384690232980005
RMSE Test error for k= 1 is 1.0
Time taken: 0.0006394386291503906
Prediction for K= 2 is [91.92302628]
Actual for K= 2 is [88]
RMSE Train error for k= 2 is 0.6973977605094789
RMSE Test error for k= 2 is 3.923026282451332
Time taken: 0.0007379055023193359
Prediction for K= 3 is [88.92665665]
Actual for K= 3 is [88]
RMSE Train error for k= 3 is 0.6669191441105471
RMSE Test error for k= 3 is 0.9266566498721431


In [None]:
# p = 1 selects the Manhattan (city-block) distance.
# NOTE(review): with weights='distance' a training row that is its own
# zero-distance neighbour dominates its own prediction, so the train RMSE is
# likely not a meaningful measure of fit - confirm before relying on it.
for n_neighbors in range(1, 4):
    model1 = KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights='distance',
        algorithm='auto',
        p=1,
    )
    run_model(model1)  # fits the model and prints the fit time

    y_hat_train = model1.predict(X_train)
    y_hat_test = model1.predict(X_test)
    print(f'Prediction for K= {n_neighbors} is {y_hat_test}')
    print(f'Actual for K= {n_neighbors} is {y_test.values}')

    # The test prediction is handed to eval_result as a one-column DataFrame.
    y_hat_test = pd.DataFrame(y_hat_test)
    eval_result(model1, y_train, y_hat_train, y_test, y_hat_test, n_neighbors)

Time taken: 0.0031976699829101562
Prediction for K= 1 is [89.]
Actual for K= 1 is [88]
RMSE Train error for k= 1 is 0.8384690232980005
RMSE Test error for k= 1 is 1.0
Time taken: 0.0006520748138427734
Prediction for K= 2 is [91.625]
Actual for K= 2 is [88]
RMSE Train error for k= 2 is 0.6973977605094789
RMSE Test error for k= 2 is 3.625
Time taken: 0.0006926059722900391
Prediction for K= 3 is [88.9058296]
Actual for K= 3 is [88]
RMSE Train error for k= 3 is 0.6669191441105471
RMSE Test error for k= 3 is 0.9058295964125591


In [None]:
# p = 3 is the Minkowski distance with exponent 3 (p = 1 is Manhattan, p = 2 is Euclidean)

# NOTE(review): with weights='distance' a training row that is its own
# zero-distance neighbour dominates its own prediction, so the train RMSE is
# likely not a meaningful measure of fit - confirm before relying on it.
for n_neighbors in range(1, 4):
    model1 = KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights='distance',
        algorithm='auto',
        p=3,
    )
    run_model(model1)  # fits the model and prints the fit time

    y_hat_train = model1.predict(X_train)
    y_hat_test = model1.predict(X_test)
    print(f'Prediction for K= {n_neighbors} is {y_hat_test}')
    print(f'Actual for K= {n_neighbors} is {y_test.values}')

    # Unlike the other metric cells, the raw prediction array is passed as-is.
    eval_result(model1, y_train, y_hat_train, y_test, y_hat_test, n_neighbors)

Time taken: 0.0011560916900634766
Prediction for K= 1 is [95.]
Actual for K= 1 is [88]
RMSE Train error for k= 1 is 0.8384690232980005
RMSE Test error for k= 1 is 7.0
Time taken: 0.0006110668182373047
Prediction for K= 2 is [92.06906297]
Actual for K= 2 is [88]
RMSE Train error for k= 2 is 0.6973977605094789
RMSE Test error for k= 2 is 4.069062965020564
Time taken: 0.0006155967712402344
Prediction for K= 3 is [88.95041407]
Actual for K= 3 is [88]
RMSE Train error for k= 3 is 0.6669191441105471
RMSE Test error for k= 3 is 0.950414067241411
Time taken: 0.0011301040649414062
Prediction for K= 4 is [90.26529231]
Actual for K= 4 is [88]
RMSE Train error for k= 4 is 0.6578968633671107
RMSE Test error for k= 4 is 2.2652923131399376
Time taken: 0.0007827281951904297
Prediction for K= 5 is [89.18788547]
Actual for K= 5 is [88]
RMSE Train error for k= 5 is 0.6521138897865351
RMSE Test error for k= 5 is 1.1878854694621452
Time taken: 0.0006630420684814453
Prediction for K= 6 is [89.69181979]
Actu