<a href="https://colab.research.google.com/github/Mayank-004/Google-Trend-Analysis/blob/main/KNN_4D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 4D KNN Prediction

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
from patsy import dmatrices
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score,accuracy_score
import seaborn as sns
import random
import time
random.seed(786)
from sklearn.metrics import mean_squared_error

In [43]:
# Load the Google Trends lag table.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0, so the
# old call raises TypeError on current pandas. `on_bad_lines='warn'` (pandas >= 1.3)
# keeps the previous behaviour: malformed rows are skipped and a warning is emitted.
dataset= pd.read_csv('multiTimeline_4D.csv', on_bad_lines='warn', sep=',')


In [44]:
# Summarise missing values per column: absolute count and percentage of rows.
missing = dataset.isna().sum().sort_values(ascending=False)

# isnull().count() yields the number of rows for every column, i.e. the denominator.
n_rows_per_column = dataset.isnull().count()
percent_missing = missing.div(n_rows_per_column).mul(100).sort_values(ascending=False)

missing_df = pd.concat(
    {'Total': missing, 'Percent': percent_missing},
    axis=1,
    sort=False,
)

# Show only the columns that actually contain at least one missing value.
missing_df.loc[missing_df['Total'] >= 1]

Unnamed: 0,Total,Percent
x_t2,4,2.409639
x_t12,3,1.807229


In [45]:
# Discard every row that has at least one missing value; rebinding (instead of
# inplace=True) leaves `dataset` with the same cleaned contents.
dataset = dataset.dropna()

In [46]:
# Display the frame after incomplete rows were dropped (long frames are shown truncated).
dataset

Unnamed: 0,x_t0,x_t1,x_t11,x_t12,x_t2
0,69,66,61,57.0,51.0
1,66,61,57,51.0,49.0
2,61,57,51,49.0,47.0
3,57,51,49,47.0,46.0
4,51,49,47,46.0,43.0
...,...,...,...,...,...
157,31,25,24,26.0,28.0
158,25,24,26,28.0,43.0
159,24,26,28,43.0,85.0
160,26,28,43,85.0,96.0


In [47]:
# Every column except the last is a feature; the last column is the regression target.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [48]:
# Hold out the final row as the single test sample; all earlier rows are used for training.
X_train, X_test = X.iloc[:-1], X.iloc[-1:]
y_train, y_test = y.iloc[:-1], y.iloc[-1:]

In [49]:

## defining a reusable function to evaluate model performance

def eval_result(model_name,y_train,y_predtrain,y_test,y_predtest,n_neighbors):
    """Print the train and test RMSE for one fitted model.

    Parameters
    ----------
    model_name : unused; kept so existing calls keep working.
    y_train, y_predtrain : true and predicted targets for the training rows.
    y_test, y_predtest : true and predicted targets for the held-out rows.
    n_neighbors : value of k, used only to label the printed lines.

    Returns
    -------
    None. The two RMSE values are printed, not returned.
    """
    # The `squared=False` argument of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE
    # explicitly gives the same RMSE on every scikit-learn version.
    rmse_train = np.sqrt(mean_squared_error(y_train, y_predtrain))
    print( f"RMSE Train error for k= {n_neighbors} is {rmse_train}")
    rmse_test = np.sqrt(mean_squared_error(y_test, y_predtest))
    print( f"RMSE Test error for k= {n_neighbors} is {rmse_test}")

In [50]:
# reusable function to calculate run time for model training

def run_model(model, X=None, y=None):
    """Fit ``model`` and print how long the fit took.

    Parameters
    ----------
    model : any object exposing ``fit(X, y)``.
    X, y : optional training features / targets. When omitted, the
        notebook-level ``X_train`` / ``y_train`` are used, which is what the
        original one-argument call did.

    Returns
    -------
    None. The model is fitted in place and the elapsed time is printed.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    # perf_counter is monotonic and high-resolution, so it is the appropriate
    # clock for measuring short intervals such as a sub-millisecond fit.
    t0 = time.perf_counter()
    model.fit(X, y)
    time_taken = time.perf_counter() - t0
    print(f'Time taken: {time_taken}')

In [51]:
# KNN settings used in this cell:
#   p = 2                -> Euclidean distance
#   weights = 'distance' -> closer neighbours are weighted more heavily than farther ones
#   weights = 'uniform'  -> every neighbour is given equal weight
n_neighbors = 3
model1 = KNeighborsRegressor(
    n_neighbors=n_neighbors,
    weights='distance',
    algorithm='auto',
    p=2,
)
run_model(model1)  # fits the model and prints the fit time

# Predict on the training rows and on the held-out row, then report both RMSEs.
y_hat_train = model1.predict(X_train)
y_hat_test = model1.predict(X_test)
eval_result(model1,y_train,y_hat_train,y_test,y_hat_test,n_neighbors)


Time taken: 0.0014002323150634766
RMSE Train error for k= 3 is 0.16718346377260584
RMSE Test error for k= 3 is 12.58090426874243


In [54]:
# Sweep k = 1..4 using the Euclidean distance (p = 2) with distance-based weights.
for n_neighbors in range(1, 5):
    model1 = KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights='distance',
        algorithm='auto',
        p=2,
    )
    run_model(model1)

    y_hat_train = model1.predict(X_train)
    y_hat_test = model1.predict(X_test)
    print(f'Prediction for K= {n_neighbors} is {y_hat_test}')
    print(f'Actual for K= {n_neighbors} is {y_test.values}')

    # The test prediction is wrapped in a DataFrame before it is scored.
    y_hat_test = pd.DataFrame(y_hat_test)
    eval_result(model1,y_train,y_hat_train,y_test,y_hat_test,n_neighbors)

Time taken: 0.0012545585632324219
Prediction for K= 1 is [96.]
Actual for K= 1 is [100.]
RMSE Train error for k= 1 is 0.23643312187173018
RMSE Test error for k= 1 is 4.0
Time taken: 0.0013051033020019531
Prediction for K= 2 is [95.16455513]
Actual for K= 2 is [100.]
RMSE Train error for k= 2 is 0.16718346377260584
RMSE Test error for k= 2 is 4.835444870868116
Time taken: 0.0009913444519042969
Prediction for K= 3 is [87.41909573]
Actual for K= 3 is [100.]
RMSE Train error for k= 3 is 0.16718346377260584
RMSE Test error for k= 3 is 12.58090426874243
Time taken: 0.0010066032409667969
Prediction for K= 4 is [82.39293628]
Actual for K= 4 is [100.]
RMSE Train error for k= 4 is 0.16718346377260584
RMSE Test error for k= 4 is 17.607063717382687


In [55]:
# Sweep k = 1..4 using the Manhattan distance (p = 1) with distance-based weights.
for n_neighbors in range(1, 5):
    model1 = KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights='distance',
        algorithm='auto',
        p=1,
    )
    run_model(model1)

    y_hat_train = model1.predict(X_train)
    y_hat_test = model1.predict(X_test)
    print(f'Prediction for K= {n_neighbors} is {y_hat_test}')
    print(f'Actual for K= {n_neighbors} is {y_test.values}')

    # The test prediction is wrapped in a DataFrame before it is scored.
    y_hat_test = pd.DataFrame(y_hat_test)
    eval_result(model1,y_train,y_hat_train,y_test,y_hat_test,n_neighbors)

Time taken: 0.003316640853881836
Prediction for K= 1 is [96.]
Actual for K= 1 is [100.]
RMSE Train error for k= 1 is 0.23643312187173018
RMSE Test error for k= 1 is 4.0
Time taken: 0.0007982254028320312
Prediction for K= 2 is [95.19512195]
Actual for K= 2 is [100.]
RMSE Train error for k= 2 is 0.16718346377260584
RMSE Test error for k= 2 is 4.8048780487804805
Time taken: 0.0010404586791992188
Prediction for K= 3 is [87.43395327]
Actual for K= 3 is [100.]
RMSE Train error for k= 3 is 0.16718346377260584
RMSE Test error for k= 3 is 12.566046733428692
Time taken: 0.0007596015930175781
Prediction for K= 4 is [82.2164102]
Actual for K= 4 is [100.]
RMSE Train error for k= 4 is 0.16718346377260584
RMSE Test error for k= 4 is 17.783589796848588


In [56]:
# Sweep k = 1..4 using the Minkowski distance of order p = 3
# (p = 1 and p = 2 are its Manhattan and Euclidean special cases).
for n_neighbors in range(1, 5):
    model1 = KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights='distance',
        algorithm='auto',
        p=3,
    )
    run_model(model1)

    y_hat_train = model1.predict(X_train)
    y_hat_test = model1.predict(X_test)
    print(f'Prediction for K= {n_neighbors} is {y_hat_test}')
    print(f'Actual for K= {n_neighbors} is {y_test.values}')

    eval_result(model1,y_train,y_hat_train,y_test,y_hat_test,n_neighbors)

Time taken: 0.0017693042755126953
Prediction for K= 1 is [96.]
Actual for K= 1 is [100.]
RMSE Train error for k= 1 is 0.23643312187173018
RMSE Test error for k= 1 is 3.999999999999986
Time taken: 0.0009815692901611328
Prediction for K= 2 is [95.13708302]
Actual for K= 2 is [100.]
RMSE Train error for k= 2 is 0.16718346377260584
RMSE Test error for k= 2 is 4.862916980687174
Time taken: 0.001516580581665039
Prediction for K= 3 is [87.32689178]
Actual for K= 3 is [100.]
RMSE Train error for k= 3 is 0.16718346377260584
RMSE Test error for k= 3 is 12.673108215828748
Time taken: 0.0009620189666748047
Prediction for K= 4 is [82.28409563]
Actual for K= 4 is [100.]
RMSE Train error for k= 4 is 0.16718346377260584
RMSE Test error for k= 4 is 17.715904367624262
