<a href="https://colab.research.google.com/github/Mayank-004/Google-Trend-Analysis/blob/main/KNN_3D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 3D KNN Prediction

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
from patsy import dmatrices
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score,accuracy_score
import seaborn as sns
import random
import time
random.seed(786)
from sklearn.metrics import mean_squared_error

In [17]:
# Load the lagged search-interest table.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` keeps the old behaviour: malformed rows are skipped
# and a warning is emitted for each of them.
dataset = pd.read_csv('multiTimeline_3D.csv', on_bad_lines='warn', sep=',')


In [30]:
# Summarise missing data: NaN count and NaN percentage per column,
# both ordered from the most to the least affected column.
missing = dataset.isna().sum().sort_values(ascending=False)
percent_missing = (missing / len(dataset) * 100).sort_values(ascending=False)
missing_df = pd.concat(
    [missing, percent_missing],
    axis=1,
    keys=['Total', 'Percent'],
    sort=False,
)
# Display only the columns that actually contain missing values.
missing_df.loc[missing_df['Total'].ge(1)]

Unnamed: 0,Total,Percent
x_t2,3,1.807229


In [31]:
# Discard every row that contains at least one NaN.
dataset = dataset.dropna()

In [41]:
dataset

Unnamed: 0,x_t0,x_t1,x_t11,x_t2
0,69,66,61,57.0
1,66,61,57,51.0
2,61,57,51,49.0
3,57,51,49,47.0
4,51,49,47,46.0
...,...,...,...,...
158,25,24,26,28.0
159,24,26,28,43.0
160,26,28,43,85.0
161,28,43,85,96.0


In [32]:
# Features are every column except the last; the last column is the target.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [33]:
# Hold out the final row as the single test observation; train on the rest.
X_train, X_test = X.iloc[:-1], X.iloc[-1:]
y_train, y_test = y.iloc[:-1], y.iloc[-1:]

In [34]:

## defining a reusable function to evaluate model performance

def eval_result(model_name,y_train,y_predtrain,y_test,y_predtest,n_neighbors):
    """Print the train and test RMSE of a fitted regressor.

    Parameters
    ----------
    model_name : object
        Unused; accepted so existing call sites keep working.
    y_train, y_predtrain : array-like
        True and predicted targets for the training rows.
    y_test, y_predtest : array-like
        True and predicted targets for the held-out rows.
    n_neighbors : int
        Value of k used by the model; only echoed in the printed messages.

    Returns
    -------
    None
        Both RMSE values are printed, not returned.
    """
    # Take the square root of the MSE explicitly: the `squared=False` keyword
    # of mean_squared_error was deprecated in scikit-learn 1.4 and removed in
    # 1.6, so relying on it breaks the notebook on current releases.
    rmse_train = float(np.sqrt(mean_squared_error(y_train, y_predtrain)))
    print( f"RMSE Train error for k= {n_neighbors} is {rmse_train}")
    rmse_test = float(np.sqrt(mean_squared_error(y_test, y_predtest)))
    print( f"RMSE Test error for k= {n_neighbors} is {rmse_test}")

In [35]:
# reusable function to calculate run time for model training

def run_model(model, X=None, y=None):
    """Fit ``model`` and print how long the fit took.

    Parameters
    ----------
    model : estimator
        Any object exposing ``fit(X, y)``.
    X, y : optional
        Training features and targets. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, so existing ``run_model(model)``
        calls behave as before.

    Returns
    -------
    None
        The elapsed time (in seconds) is printed, not returned.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    # perf_counter is monotonic and high-resolution, which matters for fits
    # that finish in under a millisecond; time.time() follows the wall clock
    # and can jump if the system time is adjusted.
    t0 = time.perf_counter()
    model.fit(X, y)
    time_taken = time.perf_counter() - t0
    print(f'Time taken: {time_taken}')

In [36]:
# Baseline: a single KNN regressor with k = 3.
#   p=2                -> Euclidean distance
#   weights='distance' -> closer neighbours are weighted more heavily
#   weights='uniform'  -> (alternative) every neighbour gets equal weight
n_neighbors = 3
model1 = KNeighborsRegressor(
    n_neighbors=n_neighbors,
    weights='distance',
    algorithm='auto',
    p=2,
)
run_model(model1)  # fits on the training split and prints the elapsed time

# Predict on both splits, then report train/test RMSE.
y_hat_train = model1.predict(X_train)
y_hat_test = model1.predict(X_test)
eval_result(model1, y_train, y_hat_train, y_test, y_hat_test, n_neighbors)


Time taken: 0.00128936767578125
RMSE Train error for k= 3 is 0.2721655269759087
RMSE Test error for k= 3 is 12.376336766074573


In [37]:
# Sweep k = 1..3 using the Euclidean metric (p=2). For each k, print the
# held-out prediction next to the actual value, then the train/test RMSE.
for n_neighbors in (1, 2, 3):
    model1 = KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights='distance',
        algorithm='auto',
        p=2,
    )
    run_model(model1)

    y_hat_train = model1.predict(X_train)
    y_hat_test = model1.predict(X_test)
    print(f'Prediction for K= {n_neighbors} is {y_hat_test}')
    print(f'Actual for K= {n_neighbors} is {y_test.values}')

    # The test prediction is wrapped in a DataFrame before being scored.
    y_hat_test = pd.DataFrame(y_hat_test)
    eval_result(model1, y_train, y_hat_train, y_test, y_hat_test, n_neighbors)

Time taken: 0.0011234283447265625
Prediction for K= 1 is [96.]
Actual for K= 1 is [100.]
RMSE Train error for k= 1 is 0.3849001794597505
RMSE Test error for k= 1 is 4.0
Time taken: 0.0006678104400634766
Prediction for K= 2 is [95.19206281]
Actual for K= 2 is [100.]
RMSE Train error for k= 2 is 0.2721655269759087
RMSE Test error for k= 2 is 4.8079371877103085
Time taken: 0.0007367134094238281
Prediction for K= 3 is [87.62366323]
Actual for K= 3 is [100.]
RMSE Train error for k= 3 is 0.2721655269759087
RMSE Test error for k= 3 is 12.376336766074573


In [38]:
# Sweep k = 1..3 using the Manhattan metric (p=1). For each k, print the
# held-out prediction next to the actual value, then the train/test RMSE.
for n_neighbors in (1, 2, 3):
    model1 = KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights='distance',
        algorithm='auto',
        p=1,
    )
    run_model(model1)

    y_hat_train = model1.predict(X_train)
    y_hat_test = model1.predict(X_test)
    print(f'Prediction for K= {n_neighbors} is {y_hat_test}')
    print(f'Actual for K= {n_neighbors} is {y_test.values}')

    # The test prediction is wrapped in a DataFrame before being scored.
    y_hat_test = pd.DataFrame(y_hat_test)
    eval_result(model1, y_train, y_hat_train, y_test, y_hat_test, n_neighbors)

Time taken: 0.001041412353515625
Prediction for K= 1 is [96.]
Actual for K= 1 is [100.]
RMSE Train error for k= 1 is 0.3849001794597505
RMSE Test error for k= 1 is 4.0
Time taken: 0.0008928775787353516
Prediction for K= 2 is [93.47222222]
Actual for K= 2 is [100.]
RMSE Train error for k= 2 is 0.2721655269759087
RMSE Test error for k= 2 is 6.5277777777777715
Time taken: 0.0007045269012451172
Prediction for K= 3 is [93.61004367]
Actual for K= 3 is [100.]
RMSE Train error for k= 3 is 0.2721655269759087
RMSE Test error for k= 3 is 6.389956331877727


In [39]:
# Sweep k = 1..3 using the Minkowski metric of order 3 (p=3). Minkowski is the
# general family: p=1 is Manhattan and p=2 is Euclidean. For each k, print the
# held-out prediction next to the actual value, then the train/test RMSE.
for n_neighbors in (1, 2, 3):
    model1 = KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights='distance',
        algorithm='auto',
        p=3,
    )
    run_model(model1)

    y_hat_train = model1.predict(X_train)
    y_hat_test = model1.predict(X_test)
    print(f'Prediction for K= {n_neighbors} is {y_hat_test}')
    print(f'Actual for K= {n_neighbors} is {y_test.values}')

    eval_result(model1, y_train, y_hat_train, y_test, y_hat_test, n_neighbors)

Time taken: 0.0010857582092285156
Prediction for K= 1 is [96.]
Actual for K= 1 is [100.]
RMSE Train error for k= 1 is 0.3849001794597505
RMSE Test error for k= 1 is 3.999999999999986
Time taken: 0.0006811618804931641
Prediction for K= 2 is [95.1448283]
Actual for K= 2 is [100.]
RMSE Train error for k= 2 is 0.2721655269759087
RMSE Test error for k= 2 is 4.855171699716834
Time taken: 0.0007872581481933594
Prediction for K= 3 is [87.38396711]
Actual for K= 3 is [100.]
RMSE Train error for k= 3 is 0.2721655269759087
RMSE Test error for k= 3 is 12.616032885738605
