In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, mean_absolute_error, r2_score
from sklearn.neighbors import KNeighborsRegressor

In [3]:
df1 = pd.read_csv(r"DataSets\securities.csv")

In [4]:
df2 = pd.read_csv(r"DataSets\prices.csv")

In [5]:
df1 = pd.concat([df1,df2])

In [6]:
df2['Daily Return'] = df2['close'] - df2['open']

In [8]:
df2.head()

Unnamed: 0,date,symbol,open,close,low,high,volume,Daily Return
0,2016-01-05 00:00:00,WLTW,123.43,125.839996,122.309998,126.25,2163600.0,2.409996
1,2016-01-06 00:00:00,WLTW,125.239998,119.980003,119.940002,125.540001,2386400.0,-5.259995
2,2016-01-07 00:00:00,WLTW,116.379997,114.949997,114.93,119.739998,2489500.0,-1.43
3,2016-01-08 00:00:00,WLTW,115.480003,116.620003,113.5,117.440002,2006300.0,1.14
4,2016-01-11 00:00:00,WLTW,117.010002,114.970001,114.089996,117.330002,1408600.0,-2.040001


In [9]:
df2.shape

(851264, 8)

In [10]:
df2.isnull().sum()

date            0
symbol          0
open            0
close           0
low             0
high            0
volume          0
Daily Return    0
dtype: int64

In [11]:
df1.isnull().sum()

Ticker symbol              851264
Security                   851264
SEC filings                851264
GICS Sector                851264
GICS Sub Industry          851264
Address of Headquarters    851264
Date first added           851462
CIK                        851264
date                          505
symbol                        505
open                          505
close                         505
low                           505
high                          505
volume                        505
dtype: int64

In [12]:
df2

Unnamed: 0,date,symbol,open,close,low,high,volume,Daily Return
0,2016-01-05 00:00:00,WLTW,123.430000,125.839996,122.309998,126.250000,2163600.0,2.409996
1,2016-01-06 00:00:00,WLTW,125.239998,119.980003,119.940002,125.540001,2386400.0,-5.259995
2,2016-01-07 00:00:00,WLTW,116.379997,114.949997,114.930000,119.739998,2489500.0,-1.430000
3,2016-01-08 00:00:00,WLTW,115.480003,116.620003,113.500000,117.440002,2006300.0,1.140000
4,2016-01-11 00:00:00,WLTW,117.010002,114.970001,114.089996,117.330002,1408600.0,-2.040001
...,...,...,...,...,...,...,...,...
851259,2016-12-30,ZBH,103.309998,103.199997,102.849998,103.930000,973800.0,-0.110001
851260,2016-12-30,ZION,43.070000,43.040001,42.689999,43.310001,1938100.0,-0.029999
851261,2016-12-30,ZTS,53.639999,53.529999,53.270000,53.740002,1701200.0,-0.110000
851262,2016-12-30 00:00:00,AIV,44.730000,45.450001,44.410000,45.590000,1380900.0,0.720001


In [13]:
scaler= StandardScaler()

In [19]:
x = pd.DataFrame(df2.drop(columns = ["Daily Return"]))
x = pd.DataFrame(x.drop(columns = ["date"]))
x = pd.DataFrame(x.drop(columns = ["symbol"]))
x = pd.DataFrame(x.drop(columns = ["close"]))
x = pd.DataFrame(x.drop(columns = ["high"]))
x = pd.DataFrame(x.drop(columns = ["low"]))
y = df2["Daily Return"]

In [20]:
X_scaled = scaler.fit_transform(x)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2, random_state = 42)

In [22]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [23]:
x

Unnamed: 0,open,volume
0,123.430000,2163600.0
1,125.239998,2386400.0
2,116.379997,2489500.0
3,115.480003,2006300.0
4,117.010002,1408600.0
...,...,...
851259,103.309998,973800.0
851260,43.070000,1938100.0
851261,53.639999,1701200.0
851262,44.730000,1380900.0


In [24]:
param_grid = {
 'n_neighbors': [3, 5, 7, 9, 11, 13, 15],        
# Regularization parameter
 'weights': ['uniform', 'distance'],  # Kernel coefficient
 'metric': ['euclidean', 'manhattan','minkowski'],    
 'p': [1, 2] # power parameter for Minkowski distance   
}

In [28]:
model = KNeighborsRegressor()

In [29]:
grid_search = GridSearchCV(estimator = model, param_grid = param_grid, cv=kfold, scoring='r2', n_jobs = -1)
grid_search.fit(X_train, y_train)

In [30]:
print("Best Cross-Validation Accuracy:", grid_search.best_score_)
print("Best Parameters:", grid_search.best_params_)

Best Cross-Validation Accuracy: -0.050977563914606575
Best Parameters: {'metric': 'manhattan', 'n_neighbors': 15, 'p': 1, 'weights': 'distance'}


In [31]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [32]:
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("R-Squared:", r2_score(y_test, y_pred))

Mean Squared Error: 2.265073833385697
Mean Absolute Error: 0.7378994205226624
R-Squared: -0.060543870738931416
