In [137]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, root_mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV

from datetime import date

import yfinance as yf

import pickle

In [2]:
def PlotData(data, graph_title, graph_name):
    matplotlib.use('Agg')
    plt.figure(figsize=(10, 6))
    plt.plot(data.index, data)
    plt.title(graph_title)
    # plt.show()

    plt.savefig(f'{graph_name}.png', format='png')

In [3]:
def MinMaxData(data, col_name):
    x = data.describe()
    x = x[col_name]
    # minVal = np.int32(x.loc['min'])
    minVal = x.loc['min']
    # minVal = round(minVal, 4)
    # maxVal = np.int32(x.loc['max'])
    # maxVal = round(maxVal, 4)
    maxVal = x.loc['max']

    return minVal, maxVal

In [4]:
def DataImport(stock_name):
    # t_day = date.today()
    # data = yf.download(stock_name, start=f"{t_day.year - 5}-01-01", end=f"{t_day.year}-{t_day.month-1}-01")
    data = yf.download(stock_name)
    close_data = data['Close']
    # PlotData(close_data, stock_name, "fullStock")
    open_min, open_max = MinMaxData(data, 'Open')
    high_min, high_max = MinMaxData(data, 'High')
    low_min, low_max = MinMaxData(data, 'Low')
    vol_min, vol_max = MinMaxData(data, 'Volume')
    
    d = {"open_min": open_min, "open_max": open_max, "high_min": high_min, "high_max": high_max, 
    "low_min": low_min, "low_max": low_max, "vol_min": vol_min, "vol_max": vol_max}

    return data, d


In [99]:
data, d = DataImport("GC=F")

[*********************100%%**********************]  1 of 1 completed


In [62]:
PlotData(data['Close'], "INFY", "fullStock")

In [100]:
data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-08-30,273.899994,273.899994,273.899994,273.899994,273.899994,0
2000-08-31,274.799988,278.299988,274.799988,278.299988,278.299988,0
2000-09-01,277.000000,277.000000,277.000000,277.000000,277.000000,0
2000-09-05,275.799988,275.799988,275.799988,275.799988,275.799988,2
2000-09-06,274.200012,274.200012,274.200012,274.200012,274.200012,0
...,...,...,...,...,...,...
2024-04-17,2384.199951,2391.399902,2369.300049,2371.699951,2371.699951,37
2024-04-18,2368.399902,2384.399902,2368.399902,2382.300049,2382.300049,128
2024-04-19,2379.300049,2399.800049,2377.300049,2398.399902,2398.399902,160
2024-04-22,2383.300049,2383.300049,2326.899902,2332.199951,2332.199951,160


In [101]:
d

{'open_min': 255.0,
 'open_max': 2384.199951171875,
 'high_min': 256.1000061035156,
 'high_max': 2429.0,
 'low_min': 255.0,
 'low_max': 2377.300048828125,
 'vol_min': 0.0,
 'vol_max': 386334.0}

In [9]:
def modelBuilding(data):
    columns_to_4decimal = ['Open', 'High', 'Low', 'Close']

    data[columns_to_4decimal] = np.round(data[columns_to_4decimal],4)
    # print(data)
    X = data[['Open', 'Low', 'High', 'Volume']]
    y = data['Close']

    #split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    #creates the svr model
    svr_model = SVR(kernel = 'rbf', C=100, epsilon=0.1)


    #train the model
    svr_model.fit(X_train_scaled, y_train)

    scaler = MinMaxScaler()
    X_train_normalized = scaler.fit_transform(X_train)
    X_test_normalized = scaler.transform(X_test)


    #defines the parameter for grid search 
    param_grid = {
        'kernel': ['linear', 'rbf'],
        'C': [1, 10, 50, 100],
        'epsilon': [0.01, .1, 0.2, 0.5]
    }
    svr = SVR()

    grid_search = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    normal_pred = grid_search.fit(X_train_normalized, y_train)

    best_params = grid_search.best_params_


    #trains the model with the best parameter 
    best_svr_model = SVR(**best_params)
    best_svr_model.fit(X_train_normalized, y_train)

    return best_svr_model, scaler, X_test, y_test, X_train, y_train

In [102]:
model, scaler, X_test, y_test, X_train, y_train = modelBuilding(data)

In [103]:
X_test

Unnamed: 0_level_0,Open,Low,High,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006-12-11,623.0,622.4,629.7,467
2006-11-17,617.5,617.5,617.5,10
2011-02-11,1364.0,1355.2,1368.8,488
2015-06-22,1197.8,1182.7,1197.8,30
2014-10-23,1228.0,1227.9,1228.5,2
...,...,...,...,...
2019-04-25,1280.6,1275.8,1280.8,95
2005-08-29,438.5,437.3,439.5,37
2022-04-14,1977.2,1963.0,1978.7,101
2006-02-22,552.6,551.0,554.8,253


In [104]:
X_train

Unnamed: 0_level_0,Open,Low,High,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011-06-22,1547.2,1544.4,1558.5,222
2021-11-04,1776.6,1776.2,1798.2,103
2003-04-16,325.5,323.5,325.8,0
2002-05-23,322.5,322.5,322.5,0
2016-04-13,1256.5,1241.3,1256.9,203
...,...,...,...,...
2015-09-18,1128.3,1127.8,1141.2,59
2021-05-13,1814.7,1814.1,1826.8,100
2021-07-02,1778.7,1778.3,1787.9,23
2022-02-25,1906.9,1883.6,1920.0,586


In [105]:
y_test
y_test2 = np.array(y_test)
y_test2

array([ 630. ,  621.5, 1359.9, ..., 1970.9,  553.9, 1574.6])

In [106]:
y_train
y_train2 = np.array(y_train)
y_train2

array([1552.9, 1793. ,  325.8, ..., 1782.6, 1886.5,  416. ])

In [107]:
y_test_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)



In [108]:
y_test_pred

array([1333842.85204359, 1317144.63175472, 2904940.26395437, ...,
       4205183.97454052, 1179001.54818424, 3359187.49477712])

In [109]:
y_train_pred

array([3307131.20332258, 3807803.35109168,  692870.00278366, ...,
       3801346.43631903, 4056806.58408809,  883721.79125505])

In [110]:
scaler2 = MinMaxScaler(feature_range=(0, 1))
y_test = scaler2.fit_transform(y_test2.reshape(-1, 1))
y_test_pred2 = scaler2.fit_transform(y_test_pred.reshape(-1, 1))

y_train = scaler2.fit_transform(y_train2.reshape(-1, 1))
y_train_pred2 = scaler2.fit_transform(y_train_pred.reshape(-1, 1))

In [111]:
y_test_binary = (y_test > 0.5).astype(int)
y_test_pred_binary = (y_test_pred2 > 0.5).astype(int)

y_train_binary = (y_train > 0.5).astype(int)
y_train_pred_binary = (y_train_pred2 > 0.5).astype(int)

In [112]:
y_test_binary

array([[0],
       [0],
       [1],
       ...,
       [1],
       [0],
       [1]])

In [113]:
y_train_binary

array([[1],
       [1],
       [0],
       ...,
       [1],
       [1],
       [0]])

In [114]:
y_test_pred_binary

array([[0],
       [0],
       [1],
       ...,
       [1],
       [0],
       [1]])

In [115]:
y_train_pred_binary

array([[1],
       [1],
       [0],
       ...,
       [1],
       [1],
       [0]])

In [116]:
acc_test = accuracy_score(y_test_binary, y_test_pred_binary)
acc_train = accuracy_score(y_train_binary, y_train_pred_binary)

In [117]:
acc_test

0.9882055602358888

In [118]:
acc_train

0.9863042562157607

In [119]:
(acc_train*100, acc_test*100)

(98.63042562157607, 98.82055602358888)

In [120]:
avg_acc = (acc_test+acc_train)/2
avg_acc*100

98.72549082258247

In [121]:
pres_test = precision_score(y_test_binary, y_test_pred_binary)
pres_train = precision_score(y_train_binary, y_train_pred_binary)

In [122]:
(pres_test, pres_train)

(0.9953379953379954, 0.9669226225634967)

In [123]:
rec_test = recall_score(y_test_binary, y_test_pred_binary)
rec_train = recall_score(y_train_binary, y_train_pred_binary)

In [124]:
(rec_test, rec_train)

(0.9726651480637813, 0.9945321992709599)

In [125]:
f1_test = f1_score(y_test_binary, y_test_pred_binary)
f1_train = f1_score(y_train_binary, y_train_pred_binary)

In [126]:
(f1_test, f1_train)

(0.9838709677419355, 0.9805330937406409)

In [135]:
mse_test = mean_squared_error(y_test_binary, y_test_pred_binary)
mse_train = mean_squared_error(y_train_binary, y_train_pred_binary)

In [136]:
(mse_test, mse_train)

(0.011794439764111205, 0.013695743784239359)

In [139]:
rmse_test = root_mean_squared_error(y_test_binary, y_test_pred_binary)
rmse_train = root_mean_squared_error(y_train_binary, y_train_pred_binary)

In [140]:
(rmse_test, rmse_train)

(0.10860220883624423, 0.11702881604220115)

In [127]:
r2_test = r2_score(y_test_binary, y_test_pred_binary)
r2_train = r2_score(y_train_binary, y_train_pred_binary)

In [128]:
(r2_test, r2_train)

(0.9493927618676379, 0.9395425861325599)

In [142]:
cm_test = confusion_matrix(y_test_binary, y_test_pred_binary)
cm_test

array([[746,   2],
       [ 12, 427]], dtype=int64)

In [148]:
plt.figure(figsize=(8, 6))
sns.heatmap(cm_test, annot=True, fmt='d', cmap='Blues', xticklabels=['0', '1'], yticklabels=['0', '1'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

  plt.show()


In [141]:
cm_train = confusion_matrix(y_train_binary, y_train_pred_binary)
cm_train

array([[3044,   56],
       [   9, 1637]], dtype=int64)

In [147]:
def plotCm(cm):
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['0', '1'], yticklabels=['0', '1'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

In [143]:
plotCm(cm_test)

  plt.show()


In [132]:
plotCm(cm_train)

  plt.show()


In [53]:
def Next30_days(data, d):
    new_data = pd.DataFrame()
    new_data['Date'] = pd.to_datetime(data.index)
    new_data = new_data.sort_values(by='Date')

    # Extract the last date in the data
    last_date = new_data['Date'].max()

    # Generate dates for the next 30 days
    next_30_days_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=30, freq='D')

    # Create a DataFrame for the next 30 days data
    next_30_days_data = pd.DataFrame({'Date': next_30_days_dates})
    
    # next_30_days_data['Volume'] = np.random.randint(100000, 500000, size=len(next_30_days_data))
    next_30_days_data['Open'] = np.random.randint(d['open_min'], d['open_max'], size=len(next_30_days_data))
    next_30_days_data['Low'] = np.random.randint(d['low_min'], d['low_max'], size=len(next_30_days_data))
    next_30_days_data['High'] = np.random.randint(d['high_min'], d['high_max'], size=len(next_30_days_data))
    next_30_days_data['Volume'] = np.random.randint(d['vol_min'], d['vol_max'], size=len(next_30_days_data))

    next_30_days_data.set_index('Date', inplace=True)
    # Print or use the prepared next 30 days data
    # print(next_30_days_data)

    return next_30_days_data

In [54]:
def predictData(data, d):
    next_30_days_data = Next30_days(data, d)

    next_30_days_scaled = scaler.transform(next_30_days_data)

    # Make predictions for the next 30 days
    predictions_next_30_days = model.predict(next_30_days_scaled)

    return predictions_next_30_days, next_30_days_data

In [114]:
pred, next_data = predictData(data, d)

In [56]:
def CombineData(data, pred, next_data):
    original_close_data = pd.DataFrame()
    original_close_data['Close'] = data['Close']
    df = pd.DataFrame(pred, index=next_data.index)
    df.columns = ['Close']
    # original_close_data
    # df
    predicted_30_days_data_combined = pd.concat([original_close_data, df])
    
    return predicted_30_days_data_combined

In [115]:
data_final = CombineData(data, pred, next_data)

In [155]:
def plotPredict(data_final):
    x = data_final['Close']
    x = x[::-100]
    # PlotData(x, f"{stock_name} Prediction", "Data_Prediction")
    return x

In [156]:
x = plotPredict(data_final)
PlotData(x, f"INFY Prediction", "Data_Prediction")

In [157]:
def OneYear_data(data):
    data = data['Close']
    one_year_data = data.tail(365)
    # PlotData(one_year_data, "Yearly Stock Data", "yearly")
    return one_year_data

In [158]:
one_year_data = OneYear_data(data)
PlotData(one_year_data, "Yearly Stock Data", "yearly")

In [160]:
def LastMonth_data(data):
    data = data['Close']
    last_month_data = data.tail(30)
    # PlotData(last_moth_data, "Monthly Stock Data", "monthly")
    return last_month_data

In [161]:
last_month_data = LastMonth_data(data)
PlotData(last_month_data, "Monthly Stock Data", "monthly")

In [162]:
def LastWeek_data(data):
    data = data['Close']
    last_week_data = data.tail(7)
    # PlotData(last_week_data, "Weekly Stock Data", "weekly")

    return last_week_data

In [163]:
last_week_data = LastWeek_data(data)
PlotData(last_week_data, "Weekly Stock Data", "weekly")

In [169]:
data = {
    "plot": PlotData,
    "minmax": MinMaxData,
    "dataImport": DataImport,
    "model": modelBuilding,
    "next30": Next30_days,
    "predictData": predictData,
    "combine": CombineData,
    "plotPred": plotPredict,
    "year": OneYear_data,
    "month": LastMonth_data,
    "week": LastWeek_data,
}

In [179]:
with open(r'../stockpred/stockpred/Model/svm_final.pkl', 'wb') as file:
    pickle.dump(data, file)

In [171]:
with open("svm_final.pkl", "rb") as file:
    load_data = pickle.load(file)

In [172]:
data, d = load_data['dataImport']("INFY")

[*********************100%%**********************]  1 of 1 completed
