<a href="https://colab.research.google.com/github/LezendarySandwich/CSL7550-Stock-Price-Prediction/blob/master/dataset_setup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
import shutil
from google.colab import drive

# Attach Google Drive to this runtime unless the mount point is already present.
drive_mount_point = '/content/drive'
if not os.path.exists(drive_mount_point):
  drive.mount(drive_mount_point)

In [4]:
!pip install quandl
!pip install python-dotenv

Collecting quandl
  Downloading Quandl-3.6.2-py2.py3-none-any.whl (26 kB)
Collecting inflection>=0.3.1
  Downloading inflection-0.5.1-py2.py3-none-any.whl (9.5 kB)
Installing collected packages: inflection, quandl
Successfully installed inflection-0.5.1 quandl-3.6.2
Collecting python-dotenv
  Downloading python_dotenv-0.19.1-py2.py3-none-any.whl (17 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.19.1


In [5]:
import pandas as pd
import matplotlib.pyplot as plt 
import quandl
from sklearn.linear_model import LinearRegression

In [6]:
# Remove the `sample_data` folder from the working directory. ignore_errors=True
# keeps the cell re-runnable once the folder is already gone.
shutil.rmtree('sample_data', ignore_errors=True)

In [7]:
# @markdown ### CONSTANTS
# `time` is no longer needed for SEED; the import is retained in case a later
# cell relies on it being available.
from time import time

# Root of the mounted Drive and the folder inside it that holds credentials.
STORAGE = '/content/drive/MyDrive' #@param {type : "string"}
KAGGLE_DIR = 'kaggle' #@param {type : "string"}
# Exported below as KAGGLE_CONFIG_DIR for the Kaggle CLI.
KAGGLE_JSON_PATH = os.path.join(STORAGE, KAGGLE_DIR)
# dotenv file that is read for QUANDL_API_KEY.
QUANDL_PATH = os.path.join(KAGGLE_JSON_PATH, '.quandl')
# Project folder on Drive and the dataset folder inside it.
__PROJECT = os.path.join(STORAGE, 'Neo')
ML_DATASET = os.path.join(__PROJECT, 'dataset')
# Where the downloaded Kaggle archive is placed before extraction.
GOOGLE_STOCK_DATA_ZIP = os.path.join(ML_DATASET, 'google-stock-price.zip')
SPLIT = 0.8
# Fixed seed: a time-derived value differs on every run, which makes any
# seeded shuffle/split/initialisation impossible to reproduce.
SEED = 42

In [8]:
# Create the project and dataset folders. exist_ok=True makes the cell
# idempotent without a separate existence check, in line with the
# os.makedirs(..., exist_ok=True) call used for the extraction directory.
os.makedirs(__PROJECT, exist_ok=True)
os.makedirs(ML_DATASET, exist_ok=True)

In [9]:
!pip install pip --upgrade # command to upgrade pip
!pip install kaggle --upgrade --force-reinstall --no-deps # command to upgrade to kaggle version 1.5.10
# @markdown Kaggle API official [Docs](https://github.com/Kaggle/kaggle-api)

Collecting pip
  Downloading pip-21.3.1-py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.4 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-21.3.1
Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
     |████████████████████████████████| 58 kB 3.1 MB/s             
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73051 sha256=03c27fd58b75570d33e13eb3073346ec47be35a240974126155764beed51c8f8
  Stored in directory: /root/.cache/pip/wheels/62/d6/58/5853130f941e75b2177d281eb7e44b4a98ed46dd155f556dc5
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Foun

In [10]:
os.environ['KAGGLE_CONFIG_DIR']=os.path.join(KAGGLE_JSON_PATH) # we have stored the kaggle.json in .kaggle of my drive
!kaggle --version # Kaggle API 1.5.10

Kaggle API 1.5.12


In [11]:
from dotenv import load_dotenv, find_dotenv
from pathlib import Path

# Load the dotenv file at QUANDL_PATH into the process environment, then hand
# QUANDL_API_KEY to the quandl client (None when the variable is not set).
load_dotenv(QUANDL_PATH)
quandl.ApiConfig.api_key = os.environ.get('QUANDL_API_KEY')

In [12]:
# Download the medharawat/google-stock-price dataset archive with the Kaggle CLI;
# the zip lands in the current working directory.
! kaggle datasets download -d medharawat/google-stock-price

Downloading google-stock-price.zip to /content
  0% 0.00/23.4k [00:00<?, ?B/s]
100% 23.4k/23.4k [00:00<00:00, 20.3MB/s]


In [13]:
! mv google-stock-price.zip $ML_DATASET/google-stock-price.zip

In [14]:
from zipfile import ZipFile

# Unpack the downloaded archive into its own folder under ML_DATASET, then
# delete the archive.
file_path = GOOGLE_STOCK_DATA_ZIP
directory = os.path.join(ML_DATASET, 'google_stock_price_dataset')

os.makedirs(directory, exist_ok=True)

# Bind the archive to a name other than `zip`: at notebook top level that name
# would replace the builtin zip() for every cell that runs afterwards.
with ZipFile(file_path, 'r') as archive:
  archive.extractall(directory)

os.remove(file_path)

In [15]:
# Build the CSV path from ML_DATASET so it follows the STORAGE form field
# instead of a hard-coded Drive location.
df = pd.read_csv(os.path.join(ML_DATASET, 'google_stock_price_dataset', 'Google_Stock_Price_Train.csv'))
df

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,1/3/2012,325.25,332.83,324.97,663.59,7380500
1,1/4/2012,331.27,333.87,329.08,666.45,5749400
2,1/5/2012,329.83,330.75,326.89,657.21,6590300
3,1/6/2012,328.34,328.77,323.68,648.24,5405900
4,1/9/2012,322.04,322.29,309.46,620.76,11688800
...,...,...,...,...,...,...
1253,12/23/2016,790.90,792.74,787.28,789.91,623400
1254,12/27/2016,790.68,797.86,787.66,791.55,789100
1255,12/28/2016,793.70,794.23,783.20,785.05,1153800
1256,12/29/2016,783.33,785.93,778.92,782.79,744300


In [16]:
from datetime import date

class Quadl_Data_setup:
  """Download NSE price history through quandl and store it as train/test CSVs.

  Each scrip is written to `<dir>/<scrip>_<start_date>_<end_date>/` as
  `dataset_train.csv` and `dataset_test.csv`.
  """

  def __init__(self, dir, start_date, end_date):
    """Remember the output root and the date range to request.

    Args:
      dir: Directory under which one folder per scrip is created.
      start_date: Forwarded to `quandl.get` as `start_date`; also part of the
        folder name.
      end_date: Forwarded to `quandl.get` as `end_date`; also part of the
        folder name.
    """
    self.__dir = dir
    self.start_date = start_date
    self.end_date = end_date
    # Columns copied from the downloaded frame; 'Total Trade Quantity' is
    # renamed to 'Volume' before saving.
    self.features = ['Open', 'High', 'Low', 'Close', 'Total Trade Quantity']

  def setup_dataset(self, scrip):
    """Create, or re-display if already present, the train/test CSVs for a scrip.

    When both CSV files already exist they are read back and printed and
    nothing is downloaded. Otherwise `NSE/<scrip>` is fetched with
    `quandl.get`, reduced to `Date` plus the configured feature columns, split
    so that the last fifth of the rows (rounded up) becomes the test set, and
    both parts are written to disk and printed.

    Args:
      scrip: Symbol appended to `NSE/` to form the quandl dataset code.

    Returns:
      None.
    """
    save_dir = os.path.join(self.__dir, f'{scrip}_{self.start_date}_{self.end_date}')
    save_file_train = os.path.join(save_dir, 'dataset_train.csv')
    save_file_test = os.path.join(save_dir, 'dataset_test.csv')

    # Treat the scrip as done only when both CSVs exist. A bare folder can be
    # left behind by a run that failed before writing, and must be retried
    # rather than crash on a missing file.
    if os.path.isfile(save_file_train) and os.path.isfile(save_file_test):
      print(f'\n{scrip} already done\n')
      train_dataset = pd.read_csv(save_file_train)
      test_dataset = pd.read_csv(save_file_test)
      print(f'Train_dataset\n{train_dataset}')
      print(f'Test_dataset\n{test_dataset}')
      return

    stock_data = quandl.get(f'NSE/{scrip}', start_date=self.start_date, end_date=self.end_date)

    # Turn the frame's index into a regular 'Date' column and place the
    # selected features next to it, row by row.
    dataset = (
        pd.DataFrame({'Date': stock_data.index})
        .join(stock_data[self.features].reset_index(drop=True))
        .rename(columns={'Total Trade Quantity': 'Volume'})
    )

    # Last fifth of the rows, rounded up, goes to the test set; -(-n // 5) is
    # ceiling division.
    test_rows = -(-len(dataset) // 5)
    split_at = len(dataset) - test_rows
    train_dataset = dataset.iloc[:split_at]
    test_dataset = dataset.iloc[split_at:]

    # Create the folder only once there is data to write into it.
    os.makedirs(save_dir, exist_ok=True)
    train_dataset.to_csv(save_file_train, encoding='utf-8', index=False)
    test_dataset.to_csv(save_file_test, encoding='utf-8', index=False)

    print(f'\n{scrip} done\n')
    print(f'Train_dataset\n{train_dataset}')
    print(f'Test_dataset\n{test_dataset}')


## WORKING WITH NSE DATA

In [17]:
import tqdm.notebook as tq

# NSE symbols to download and the date ranges to download them for;
# START_DATES[i] is paired with END_DATES[i].
scrips = ['TCS', 'TATASTEEL', 'RELIANCE']
START_DATES = [date(2014, 1, 1)]
END_DATES = [date(2019, 1, 1)]


# Wrap the list itself (not the enumerate object) in tqdm so the progress bar
# knows its total instead of showing "0it".
for index, start_date in enumerate(tq.tqdm(START_DATES)):

  quadl_data_setup = Quadl_Data_setup(ML_DATASET, start_date, END_DATES[index])

  for scrip in tq.tqdm(scrips):
    quadl_data_setup.setup_dataset(scrip)

0it [00:00, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]


TCS done

Train_dataset
          Date     Open     High      Low    Close     Volume
0   2014-01-01  2180.10  2184.50  2151.10  2153.30   264976.0
1   2014-01-02  2166.00  2189.00  2156.15  2167.00   863474.0
2   2014-01-03  2164.70  2229.10  2147.25  2222.20  1309087.0
3   2014-01-06  2229.00  2244.00  2197.00  2239.60  1155905.0
4   2014-01-07  2240.00  2256.45  2199.00  2206.15  1448743.0
..         ...      ...      ...      ...      ...        ...
982 2017-12-22  2604.00  2663.90  2590.20  2646.75  1832832.0
983 2017-12-26  2686.00  2689.75  2631.60  2649.45  1087893.0
984 2017-12-27  2649.45  2658.50  2610.50  2619.90   369642.0
985 2017-12-28  2608.80  2653.50  2605.65  2626.95  1302544.0
986 2017-12-29  2623.10  2708.90  2618.55  2701.20  1304771.0

[987 rows x 6 columns]
Test_dataset
           Date    Open    High      Low    Close     Volume
987  2018-01-01  2682.3  2694.8  2635.00  2645.60   675880.0
988  2018-01-02  2660.0  2669.6  2620.20  2631.20   960145.0
989  2018-0