<a href="https://colab.research.google.com/github/LezendarySandwich/CSL7550-Stock-Price-Prediction/blob/master/dataset_setup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import shutil
from google.colab import drive

# Mount Google Drive only when the mount point is not already present,
# so re-running this cell does not try to mount a second time.
if not os.path.exists('/content/drive'):
  drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
import quandl
from sklearn.linear_model import LinearRegression

In [None]:
!rm -r sample_data

rm: cannot remove 'sample_data': No such file or directory


In [127]:
# @markdown ### CONSTANTS
from time import time

# Base folder on the mounted Drive; every path below is derived from it.
STORAGE = '/content/drive/MyDrive' #@param {type : "string"}
# Sub-folder of STORAGE that a later cell exports as KAGGLE_CONFIG_DIR.
KAGGLE_DIR = 'kaggle' #@param {type : "string"}
KAGGLE_JSON_PATH = os.path.join(STORAGE, KAGGLE_DIR)
# File handed to load_dotenv() in a later cell; QUANDL_API_KEY is read from the environment afterwards.
QUANDL_PATH = os.path.join(KAGGLE_JSON_PATH, '.quandl')
# Project folder and the dataset folder inside it.
# NOTE(review): a leading double underscore is name-mangled if this name is
# ever referenced from inside a class body; keep its uses at module level.
__PROJECT = os.path.join(STORAGE, 'Neo')
ML_DATASET = os.path.join(__PROJECT, 'dataset')
# Final location of the Kaggle archive once it has been moved onto Drive.
GOOGLE_STOCK_DATA_ZIP = os.path.join(ML_DATASET, 'google-stock-price.zip')
# Fraction of rows sampled into the training split; the remaining rows form the test split.
SPLIT = 0.8
# Used as random_state when sampling the training split.
# NOTE(review): seeding from the wall clock gives a different split on every
# fresh run; consider a fixed integer if the split must be reproducible.
SEED = int(time())

In [None]:
# Create the project and dataset folders on Drive.
# exist_ok=True makes the cell safe to re-run and removes the
# check-then-create gap of a separate os.path.exists() test.
os.makedirs(__PROJECT, exist_ok=True)
os.makedirs(ML_DATASET, exist_ok=True)

In [None]:
!pip install pip --upgrade # command to upgrade pip
!pip install kaggle --upgrade --force-reinstall --no-deps # command to upgrade to kaggle version 1.5.10
# @markdown Kaggle API official [Docs](https://github.com/Kaggle/kaggle-api)

Collecting kaggle
  Using cached kaggle-1.5.12-py3-none-any.whl
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.12
    Uninstalling kaggle-1.5.12:
      Successfully uninstalled kaggle-1.5.12
Successfully installed kaggle-1.5.12


In [None]:
os.environ['KAGGLE_CONFIG_DIR']=os.path.join(KAGGLE_JSON_PATH) # we have stored the kaggle.json in .kaggle of my drive
!kaggle --version # Kaggle API 1.5.10

Kaggle API 1.5.12


In [None]:
!pip install quandl
!pip install python-dotenv



In [None]:
from dotenv import load_dotenv, find_dotenv
from pathlib import Path

# Load the dotenv file stored on Drive, then hand QUANDL_API_KEY from the
# environment to the Quandl client (None when the variable is not set).
load_dotenv(QUANDL_PATH)
quandl.ApiConfig.api_key = os.environ.get('QUANDL_API_KEY')

In [None]:
# Download the Kaggle dataset archive (google-stock-price.zip) into the
# current working directory; a later cell moves it onto Drive.
! kaggle datasets download -d medharawat/google-stock-price

Downloading google-stock-price.zip to /content
  0% 0.00/23.4k [00:00<?, ?B/s]
100% 23.4k/23.4k [00:00<00:00, 21.4MB/s]


In [None]:
! mv google-stock-price.zip $ML_DATASET/google-stock-price.zip

In [None]:
from zipfile import ZipFile

# Extract the downloaded archive into its own folder, then delete the zip.
file_path = GOOGLE_STOCK_DATA_ZIP
directory = os.path.join(ML_DATASET, 'google_stock_price_dataset')

os.makedirs(directory, exist_ok=True)

# Bind the archive to a name other than `zip`: at module level `as zip` would
# rebind the built-in zip() for every later cell in the notebook.
with ZipFile(file_path, 'r') as archive:
  archive.extractall(directory)

# The archive is no longer needed once its contents are on Drive.
os.remove(file_path)

In [None]:
# Build the path from ML_DATASET instead of hard-coding it, so the cell keeps
# working when STORAGE is changed in the constants form.
df = pd.read_csv(os.path.join(ML_DATASET, 'google_stock_price_dataset', 'Google_Stock_Price_Train.csv'))
df

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,1/3/2012,325.25,332.83,324.97,663.59,7380500
1,1/4/2012,331.27,333.87,329.08,666.45,5749400
2,1/5/2012,329.83,330.75,326.89,657.21,6590300
3,1/6/2012,328.34,328.77,323.68,648.24,5405900
4,1/9/2012,322.04,322.29,309.46,620.76,11688800
...,...,...,...,...,...,...
1253,12/23/2016,790.90,792.74,787.28,789.91,623400
1254,12/27/2016,790.68,797.86,787.66,791.55,789100
1255,12/28/2016,793.70,794.23,783.20,785.05,1153800
1256,12/29/2016,783.33,785.93,778.92,782.79,744300


In [142]:
from datetime import date

class Quadl_Data_setup:
  """Download NSE price history from Quandl and cache a train/test split as CSV.

  Each scrip is stored under `<dir>/<scrip>_<start_date>_<end_date>/` as
  `dataset_train.csv` and `dataset_test.csv`.
  """

  def __init__(self, dir, start_date, end_date):
    """Remember the output folder and the date range to request.

    Args:
      dir: folder under which one sub-folder per scrip is created.
      start_date: start of the range passed to `quandl.get`.
      end_date: end of the range passed to `quandl.get`.
    """
    self.__dir = dir
    self.start_date = start_date
    self.end_date = end_date
    # Columns kept from the Quandl response; 'Total Trade Quantity' is saved as 'Volume'.
    self.features = ['Open', 'High', 'Low', 'Close', 'Total Trade Quantity']

  def setup_dataset(self, scrip):
    """Create (or reuse) the train/test CSVs for one scrip and print both splits.

    Args:
      scrip: NSE symbol; requested from Quandl as `NSE/<scrip>`.

    Returns:
      None. The splits are written to disk and printed.
    """
    save_dir = os.path.join(self.__dir, f'{scrip}_{self.start_date}_{self.end_date}')
    save_file_train = os.path.join(save_dir, 'dataset_train.csv')
    save_file_test = os.path.join(save_dir, 'dataset_test.csv')

    # Only reuse the cache when both CSVs are present; a bare folder can be
    # left behind by an interrupted run and must not count as "done".
    if os.path.exists(save_file_train) and os.path.exists(save_file_test):
      print(f'\n{scrip} already done\n')
      train_dataset = pd.read_csv(save_file_train)
      test_dataset = pd.read_csv(save_file_test)
      print(f'Train_dataset\n{train_dataset}')
      print(f'Test_dataset\n{test_dataset}')
      return

    # Fetch before touching the filesystem so a failed request does not leave
    # an empty cache folder behind.
    stock_data = quandl.get(f'NSE/{scrip}', start_date=self.start_date, end_date=self.end_date)

    # Use the instance's feature list (a bare `features` is not defined on a
    # fresh kernel) and turn the date index into an ordinary first column.
    dataset = stock_data[self.features].reset_index(drop=True)
    dataset.insert(0, 'Date', stock_data.index)
    dataset = dataset.rename(columns={'Total Trade Quantity': 'Volume'})

    print(f'\n{scrip} done\n')

    # Random row-wise split: SPLIT of the rows go to train, the rest to test.
    train_dataset = dataset.sample(frac=SPLIT, random_state=SEED)
    test_dataset = dataset.drop(train_dataset.index)

    os.makedirs(save_dir, exist_ok=True)
    train_dataset.to_csv(save_file_train, encoding='utf-8', index=False)
    test_dataset.to_csv(save_file_test, encoding='utf-8', index=False)

    print(f'Train_dataset\n{train_dataset}')
    print(f'Test_dataset\n{test_dataset}')


## WORKING WITH NSE DATA

In [146]:
import tqdm.notebook as tq

# NSE symbols to download.
scrips = ['TCS', 'TATASTEEL']
# START_DATES[i] is paired with END_DATES[i]; keep both lists the same length.
START_DATES = [date(2018, 12, 1)]
END_DATES = [date(2018, 12, 31)]


# enumerate() has no len(), so pass total explicitly; otherwise the outer
# progress bar cannot show how many date ranges there are.
for index, start_date in tq.tqdm(enumerate(START_DATES), total=len(START_DATES)):

  quadl_data_setup = Quadl_Data_setup(ML_DATASET, start_date, END_DATES[index])

  for scrip in tq.tqdm(scrips):
    quadl_data_setup.setup_dataset(scrip)

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]


TCS already done

Train_dataset
          Date    Open    High      Low    Close     Volume
0   2018-12-17  1999.0  2004.9  1985.00  1994.30  1227921.0
1   2018-12-21  1948.0  1950.0  1886.55  1895.80  3729956.0
2   2018-12-18  1991.9  2002.0  1976.40  1987.85  1768742.0
3   2018-12-05  2006.0  2018.0  1985.00  2006.75  2501539.0
4   2018-12-20  1953.8  1974.9  1946.00  1954.05  1940277.0
5   2018-12-04  1983.0  2019.4  1971.00  2010.85  3270615.0
6   2018-12-03  1984.0  1990.0  1968.30  1982.40  1610576.0
7   2018-12-24  1905.8  1938.9  1905.00  1918.50  1864116.0
8   2018-12-06  1998.0  2017.0  1979.60  1992.70  2321216.0
9   2018-12-26  1921.8  1921.8  1870.25  1889.20  2446614.0
10  2018-12-27  1909.0  1941.7  1872.10  1908.95  4968201.0
11  2018-12-11  1970.0  2010.0  1961.00  2000.00  2942014.0
12  2018-12-12  2001.1  2022.0  1984.95  2016.80  2219993.0
13  2018-12-10  1975.0  2011.0  1960.00  1975.80  2010786.0
14  2018-12-28  1915.0  1920.0  1893.00  1896.05  2239130.0
15  201