In [114]:
!pip install awswrangler

[0m

In [115]:
!pip install nsepython

[0m

In [116]:
import boto3
import awswrangler as wr
import warnings
warnings.filterwarnings("ignore")

In [117]:
import nsepython as nse

In [118]:
#load the AWS credentials and keep it ready

import configparser

# Parse the private config file. The context manager closes the file handle
# as soon as parsing is done instead of leaving it open for the kernel's lifetime.
reader = configparser.ConfigParser()
with open('/kaggle/input/private-set/calter.config') as config_file:
    reader.read_file(config_file)

# Region and key pair are read from the [AWS] section of the config file.
aws_reg = reader["AWS"]["REGION"]
aws_key = reader["AWS"]["KEY"]
aws_sec = reader["AWS"]["SECRET"]

# Session object handed to awswrangler calls through boto3_session=...
my_session = boto3.Session(aws_access_key_id=aws_key,aws_secret_access_key=aws_sec,
                          region_name=aws_reg)

# Plain boto3 S3 client used for the "Boto way" examples.
s3_client = boto3.client('s3',region_name=aws_reg,aws_access_key_id=aws_key,
                        aws_secret_access_key=aws_sec)

Data can reach a Data Engineer or an Analyst from many different sources. How do we pull the data from these sources reliably?

1 - List the sources and designate a staging area.

2 - Think of the source Type and the connection method.

3 - Get the necessary credentials and register them for use.

4 - Code the functions that pull the data into the staging.

5 - Rinse and Repeat

### Let's Designate a Staging Area:

Stage @ AWS ==> s3://pipe-line-source

Place the string path in the variable

In [22]:
stage_path = "pipe-line-source"

In [25]:
# There are two ways to list the contents of the bucket

#Boto way: Note the bucket name is not having the prefix s3://
try:
    s3_client.list_objects_v2(Bucket=stage_path)["Contents"]
except KeyError:
    # The response carries no "Contents" entry when the bucket holds no objects.
    # Only that case is reported as "empty"; credential, permission or network
    # errors now surface instead of being mislabelled.
    print("The bucket must be empty.")

The bucket must be empty.


In [28]:
# listing the buckets

wr.s3.list_buckets(boto3_session=my_session)

['athena-query-res-jan',
 'aws-athena-query-results-642924624251-us-east-1',
 'aws-glue-assets-642924624251-us-east-1',
 'boto-bucket-16',
 'de-yt-starterdata-bkp',
 'pipe-line-source',
 'tab-mcq-de']

In [27]:
#AWSWrangler way

wr.s3.list_objects(path=f's3://{stage_path}', boto3_session=my_session)

#AWSWrangler is quiet when it comes to empty buckets. 

[]

List of Sources : 

In this notebook we will consider the following sources

1) CSV or Json file from Kaggle dataset 
    
    Use Kaggle notebook to load the dataset and then upload to S3
    
2) File of any type in the local file system:

    The process will be same as the above steps used in Kaggle Notebook. 

3) Data needs to be pulled from a web API 

4) Tables from Local or Remote Relational Database needs to be dumped 

5) Tables inside Data Warehouses

### Loading files from Kaggle

a) Kaggle Notebook

In [29]:
%%sh
cd /kaggle/input/bosch-production-line-performance
ls

sample_submission.csv.zip
test_categorical.csv.zip
test_date.csv.zip
test_numeric.csv.zip
train_categorical.csv.zip
train_date.csv.zip
train_numeric.csv.zip


In [32]:
# Get the file paths that needs to be uploaded

import os, json, glob
# Every zipped CSV shipped with the Bosch dataset, as full paths.
bosch_zip_pattern = "/kaggle/input/bosch-production-line-performance/*.zip"
file_path = glob.glob(bosch_zip_pattern, recursive=True)
file_path

['/kaggle/input/bosch-production-line-performance/train_date.csv.zip',
 '/kaggle/input/bosch-production-line-performance/sample_submission.csv.zip',
 '/kaggle/input/bosch-production-line-performance/train_numeric.csv.zip',
 '/kaggle/input/bosch-production-line-performance/test_date.csv.zip',
 '/kaggle/input/bosch-production-line-performance/test_categorical.csv.zip',
 '/kaggle/input/bosch-production-line-performance/test_numeric.csv.zip',
 '/kaggle/input/bosch-production-line-performance/train_categorical.csv.zip']

In [36]:
#Pumping the files into S3
for fqp in file_path:
    # The object key is the bare file name. basename() is used instead of a
    # fixed split('/') index so the key stays correct if the directory depth changes.
    key = os.path.basename(fqp)
    print(key)
    wr.s3.upload(local_file=fqp, path=f's3://{stage_path}/{key}',
                 boto3_session=my_session)

train_date.csv.zip


Exception ignored in: <function _S3ObjectBase.__del__ at 0x7f53e2afd200>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/awswrangler/s3/_fs.py", line 243, in __del__
    self.close()
  File "/opt/conda/lib/python3.7/site-packages/awswrangler/s3/_fs.py", line 483, in close
    function_name="put_object", s3_additional_kwargs=self._s3_additional_kwargs
  File "/opt/conda/lib/python3.7/site-packages/awswrangler/_utils.py", line 348, in try_it
    return f(**kwargs)
  File "/opt/conda/lib/python3.7/site-packages/botocore/client.py", line 530, in _api_call
    return self._make_api_call(operation_name, kwargs)
  File "/opt/conda/lib/python3.7/site-packages/botocore/client.py", line 944, in _make_api_call
    operation_model, request_dict, request_context
  File "/opt/conda/lib/python3.7/site-packages/botocore/client.py", line 966, in _make_request
    return self._endpoint.make_request(operation_model, request_dict)
  File "/opt/conda/lib/python3.7/site-pac

sample_submission.csv.zip
train_numeric.csv.zip
test_date.csv.zip
test_categorical.csv.zip
test_numeric.csv.zip
train_categorical.csv.zip


In [40]:
#Validate the files are inside the bucket
wr.s3.list_objects(path=f's3://{stage_path}',boto3_session=my_session)

['s3://pipe-line-source/bosch-production-line-performance',
 's3://pipe-line-source/sample_submission.csv.zip',
 's3://pipe-line-source/test_categorical.csv.zip',
 's3://pipe-line-source/test_date.csv.zip',
 's3://pipe-line-source/test_numeric.csv.zip',
 's3://pipe-line-source/train_categorical.csv.zip',
 's3://pipe-line-source/train_date.csv.zip',
 's3://pipe-line-source/train_numeric.csv.zip']

In [41]:
#Deleting the files in the bucket
wr.s3.delete_objects(path=f's3://{stage_path}',boto3_session=my_session)

In [42]:
#validate deletion
wr.s3.list_objects(path=f's3://{stage_path}',boto3_session=my_session)

[]

### Loading files from Local Filesystem

local_path = 'your/local/file/system/path'

Only change is the local_path name. Rest all remains same.

We will see how to put_objects using the Boto3 way...

In [49]:
#uploading the objects using boto3 client. 

#Ensure each file to be uploaded is opened in binary mode and the open handle is passed as Body.
#Otherwise only the file name would be written as the object's content.

for fqp in file_path:
    # The object key is the bare file name. basename() is used instead of a
    # fixed split('/') index so the key stays correct if the directory depth changes.
    key = os.path.basename(fqp)
    print(key)
    # Stream the file's bytes as the object body; the handle is closed on exit.
    with open(fqp, mode='rb') as temp_bytes:
        s3_client.put_object(Body=temp_bytes, Bucket=stage_path,
                             Key=key)

train_date.csv.zip
sample_submission.csv.zip
train_numeric.csv.zip
test_date.csv.zip
test_categorical.csv.zip
test_numeric.csv.zip
train_categorical.csv.zip


In [53]:
#Validate the buckets the boto3 way and get their keys
#Prepare the Delete Key Objects
# "Contents" is missing from the response when the bucket has no objects,
# so fall back to an empty list instead of raising KeyError.
bucket_listing = s3_client.list_objects_v2(Bucket=stage_path)
file_keys = [entry['Key'] for entry in bucket_listing.get("Contents", [])]

In [54]:
#Validate the file_keys
file_keys

['sample_submission.csv.zip',
 'test_categorical.csv.zip',
 'test_date.csv.zip',
 'test_numeric.csv.zip',
 'train_categorical.csv.zip',
 'train_date.csv.zip',
 'train_numeric.csv.zip']

In [55]:
#Assemble the objects necessary for sending the deleting request
# One {"Key": ...} mapping per object key, in the same order as file_keys.
file_objects = [{"Key": object_key} for object_key in file_keys]
file_objects

[{'Key': 'sample_submission.csv.zip'},
 {'Key': 'test_categorical.csv.zip'},
 {'Key': 'test_date.csv.zip'},
 {'Key': 'test_numeric.csv.zip'},
 {'Key': 'train_categorical.csv.zip'},
 {'Key': 'train_date.csv.zip'},
 {'Key': 'train_numeric.csv.zip'}]

In [56]:
#Delete the contents of the bucket the Boto3 way.
#file_objects is the list of {"Key": ...} dicts assembled in the previous cell.
#NOTE(review): 'Quiet': True presumably limits the response to failed deletions - confirm against the S3 DeleteObjects docs.
s3_client.delete_objects(Bucket=stage_path,
                         Delete={'Objects':file_objects,
                                 'Quiet': True})

{'ResponseMetadata': {'RequestId': 'ZTCFDN82W051QTD8',
  'HostId': '/kc9KZzKFMVQh8xaxNBLQfg4eNCee7jSqGj1VgU05eRYkkLqopDgO9Bat3ITP6gc+n5pVKzBpGc=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '/kc9KZzKFMVQh8xaxNBLQfg4eNCee7jSqGj1VgU05eRYkkLqopDgO9Bat3ITP6gc+n5pVKzBpGc=',
   'x-amz-request-id': 'ZTCFDN82W051QTD8',
   'date': 'Sat, 28 Jan 2023 04:20:29 GMT',
   'content-type': 'application/xml',
   'transfer-encoding': 'chunked',
   'server': 'AmazonS3',
   'connection': 'close'},
  'RetryAttempts': 0}}

### Data that needs to be pulled from API. 

Some APIs are exposed directly as web endpoints, while others are wrapped by Python libraries that call those endpoints for you.

We will see both of them in action in this section.

In [61]:
all_fno_company = nse.fnolist()

In [64]:
bulk_deals = nse.get_bulkdeals()

In [71]:
index_list = nse.nse_get_index_list()

In [72]:
nse.index_info(index_list[0])

{'key': 'BROAD MARKET INDICES',
 'index': 'NIFTY 50',
 'indexSymbol': 'NIFTY 50',
 'last': 17604.35,
 'variation': -287.6,
 'percentChange': -1.61,
 'open': 17877.2,
 'high': 17884.75,
 'low': 17493.55,
 'previousClose': 17891.95,
 'yearHigh': 18887.6,
 'yearLow': 15183.4,
 'pe': '20.7',
 'pb': '4.13',
 'dy': '1.4',
 'declines': '37',
 'advances': '12',
 'unchanged': '1',
 'perChange365d': 2.89,
 'date365dAgo': '27-Jan-2022',
 'chart365dPath': 'https://static.nseindia.com/sparklines/365d/NIFTY-50.jpg',
 'date30dAgo': '28-Dec-2022',
 'perChange30d': -2.86,
 'chart30dPath': 'https://static.nseindia.com/sparklines/30d/NIFTY-50.jpg',
 'chartTodayPath': 'https://static.nseindia.com/sparklines/today/NIFTY-50.jpg',
 'previousDay': 17891.95,
 'oneWeekAgo': 18027.65,
 'oneMonthAgo': 18122.5,
 'oneYearAgo': 17110.15}

In [75]:
holidays = nse.nse_holidays()

In [86]:
company_info = nse.nse_eq("DIXON")

In [87]:
company_info.keys()

dict_keys(['info', 'metadata', 'securityInfo', 'priceInfo', 'industryInfo', 'preOpenMarket'])

In [82]:
equity_data = nse.equity_history("DIXON","EQ","01-12-2022","25-01-2023")

In [83]:
equity_data.head()

Unnamed: 0,_id,CH_SYMBOL,CH_SERIES,CH_MARKET_TYPE,CH_TRADE_HIGH_PRICE,CH_TRADE_LOW_PRICE,CH_OPENING_PRICE,CH_CLOSING_PRICE,CH_LAST_TRADED_PRICE,CH_PREVIOUS_CLS_PRICE,...,CH_52WEEK_LOW_PRICE,CH_TOTAL_TRADES,CH_ISIN,CH_TIMESTAMP,TIMESTAMP,createdAt,updatedAt,__v,VWAP,mTIMESTAMP
0,63bd53649bf4ee0006756813,DIXON,EQ,N,3808.0,3705.5,3792.9,3731.15,3741.0,3774.05,...,3180.55,16225,INE935N01020,2023-01-10,2023-01-09T18:30:00.000Z,2023-01-10T12:00:36.383Z,2023-01-10T12:00:36.383Z,0,3735.67,10-Jan-2023
1,63bea4ff34c2300007bbdca4,DIXON,EQ,N,3758.5,3697.6,3748.0,3705.3,3708.9,3731.15,...,3180.55,10109,INE935N01020,2023-01-11,2023-01-10T18:30:00.000Z,2023-01-11T12:01:03.890Z,2023-01-11T12:01:03.890Z,0,3722.84,11-Jan-2023
2,63bff67f69f05800079718a7,DIXON,EQ,N,3722.65,3634.35,3720.0,3651.05,3659.85,3705.3,...,3180.55,18232,INE935N01020,2023-01-12,2023-01-11T18:30:00.000Z,2023-01-12T12:01:03.756Z,2023-01-12T12:01:03.756Z,0,3661.69,12-Jan-2023
3,63c147e479c141000743bf53,DIXON,EQ,N,3671.0,3603.05,3659.95,3626.55,3635.0,3651.05,...,3180.55,14700,INE935N01020,2023-01-13,2023-01-12T18:30:00.000Z,2023-01-13T12:00:36.442Z,2023-01-13T12:00:36.442Z,0,3621.5,13-Jan-2023
4,63c53c7f2791db000734235b,DIXON,EQ,N,3658.0,3541.0,3644.7,3547.65,3547.1,3626.55,...,3180.55,22164,INE935N01020,2023-01-16,2023-01-15T18:30:00.000Z,2023-01-16T12:01:03.962Z,2023-01-16T12:01:03.962Z,0,3576.05,16-Jan-2023


In [85]:
equity_data.columns

Index(['_id', 'CH_SYMBOL', 'CH_SERIES', 'CH_MARKET_TYPE',
       'CH_TRADE_HIGH_PRICE', 'CH_TRADE_LOW_PRICE', 'CH_OPENING_PRICE',
       'CH_CLOSING_PRICE', 'CH_LAST_TRADED_PRICE', 'CH_PREVIOUS_CLS_PRICE',
       'CH_TOT_TRADED_QTY', 'CH_TOT_TRADED_VAL', 'CH_52WEEK_HIGH_PRICE',
       'CH_52WEEK_LOW_PRICE', 'CH_TOTAL_TRADES', 'CH_ISIN', 'CH_TIMESTAMP',
       'TIMESTAMP', 'createdAt', 'updatedAt', '__v', 'VWAP', 'mTIMESTAMP'],
      dtype='object')

Based on exploring the nse library, the equity history provides a great deal of information for each equity. The nse_get_advances_declines method provides the list of equities that have advanced and declined recently.

Idea 1: Take the list of Symbols in Advances & Declines, get their history for past 3 months, and store the data as parquets in Staging Area

Idea 2: NSE also provides much more detailed information about each equity. These details can be collected as separate JSON files and stored in the Staging Area

Idea 3: Library provides access to the fnolist and the method option scraper provides the Option chain for the options, which can also be scraped...  

In [88]:
nse.nse_fiidii()

Unnamed: 0,category,date,buyValue,sellValue,netValue
0,DII **,27-Jan-2023,12373.41,8121.08,4252.33
1,FII/FPI *,27-Jan-2023,12414.78,18392.64,-5977.86


In [94]:
fnoList = nse.fnolist()

In [95]:
nse.nse_optionchain_scrapper(fnoList[0])

{'records': {'expiryDates': ['02-Feb-2023',
   '09-Feb-2023',
   '16-Feb-2023',
   '23-Feb-2023',
   '02-Mar-2023',
   '29-Mar-2023',
   '27-Apr-2023',
   '29-Jun-2023',
   '28-Sep-2023',
   '28-Dec-2023',
   '27-Jun-2024',
   '26-Dec-2024',
   '26-Jun-2025',
   '24-Dec-2025',
   '25-Jun-2026',
   '31-Dec-2026',
   '24-Jun-2027',
   '30-Dec-2027'],
  'data': [{'strikePrice': 8500,
    'expiryDate': '29-Jun-2023',
    'CE': {'strikePrice': 8500,
     'expiryDate': '29-Jun-2023',
     'underlying': 'NIFTY',
     'identifier': 'OPTIDXNIFTY29-06-2023CE8500.00',
     'openInterest': 319.5,
     'changeinOpenInterest': 0,
     'pchangeinOpenInterest': 0,
     'totalTradedVolume': 0,
     'impliedVolatility': 0,
     'lastPrice': 1775,
     'change': 0,
     'pChange': 0,
     'totalBuyQuantity': 0,
     'totalSellQuantity': 50,
     'bidQty': 0,
     'bidprice': 0,
     'askQty': 50,
     'askPrice': 10407,
     'underlyingValue': 17604.35}},
   {'strikePrice': 9500,
    'expiryDate': '29-Ju

### I will be coding only the 1st Idea here

1) Get the list of companies that have advanced and declined

2) The equity history is required for the last 3 months, that is startDate = '01-11-2022' to endDate = '27-01-2023'

3) Store the pandas dataframe as parquets using the awswrangler, into the s3 bucket

In [96]:
advances_declines = nse.nse_get_advances_declines()

In [101]:
companies_adv_declines = list(advances_declines["symbol"])

### Developing the Function spec

1) Parameters are : symbol, start_date, end_date, s3_bucket_fql, boto_session

2) Methods used : nse.equity_history(), awswrangler.s3.to_parquet()

In [119]:
def store_stock_data(symbol:str,start_date:str,
                     end_date:str, s3_bucket_fql:str,
                    boto_session):
    """
    Fetch the equity ("EQ" series) history of one symbol and write it to S3
    as a parquet dataset partitioned by CH_SYMBOL.

    Parameters
    ----------
    symbol : str
        Symbol whose history is fetched; also used as the parquet file name prefix.
    start_date, end_date : str
        Date range, written like "01-12-2022" (day-month-year).
    s3_bucket_fql : str
        Fully qualified destination including the s3:// header,
        e.g. 's3://your_bucket/equity_parquets/'.
    boto_session
        boto3 session passed through to awswrangler for the write.

    Returns
    -------
    None
    """
    equity_dataframe = nse.equity_history(symbol,"EQ",start_date,end_date)

    # Write the frame fetched just above. Using a notebook-level variable here
    # would store the same, unrelated data under every symbol's file prefix.
    wr.s3.to_parquet(df=equity_dataframe,
                     path=s3_bucket_fql,
                     boto3_session=boto_session,
                     filename_prefix=symbol,
                     dataset=True,
                     partition_cols=['CH_SYMBOL'])

In [124]:
s3_bucket_fql = 's3://pipe-line-source/equity_parquets/'
begin_date = '01-11-2022'
to_date = '27-01-2023'

In [126]:
# Stage the history of the first five advancing/declining symbols.
for sym in companies_adv_declines[:5]:
    store_stock_data(
        symbol=sym,
        start_date=begin_date,
        end_date=to_date,
        s3_bucket_fql=s3_bucket_fql,
        boto_session=my_session,
    )