In [2]:
!pip install awswrangler

Collecting awswrangler
  Downloading awswrangler-2.19.0-py3-none-any.whl (267 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m267.1/267.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting requests-aws4auth<2.0.0,>=1.1.1
  Downloading requests_aws4auth-1.2.1-py2.py3-none-any.whl (24 kB)
Collecting pymysql<2.0.0,>=1.0.0
  Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting pg8000<2.0.0,>=1.20.0
  Downloading pg8000-1.29.4-py3-none-any.whl (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.4/51.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting redshift-connector<2.1.0,>=2.0.889
  Downloading redshift_connector-2.0.910-py3-none-any.whl (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.1/112.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting 

In [None]:
!pip install nsepython

In [20]:
import boto3
import awswrangler as wr
import warnings
warnings.filterwarnings("ignore")

In [None]:
import nsepython as nse

In [14]:
# Load the AWS credentials from the private config file and keep them ready.

import configparser

reader = configparser.ConfigParser()
# Open the config inside a context manager so the file handle is closed
# as soon as it has been parsed.
with open('/kaggle/input/private-set/calter.config') as config_file:
    reader.read_file(config_file)


aws_reg = reader["AWS"]["REGION"]
aws_key = reader["AWS"]["KEY"]
aws_sec = reader["AWS"]["SECRET"]

# Session handed to the awswrangler calls (boto3_session=...).
my_session = boto3.Session(aws_access_key_id=aws_key,aws_secret_access_key=aws_sec,
                          region_name=aws_reg)

# Plain boto3 S3 client used for the "Boto way" examples.
s3_client = boto3.client('s3',region_name=aws_reg,aws_access_key_id=aws_key,
                        aws_secret_access_key=aws_sec)

Data can reach a Data Engineer or an Analyst from many different sources. How do we pull the data from these sources reliably? 

1 - List the sources and designate a staging area. 

2 - Think of the source Type and the connection method.

3 - Get the necessary credentials and register them for use.

4 - Code the functions that pull the data into the staging.

5 - Rinse and Repeat

### Let's Designate a Staging Area:

Stage @ AWS ==> s3://pipe-line-source

Place the string path in the variable

In [5]:
stage_path = "pipe-line-source"

In [6]:
# There are two ways to list the contents of the bucket

# Boto way: note the bucket name does not carry the s3:// prefix.
# The list_objects_v2 response has no "Contents" key when the bucket holds no
# objects, so default to an empty list. Any other failure (bad credentials,
# missing bucket, ...) is left to surface instead of being reported as
# "empty".
bucket_objects = s3_client.list_objects_v2(Bucket=stage_path).get("Contents", [])
if not bucket_objects:
    print("The bucket must be empty.")
bucket_objects

In [7]:
# listing the buckets

wr.s3.list_buckets(boto3_session=my_session)

['athena-query-res-jan',
 'aws-athena-query-results-642924624251-us-east-1',
 'aws-glue-assets-642924624251-us-east-1',
 'boto-bucket-16',
 'de-yt-starterdata-bkp',
 'pipe-line-source',
 'tab-mcq-de']

In [None]:
# AWSWrangler way: here the path does carry the s3:// prefix.

wr.s3.list_objects(path=f's3://{stage_path}', boto3_session=my_session)

# AWSWrangler stays quiet when the bucket is empty.

List of Sources : 

In this notebook we will consider the following sources

1) CSV or Json file from Kaggle dataset 
    
    Use Kaggle notebook to load the dataset and then upload to S3
    
2) File of any type in the local file system:

    The process will be same as the above steps used in Kaggle Notebook. 

3) Data needs to be pulled from a web API 

4) Tables from Local or Remote Relational Database needs to be dumped 

5) Tables inside Data Warehouses

### Loading files from Kaggle

a) Kaggle Notebook

In [None]:
%%sh
# Change into the Kaggle dataset directory and list the files it ships with.
cd /kaggle/input/bosch-production-line-performance
ls

In [8]:
# Collect the paths of the zipped dataset files that will be uploaded.

import os, json, glob

file_path = glob.glob(
    pathname="/kaggle/input/bosch-production-line-performance/*.zip",
    recursive=True,
)
file_path

['/kaggle/input/bosch-production-line-performance/train_date.csv.zip',
 '/kaggle/input/bosch-production-line-performance/sample_submission.csv.zip',
 '/kaggle/input/bosch-production-line-performance/train_numeric.csv.zip',
 '/kaggle/input/bosch-production-line-performance/test_date.csv.zip',
 '/kaggle/input/bosch-production-line-performance/test_categorical.csv.zip',
 '/kaggle/input/bosch-production-line-performance/test_numeric.csv.zip',
 '/kaggle/input/bosch-production-line-performance/train_categorical.csv.zip']

In [None]:
# Pump the files into S3, one object per local file.
for fqp in file_path:
    # Use the file's base name as the object key. Picking a fixed path
    # component (split('/')[4]) only works for one directory depth and breaks
    # as soon as the files live somewhere else.
    key = os.path.basename(fqp)
    print(key)
    wr.s3.upload(local_file=fqp,path=f's3://{stage_path}/{key}',
                     boto3_session=my_session)

In [None]:
#Validate the files are inside the bucket
wr.s3.list_objects(path=f's3://{stage_path}',boto3_session=my_session)

In [None]:
#Deleting the files in the bucket
wr.s3.delete_objects(path=f's3://{stage_path}',boto3_session=my_session)

In [None]:
#validate deletion
wr.s3.list_objects(path=f's3://{stage_path}',boto3_session=my_session)

### Loading files from Local Filesystem

local_path = 'your/local/file/system/path'

The only change is the `local_path` name; everything else remains the same.

We will see how to put_objects using the Boto3 way...

In [None]:
# Upload the objects using the boto3 client.

# Each file is opened in binary mode and the open handle is passed as Body;
# handing over just the path would not upload the file's contents.

for fqp in file_path:
    # Base name of the file becomes the object key, whatever directory depth
    # the file lives at (a fixed split('/')[4] only fits one layout).
    key = os.path.basename(fqp)
    print(key)
    with open(fqp,mode='rb') as temp_bytes:
        s3_client.put_object(Body=temp_bytes,Bucket=stage_path,
                            Key=key)

In [None]:
# Validate the bucket contents the boto3 way and collect the object keys;
# they are needed to build the delete request.
# "Contents" is missing from the response when the bucket is empty, so fall
# back to an empty list rather than raising KeyError.
file_keys = [
    obj['Key']
    for obj in s3_client.list_objects_v2(Bucket=stage_path).get("Contents", [])
]

In [None]:
#Validate the file_keys
file_keys

In [None]:
# Build the list of {"Key": ...} entries that the delete request expects.
file_objects = [{"Key": object_key} for object_key in file_keys]
file_objects

In [None]:
# Delete the contents of the bucket the Boto3 way.
# A delete request with an empty 'Objects' list is rejected by S3, so only
# send it when there is actually something to delete.
if file_objects:
    s3_client.delete_objects(Bucket=stage_path,
                             Delete={'Objects':file_objects,
                                     'Quiet': True})

### Data that needs to be pulled from API. 

Some APIs expose web endpoints directly, while others are wrapped in Python libraries that call those endpoints for you. 

We will see both kinds in action in this section.

In [None]:
all_fno_company = nse.fnolist()

In [None]:
bulk_deals = nse.get_bulkdeals()

In [None]:
index_list = nse.nse_get_index_list()

In [None]:
nse.index_info(index_list[0])

In [None]:
holidays = nse.nse_holidays()

In [None]:
company_info = nse.nse_eq("DIXON")

In [None]:
company_info.keys()

In [None]:
equity_data = nse.equity_history("DIXON","EQ","01-12-2022","25-01-2023")

In [None]:
equity_data.head()

In [None]:
equity_data.columns

Based on exploring the nse library, the equity history provides a whole lot of information for each equity. The get_advances_declines method provides the list of equities that have advanced and declined recently.

Idea 1: Take the list of Symbols in Advances & Declines, get their history for past 3 months, and store the data as parquets in Staging Area

Idea 2: NSE also provides much more detailed information about each equity. These details can be collected as separate JSON files and stored in the Staging Area

Idea 3: Library provides access to the fnolist and the method option scraper provides the Option chain for the options, which can also be scraped...  

In [None]:
nse.nse_fiidii()

In [None]:
fnoList = nse.fnolist()

In [None]:
nse.nse_optionchain_scrapper(fnoList[0])

### I will be coding only the 1st Idea here

1) Get the list of companies that have advanced and declined

2) The equity history is required for the last 3 months, that is startDate = '01-11-2022' to endDate = '27-01-2023'

3) Store the pandas dataframe as parquets using the awswrangler, into the s3 bucket

In [None]:
advances_declines = nse.nse_get_advances_declines()

In [None]:
companies_adv_declines = list(advances_declines["symbol"])

### Developing the Function spec

1) Parameters are : Symbol, Start_date, End_date, s3_bucket_fql

2) Methods used : nse.equity_history(), awswrangler.s3.to_parquet()

In [None]:
def store_stock_data(symbol:str,start_date:str,
                     end_date:str, s3_bucket_fql:str,
                    boto_session):
    """
    Fetch the equity ("EQ" series) history of one symbol and write it to S3
    as a parquet dataset.

    Parameters
    ----------
    symbol : NSE symbol whose history is fetched.
    start_date, end_date : date range as strings in DD-MM-YYYY form,
        e.g. "01-12-2022".
    s3_bucket_fql : fully qualified target including the s3:// header,
        e.g. 's3://your_bucket/equity_parquets/'.
    boto_session : boto3 session forwarded to awswrangler.

    Returns None. The first two rows of the fetched frame are printed as a
    quick sanity check.
    """
    equity_dataframe = nse.equity_history(symbol,"EQ",start_date,end_date)

    print(equity_dataframe.head(2))

    # Write the frame fetched for THIS symbol. The previous code passed the
    # notebook-level `equity_data` variable, so every call uploaded the same
    # stale frame (or failed if that variable did not exist).
    wr.s3.to_parquet(df=equity_dataframe,path=s3_bucket_fql,
                     boto3_session=boto_session,
                dataset=True)

In [55]:
s3_bucket_fql = 's3://pipe-line-source/equity_parquets'
begin_date = '01-11-2022'
to_date = '27-01-2023'

In [None]:
companies_adv_declines[2:5]

In [None]:
# Store the history for a small sample (positions 2..4) of the symbols.
for sym in companies_adv_declines[2:5]:
    store_stock_data(
        symbol=sym,
        start_date=begin_date,
        end_date=to_date,
        s3_bucket_fql=s3_bucket_fql,
        boto_session=my_session,
    )

### Data that needs to be pulled from weblink. 

Some APIs expose web endpoints and sometimes require an additional authentication string to be sent along with each request. We will be using Python's requests library.

There are many APIs available. I want to take a look at one of the newspaper APIs.
Among them, Y Combinator's Hacker News API (for technology) and newsapi seem to be very helpful in terms of the data they provide. 

https://hackernews.api-docs.io/v0/overview/introduction

https://newsapi.org/docs/get-started

https://developers.kite.trade/apps

I am checking Zerodha dev account now.

In [9]:
import requests
import pandas as pd

In [None]:
def getnews_df(api_url):
    """Issue a GET request to ``api_url`` and return its articles as a DataFrame.

    The response body is decoded as JSON. When its ``status`` field equals
    ``'ok'`` the ``articles`` list is loaded into a pandas DataFrame and
    returned; for any other status the decoded JSON payload itself is
    returned so the caller can inspect it.
    Requires the requests and pandas libraries.
    """
    payload = requests.request("GET", url=api_url).json()
    # Guard clause: hand back the raw payload when the API did not report 'ok'.
    if payload['status'] != 'ok':
        return payload
    return pd.DataFrame(payload['articles'])

In [None]:
ukraine_df = getnews_df(api_url=f'https://newsapi.org/v2/everything?q=Ukraine&from=2022-12-29&to=2023-01-27&sortBy=popularity&apiKey={myAPI}')

In [None]:
def write_news_s3(search_str:str,start_date:str,
                     end_date:str, s3_bucket_fql:str,
                    boto_session, api_key):
    """Search NewsAPI for articles and write them to S3 as a parquet dataset.

    Parameters
    ----------
    search_str : free-text query, sent as the ``q`` parameter.
    start_date, end_date : date range in the 2023-01-27 format.
    s3_bucket_fql : valid S3 target in the format
        s3://your_bucket/news_parquets/
    boto_session : boto3 session forwarded to awswrangler.
    api_key : NewsAPI key used to authenticate the request.

    Returns None. Progress, API errors and S3 write errors are reported
    with print(); an S3 write failure is not re-raised.
    """
    # Pass the query as params so that requests URL-encodes the values
    # (search strings may contain spaces or '&'), and authenticate with the
    # api_key argument rather than a notebook-level variable.
    query = {
        'q': search_str,
        'from': start_date,
        'to': end_date,
        'sortBy': 'popularity',
        'apiKey': api_key,
    }
    #Get raw data
    getRaw = requests.request("GET", url='https://newsapi.org/v2/everything',
                              params=query)
    #Convert data to json
    getJson = getRaw.json()
    #Check if json contains required data
    if getJson['status'] != "ok":
        # Surface the API's answer instead of silently doing nothing.
        print(f'NewsAPI request failed: {getJson}')
        return
    #Inform the user
    print(f'Recieved {getJson["totalResults"]}News articles. Sending to S3')
    #Convert to dataframe
    articleDF = pd.DataFrame(getJson['articles'])
    try:
        #write to S3
        wr.s3.to_parquet(df=articleDF,path=s3_bucket_fql,
                         boto3_session=boto_session,
                         dataset=True)
    except Exception as e:
        print(f'There is an error.{e}')
    else:
        #inform completion only when the write actually succeeded
        print("Sent data to S3")

In [None]:
search_string = 'United States'
start='2022-12-29'
end = '2023-01-15'
s3_endpoint = 's3://pipe-line-source/news_parquets'
# Never keep the NewsAPI key in the notebook itself: read it from the
# NEWSAPI_KEY environment variable (set it before running this cell).
# A key that was previously published in this cell should be treated as
# compromised and revoked.
myAPI = os.environ['NEWSAPI_KEY']

In [None]:
# Fetch the articles for the configured search and push them to the staging bucket.
write_news_s3(
    search_str=search_string,
    start_date=start,
    end_date=end,
    s3_bucket_fql=s3_endpoint,
    boto_session=my_session,
    api_key=myAPI,
)

### Connecting to RDS through Psycopg2 library and then writing to S3

You might be wondering why AWS's own services Athena and RDS don't talk to each other. That is because Athena is a Big Data cluster while an RDS instance is a regular database instance. 

In order to make them talk, connections need to be invoked through Lambda or through a Glue Job, both of which are chargeable. The workaround used here is to query RDS directly with psycopg2 and write the result to S3.

In [12]:
!pip install psycopg2-binary public-ip

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting public-ip
  Downloading public_ip-0.12-py3-none-any.whl (3.9 kB)
Installing collected packages: psycopg2-binary, public-ip
Successfully installed psycopg2-binary-2.9.5 public-ip-0.12
[0m

In [13]:
import psycopg2
import public_ip

In [15]:
# Get the database credentials from the private config file.

reader = configparser.ConfigParser()
# Open the config inside a context manager so the file handle is closed
# as soon as it has been parsed.
with open('/kaggle/input/private-set/calter.config') as config_file:
    reader.read_file(config_file)


databaseHost = reader["POSTGRES"]["PG_HOST"]
db_pass = reader["POSTGRES"]["PG_PASS"]
db_uname = reader["POSTGRES"]["PG_UNAME"]
db_name = reader["POSTGRES"]["PG_DB"]
db_port = reader["POSTGRES"]["PG_PORT"]

In [45]:
### since the RDS is behind the firewall need the ip 
new_ip = public_ip.get()

In [47]:
#Need to set the current Kaggle instance IP to the AWS Security Group
def change_local_ingress(new_ip):
    #Initiate the new ec2 client
    ec2_client = boto3.client('ec2',region_name=aws_reg,aws_access_key_id=aws_key,
                           aws_secret_access_key=aws_sec)
    #assign the security group
    sec_grp = 'sg-060ab746844bf1595'
    #get the ip from the function argument
    newPermissions=[
          {
              'FromPort': 0,
              'IpProtocol': 'tcp',
              'IpRanges': [
                  {
                      'CidrIp': f'{new_ip}/32',
                  },
              ],
              'ToPort': 50153,
          },

      ]
    #Authorized the ingress
    ec2_client.authorize_security_group_ingress(GroupId=sec_grp,IpPermissions=newPermissions)
    

In [48]:
#This will push the authorisation for the ingress 
change_local_ingress(new_ip=new_ip)

In [50]:
#Function to get the dataframe from the required query
def query_database_rds(query):
    """Run ``query`` against the Postgres database and return the rows as a DataFrame.

    A new connection is opened with the notebook-level database settings
    (databaseHost, db_name, db_uname, db_pass, db_port), switched to
    autocommit, used for this one query and then closed again.

    Parameters
    ----------
    query : SQL text passed to cursor.execute().

    Returns
    -------
    pandas.DataFrame built from the fetched rows (columns are positional,
    as returned by fetchall()).

    Raises
    ------
    Whatever psycopg2 raises when connecting or executing. Connection errors
    are no longer printed and swallowed: doing so only led to a confusing
    failure on the unassigned cursor a few lines later.
    """
    #Establish connection
    conn = psycopg2.connect(host=databaseHost,
                            dbname=db_name,user=db_uname,
                            password=db_pass,port=db_port)
    try:
        #provide autocommit option to true
        conn.set_session(autocommit=True)

        #Query the table and get the data into templist; the cursor is
        #closed when the with-block exits
        with conn.cursor() as cur:
            cur.execute(query)
            tempList = cur.fetchall()
    finally:
        #Always release the connection, even when the query fails
        conn.close()

    #feed the templist into pandas.DataFrame and return
    return pd.DataFrame(tempList)

In [53]:
query_yt = """SELECT * FROM yt_csv LIMIT 5"""
result_dataframe = query_database_rds(query=query_yt)

In [57]:
result_dataframe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,sMgbbfn8C_g,18.09.03,Вечерний Ургант. В гостях у Ивана – Данила Поп...,Вечерний Ургант,24,2018-03-07T23:00:26.000Z,"""Телевидение|""""Шоу-бизнес""""|""""Юмор""""|""""Дмитрий...",1268968,78234,3083,2228,https://i.ytimg.com/vi/sMgbbfn8C_g/default.jpg,False,False,False,Впервые в нашей студии популярный блогер и сте...,RU
1,GfmussDBrUQ,18.09.03,МС ХОВАНСКИЙ - ИЛОН МАСК,Юрий Хованский,10,2018-03-08T17:17:27.000Z,"""хованский|""""юрий хованский""""|""""юмор""""|""""russi...",646344,76519,29876,12928,https://i.ytimg.com/vi/GfmussDBrUQ/default.jpg,False,False,False,Тесла и ракета в одном лице - уже тут. Я как И...,RU
2,KVSElmHEHHU,18.09.03,ЗАШКВАРНЫЕ ИСТОРИИ #7: ПЯЗОК СНИМАЛА П*РНО?,КЛИККЛАК,23,2018-03-08T15:29:48.000Z,"""кликклак|""""клик клак""""|""""клик""""|""""клак""""|""""кл...",1384530,130120,2928,5418,https://i.ytimg.com/vi/KVSElmHEHHU/default.jpg,False,False,False,Экономь до 40% с покупок в более 500 магазинах...,RU
3,1hun4qYIOD0,18.09.03,ДОНАТНЫЙ ХАОС,itpedia,28,2018-03-08T12:49:30.000Z,"""шевцов|""""щевцов""""|""""алексей""""|""""itpedia""""|""""а...",772727,86617,2479,11371,https://i.ytimg.com/vi/1hun4qYIOD0/default.jpg,False,False,False,Меняй скины на деньги: http://lis-skins.ru/\n_...,RU
4,Zd4wXe2abaI,18.09.03,Паша Техник – Я РОНЯЮ ТРЕНДЫ (FACE cover) by П...,Пацаны вообще ребята,24,2018-03-08T09:37:10.000Z,"""паша техник|""""паша техник клип""""|""""face я рон...",539238,34878,2544,1455,https://i.ytimg.com/vi/Zd4wXe2abaI/default.jpg,False,False,False,Пародия на новый клип FACE - Я РОНЯЮ ЗАПАД от ...,RU


In [61]:
# Write the query result to the staging bucket as a CSV dataset.
wr.s3.to_csv(
    df=result_dataframe,
    path="s3://pipe-line-source/",
    boto3_session=my_session,
    dataset=True,
)

{'paths': ['s3://pipe-line-source/2334894b5c594b0f86b75fcaf83d379b.csv'],
 'partitions_values': {}}