In [1]:
import boto3
import requests
import zipfile
import os
import pandas as pd
import json
import pymongo
import io

In [3]:
try:
  session = boto3.Session(region_name='ap-south-1',
                          aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID'),
                          aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY'))
  print('Successfully Session Created')
except Exception as e:
  print(e)

Successfully Session Created


In [4]:
try:
  s3 = session.client(service_name='s3',region_name='ap-south-1')
  print('Successfully s3 client created')
except Exception as e:
  print(e)

Successfully s3 client created


In [5]:
for i in s3.list_buckets().get('Buckets'):
    print(i['Name'])

analytical-data-bucket
aws-glue-assets-894034347035-ap-south-1
bucket-athena-result
bucket-for-glue-script
bucket-for-lambda-function
cleaned-json-as-csv
raw-csv-from-jupyter
raw-csv-test123
raw-data-from-webpage
raw-json-from-jupyter
raw-json-test123


In [62]:
try:
    url = 'https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip'

    headers = {'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.55'}

    response = requests.get(url,headers = headers,stream=True)

    content = response.content
    
    print('successfully got the zipfile from url')
    
except Exception as e:
    print(e)

successfully got the zipfile from url


In [63]:
try:
    bucket_name = 'raw-data-from-webpage'
    file_path = 'largefile/file.zip'

    s3.put_object(Bucket=bucket_name, Key=file_path, Body=content)
    print('successfully uploaded the zipfile into s3')
except Exception as e:
    print(e)

successfully uploaded the zipfile into s3


In [64]:
try:
    bucket_name = 'raw-data-from-webpage'
    file_path = 'largefile/file.zip'

    obj = s3.get_object(Bucket = bucket_name, Key = file_path)

    zip_file = io.BytesIO(obj['Body'].read())

    with zipfile.ZipFile(zip_file) as z:
      z.extractall()

    for file in z.namelist():
      s3.upload_file(Filename = file, Bucket = bucket_name, Key = 'unzipped/' + file)
    
    print('successfully extracted and uploaded in the unzipped folder')
    
except Exception as e:
    print(e)

successfully extracted and uploaded in the unzipped folder


In [23]:
bucket_name = 'raw-data-from-webpage'
folder_name = 'unzipped/'

for i in s3.list_objects(Bucket=bucket_name,Prefix=folder_name)['Contents']:
    print(i['Key'])

unzipped/CIK0000000003.json
unzipped/CIK0000000013.json
unzipped/CIK0000000014.json
unzipped/CIK0000000017.json
unzipped/CIK0000000018.json
unzipped/CIK0000000020.json
unzipped/CIK0000000049.json
unzipped/CIK0000000051.json
unzipped/CIK0000000063.json
unzipped/CIK0000001750-submissions-001.json
unzipped/CIK0000001750.json
unzipped/CIK0000001761.json
unzipped/CIK0000001800-submissions-001.json
unzipped/CIK0000001800-submissions-002.json
unzipped/CIK0000001800.json
unzipped/CIK0000001830.json
unzipped/CIK0000001841.json
unzipped/CIK0000001848.json
unzipped/CIK0000001853.json
unzipped/CIK0000001860.json
unzipped/CIK0000001904.json
unzipped/CIK0000001918.json
unzipped/CIK0000001923.json
unzipped/CIK0000001947.json
unzipped/CIK0000001952.json
unzipped/CIK0000001958.json
unzipped/CIK0000001961.json
unzipped/CIK0000001969.json
unzipped/CIK0000001985.json
unzipped/CIK0000001988.json
unzipped/CIK0000002024.json
unzipped/CIK0000002034-submissions-001.json
unzipped/CIK0000002034.json
unzipped/CIK

In [68]:
bucket_name = 'raw-data-from-webpage'
folder_name = 'unzipped/'

result = s3.list_objects(Bucket=bucket_name, Prefix=folder_name)

# get the number of files in the folder
if 'Contents' in result:
    file_count = len(result['Contents'])
else:
    file_count = 0

print(f'N.of files in {folder_name}: {file_count}')

N.of files in unzipped/: 1000


In [69]:
bucket_name = 'raw-data-from-webpage'
file_name = 'unzipped/CIK0000000003.json'

obj = s3.get_object(Bucket = bucket_name, Key = file_name)

data1 = json.load(obj['Body'])

print(json.dumps(data1, indent=1))

{
 "cik": "3",
 "entityType": "other",
 "sic": "0000",
 "sicDescription": "",
 "insiderTransactionForOwnerExists": 0,
 "insiderTransactionForIssuerExists": 0,
 "name": "DEFINED ASSET FUNDS MUNICIPAL INVT TR FD NEW YORK SER 33",
 "tickers": [],
 "exchanges": [],
 "ein": null,
 "description": "",
 "website": "",
 "investorWebsite": "",
 "category": "",
 "fiscalYearEnd": "1231",
 "stateOfIncorporation": "NY",
 "stateOfIncorporationDescription": "NY",
 "addresses": {
  "mailing": {
   "street1": null,
   "street2": null,
   "city": null,
   "stateOrCountry": null,
   "zipCode": null,
   "stateOrCountryDescription": null
  },
  "business": {
   "street1": "ONE LIBERTY PLZ - 21ST FLR",
   "street2": "C/O MERRILL LYNCH PIERCE FENNER & SMITH",
   "city": "NEW YORK",
   "stateOrCountry": "NY",
   "zipCode": "10080",
   "stateOrCountryDescription": "NY"
  }
 },
 "phone": null,
 "flags": "",
 "formerNames": [],
 "filings": {
  "recent": {
   "accessionNumber": [
    "0000934850-98-002871",
    "0

In [70]:
bucket_name = 'raw-data-from-webpage'
file_name = 'unzipped/CIK0000001750-submissions-001.json'

obj = s3.get_object(Bucket = bucket_name, Key = file_name)

data2 = json.load(obj['Body'])

print(json.dumps(data2, indent=1))

{
 "accessionNumber": [
  "0001104659-10-021700",
  "0001104659-10-019368",
  "0001127602-10-010769",
  "0001127602-10-010179",
  "0001104659-10-017022",
  "0000000000-10-015578",
  "0001104659-10-015424",
  "0001104659-10-015212",
  "0001104659-10-014676",
  "0000000000-10-012687",
  "0001127602-10-007705",
  "0001127602-10-007092",
  "0001127602-10-007079",
  "0001127602-10-007052",
  "0001398344-10-000254",
  "0000895421-10-000335",
  "0001104659-10-005830",
  "0001102578-10-000022",
  "0001086364-10-006858",
  "0000070858-10-000042",
  "0001104659-10-001864",
  "0001127602-10-001375",
  "0001127602-10-001064",
  "0001104659-10-000669",
  "0001127602-10-001001",
  "0001127602-09-026314",
  "0001127602-09-026032",
  "0001127602-09-025973",
  "0001104659-09-070783",
  "0001127602-09-025715",
  "0001104659-09-070152",
  "0001127602-09-023995",
  "0001127602-09-023993",
  "0001104659-09-064897",
  "0001127602-09-020484",
  "0001104659-09-058080",
  "0001104659-09-056338",
  "0001104659-

In [6]:
try:
  docdb = session.client(service_name='docdb',region_name='ap-south-1')
  print('Successfully docdb client created')
except Exception as e:
  print(e)

Successfully docdb client created


In [7]:
docdb.describe_db_clusters()

{'DBClusters': [{'AvailabilityZones': ['ap-south-1c',
    'ap-south-1a',
    'ap-south-1b'],
   'BackupRetentionPeriod': 1,
   'DBClusterIdentifier': 'project-cluster1',
   'DBClusterParameterGroup': 'default.docdb4.0',
   'DBSubnetGroup': 'default',
   'Status': 'available',
   'EarliestRestorableTime': datetime.datetime(2023, 1, 30, 6, 10, 9, 925000, tzinfo=tzlocal()),
   'Endpoint': 'project-cluster1.cluster-cpxnrfv073zv.ap-south-1.docdb.amazonaws.com',
   'ReaderEndpoint': 'project-cluster1.cluster-ro-cpxnrfv073zv.ap-south-1.docdb.amazonaws.com',
   'MultiAZ': False,
   'Engine': 'docdb',
   'EngineVersion': '4.0.0',
   'LatestRestorableTime': datetime.datetime(2023, 1, 31, 7, 12, 26, 849000, tzinfo=tzlocal()),
   'Port': 27017,
   'MasterUsername': 'admin9876',
   'PreferredBackupWindow': '16:54-17:24',
   'PreferredMaintenanceWindow': 'fri:08:34-fri:09:04',
   'ReadReplicaIdentifiers': [],
   'DBClusterMembers': [{'DBInstanceIdentifier': 'project-db1',
     'IsClusterWriter': Tru

In [51]:
try:
  client = pymongo.MongoClient('mongodb://admin9876:password9876@project-cluster1.cluster-cpxnrfv073zv.ap-south-1.docdb.amazonaws.com:27017/?ssl=true&tlsAllowInvalidCertificates=true&replicaSet=rs0&readPreference=secondaryPreferred&retryWrites=false') 
  print('successfully mongo client created')
except Exception as e:
  print(e)

successfully mongo client created


In [52]:
try:
    db = client["sec_database"]
    print('successfully database created')
except Exception as e:
    print(e)

successfully database created


In [84]:
client.list_database_names()

['mydatabase', 'sec_database']

In [56]:
try:
    coll = db["sec_collection"]
    print('successfully collection created')
except Exception as e:
    print(e)

successfully collection created


In [81]:
db.list_collection_names()

['sec_collection']

In [59]:
try:
    bucket_name = 'raw-data-from-webpage'
    folder_name = 'unzipped/'
    result = s3.list_objects(Bucket=bucket_name, Prefix=folder_name)
    file_names = [content['Key'] for content in result.get('Contents', [])]
    
    for file_name in file_names:
        obj = s3.get_object(Bucket=bucket_name, Key=file_name)
        data = json.load(obj['Body'])
        file_base_name = os.path.basename(file_name)
        data['_id'] = os.path.splitext(file_base_name)[0]
        
        coll.insert_one(data)
        
    print('Successfully uploaded in db...!')
    
except Exception as e:
    print(e)

Successfully uploaded in db...!


In [66]:
collection_info = coll.stats
print(collection_info)

Collection(Database(MongoClient(host=['project-cluster1.cluster-cpxnrfv073zv.ap-south-1.docdb.amazonaws.com:27017'], document_class=dict, tz_aware=False, connect=True, tlsallowinvalidcertificates=True, replicaset='rs0', readpreference='secondaryPreferred', retrywrites=False, tlsdisableocspendpointcheck=True, tls=True), 'sec_database'), 'sec_collection.stats')


In [67]:
num_docs = coll.count_documents({})
print("\nNumber of documents:", num_docs)


Number of documents: 1000


In [42]:
result = coll.find({'_id':'CIK0000000063'})
for i in result:
    print(i)

{'_id': 'CIK0000000063', 'cik': '63', 'entityType': 'other', 'sic': '6021', 'sicDescription': 'National Commercial Banks', 'insiderTransactionForOwnerExists': 0, 'insiderTransactionForIssuerExists': 0, 'name': 'FNW BANCORP INC', 'tickers': [], 'exchanges': [], 'ein': '363085933', 'description': '', 'website': '', 'investorWebsite': '', 'category': '', 'fiscalYearEnd': '1231', 'stateOfIncorporation': 'DE', 'stateOfIncorporationDescription': 'DE', 'addresses': {'mailing': {'street1': None, 'street2': None, 'city': None, 'stateOrCountry': None, 'zipCode': None, 'stateOrCountryDescription': None}, 'business': {'street1': 'SIX FOUNTAIN SQ PLZ', 'street2': None, 'city': 'ELGIN', 'stateOrCountry': 'IL', 'zipCode': '60120', 'stateOrCountryDescription': 'IL'}}, 'phone': '7086971100', 'flags': '', 'formerNames': [], 'filings': {'recent': {'accessionNumber': ['0001172661-21-000582'], 'filingDate': ['2021-02-16'], 'reportDate': [''], 'acceptanceDateTime': ['2021-02-16T12:59:50.000Z'], 'act': ['34'

In [93]:
result = coll.find({'_id': 'CIK0000000063'}, {'name': 1})
for i in result:
    print(i['name'])

FNW BANCORP INC


In [100]:
result = coll.find({'_id': 'CIK0000000063'}, {'sicDescription': 1})
for i in result:
    print(i['sicDescription'])

National Commercial Banks


In [101]:
result = coll.find({'_id': 'CIK0000000003'}, {'name': 1})
for i in result:
    print(i['name'])

DEFINED ASSET FUNDS MUNICIPAL INVT TR FD NEW YORK SER 33


In [103]:
result = coll.find({'_id': 'CIK0000000013'}, {'name': 1})
for i in result:
    print(i['name'])

CORPORATE INCOME FUND SEVENTY NINTH SHORT TERM SERIES


In [91]:
result = coll.find({'_id':'CIK0000000063'})
for i in result:
    print(i)

{'_id': 'CIK0000000063', 'cik': '63', 'entityType': 'other', 'sic': '6021', 'sicDescription': 'National Commercial Banks', 'insiderTransactionForOwnerExists': 0, 'insiderTransactionForIssuerExists': 0, 'name': 'FNW BANCORP INC', 'tickers': [], 'exchanges': [], 'ein': '363085933', 'description': '', 'website': '', 'investorWebsite': '', 'category': '', 'fiscalYearEnd': '1231', 'stateOfIncorporation': 'DE', 'stateOfIncorporationDescription': 'DE', 'addresses': {'mailing': {'street1': None, 'street2': None, 'city': None, 'stateOrCountry': None, 'zipCode': None, 'stateOrCountryDescription': None}, 'business': {'street1': 'SIX FOUNTAIN SQ PLZ', 'street2': None, 'city': 'ELGIN', 'stateOrCountry': 'IL', 'zipCode': '60120', 'stateOrCountryDescription': 'IL'}}, 'phone': '7086971100', 'flags': '', 'formerNames': [], 'filings': {'recent': {'accessionNumber': ['0001172661-21-000582'], 'filingDate': ['2021-02-16'], 'reportDate': [''], 'acceptanceDateTime': ['2021-02-16T12:59:50.000Z'], 'act': ['34'

In [71]:
result = coll.find({'_id':'CIK0000001750-submissions-001'})
for i in result:
    print(i)

{'_id': 'CIK0000001750-submissions-001', 'accessionNumber': ['0001104659-10-021700', '0001104659-10-019368', '0001127602-10-010769', '0001127602-10-010179', '0001104659-10-017022', '0000000000-10-015578', '0001104659-10-015424', '0001104659-10-015212', '0001104659-10-014676', '0000000000-10-012687', '0001127602-10-007705', '0001127602-10-007092', '0001127602-10-007079', '0001127602-10-007052', '0001398344-10-000254', '0000895421-10-000335', '0001104659-10-005830', '0001102578-10-000022', '0001086364-10-006858', '0000070858-10-000042', '0001104659-10-001864', '0001127602-10-001375', '0001127602-10-001064', '0001104659-10-000669', '0001127602-10-001001', '0001127602-09-026314', '0001127602-09-026032', '0001127602-09-025973', '0001104659-09-070783', '0001127602-09-025715', '0001104659-09-070152', '0001127602-09-023995', '0001127602-09-023993', '0001104659-09-064897', '0001127602-09-020484', '0001104659-09-058080', '0001104659-09-056338', '0001104659-09-055833', '0001127602-09-018838', '99