In [4]:
import boto3
import botocore
import pandas as pd
from IPython.display import display, Markdown

In [5]:
# Low-level S3 client; the bucket helpers in this notebook call it to create and list buckets.
s3 = boto3.client('s3')
# Resource-style S3 handle; used in this notebook to iterate over the objects stored in a bucket.
s3_resource = boto3.resource('s3')

In [6]:
def create_bucket(bucket):
    """Create the S3 bucket named ``bucket`` and describe the outcome.

    The request goes through the module-level ``s3`` client. A
    ``botocore.exceptions.ClientError`` is logged and converted into a
    failure message rather than being re-raised.

    Args:
        bucket: Name of the bucket to create.

    Returns:
        str: A human-readable status message for display in the notebook.
    """
    import logging

    try:
        s3.create_bucket(Bucket=bucket)
    except botocore.exceptions.ClientError as err:
        # Report the failure in the log and via the returned message.
        logging.error(err)
        status = 'Bucket ' + bucket + ' could not be created.'
    else:
        status = 'Created or already exists ' + bucket + ' bucket.'
    return status

In [7]:
# Make sure the working bucket exists; the returned status string is shown as the cell output.
create_bucket('open-data-noaa')

'Created or already exists open-data-noaa bucket.'

In [8]:
def list_buckets(match=''):
    """Print the names of the account's S3 buckets, optionally filtered.

    Bucket names are fetched with the module-level ``s3`` client.

    Args:
        match: Substring that a bucket name must contain to be printed.
            When empty (the default), every bucket is printed.

    Returns:
        None. The listing is written to standard output.
    """
    response = s3.list_buckets()
    if match:
        print(f'Existing buckets containing "{match}" string:')
    else:
        print('All existing buckets:')
    for bucket in response['Buckets']:
        name = bucket["Name"]
        # With no filter every bucket is listed; previously nothing was
        # printed in that case even though the header announced all buckets.
        if not match or match in name:
            print(f'  {name}')

In [9]:
# Show only the buckets whose name contains "open".
list_buckets(match='open')

Existing buckets containing "open" string:
  open-data-noaa


In [10]:
def list_bucket_contents(bucket, match='', size_mb=0):
    """Print the objects in an S3 bucket that pass the filters, plus size totals.

    Every object in the bucket is enumerated through the module-level
    ``s3_resource``; both filters are applied client-side while iterating.

    Args:
        bucket: Name of the bucket to inspect.
        match: Substring that an object key must contain to be listed.
            Empty (the default) disables the key filter.
        size_mb: Maximum object size in MB (inclusive) for an object to be
            listed. ``0`` (the default) disables the size filter.

    Returns:
        None. One line per listed object is printed, followed by a summary
        of the listed objects (only when at least one filter is active) and
        a summary of the whole bucket.
    """
    bucket_resource = s3_resource.Bucket(bucket)
    # Running totals are accumulated in MB and converted to GB only when printed.
    total_size_mb = 0
    total_files = 0
    match_size_mb = 0
    match_files = 0
    for obj in bucket_resource.objects.all():
        obj_size_mb = obj.size / 1024 / 1024
        total_size_mb += obj_size_mb
        total_files += 1

        key_ok = not match or match in obj.key
        size_ok = not size_mb or obj_size_mb <= size_mb
        if key_ok and size_ok:
            match_files += 1
            match_size_mb += obj_size_mb
            print(f'{obj.key} ({obj_size_mb:3.0f}MB)')

    # Report the filtered subset whenever any filter narrowed the listing;
    # previously a size-only filter computed these figures but never showed them.
    if match or size_mb:
        print(f'Matched file size is {match_size_mb/1024:3.1f}GB with {match_files} files')

    print(f'Bucket {bucket} total size is {total_size_mb/1024:3.1f}GB with {total_files} files')

In [None]:
# List the objects in noaa-isd-pds whose key contains "data".
# The helper enumerates every object in the bucket, so this can take a long time.
list_bucket_contents(bucket='noaa-isd-pds', match = 'data')

/data/1976/999999-99999-1976.gz ( 47MB)
/data/1977/999999-99999-1977.gz ( 50MB)
/data/1978/999999-99999-1978.gz ( 47MB)
/data/1979/999999-99999-1979.gz ( 51MB)
/data/1980/999999-99999-1980.gz ( 52MB)
/data/1981/999999-99999-1981.gz ( 53MB)
/data/1983/999999-99999-1983.gz ( 54MB)
/data/1984/999999-99999-1984.gz ( 60MB)
/data/1985/999999-99999-1985.gz ( 60MB)
/data/1986/999999-99999-1986.gz ( 65MB)
/data/1987/999999-99999-1987.gz ( 70MB)
/data/1988/999999-99999-1988.gz ( 73MB)
/data/1989/999999-99999-1989.gz ( 64MB)
/data/1990/999999-99999-1990.gz ( 65MB)
/data/1991/999999-99999-1991.gz ( 56MB)
/data/1992/999999-99999-1992.gz ( 47MB)
/data/1993/999999-99999-1993.gz ( 63MB)
/data/1994/999999-99999-1994.gz ( 62MB)
/data/1995/999999-99999-1995.gz ( 67MB)
/data/2019/071460-99999-2019.gz (  1MB)
/data/2019/434150-99999-2019.gz (  0MB)
/data/2019/866110-99999-2019.gz (  0MB)
/data/2019/869460-99999-2019.gz (  0MB)
/pub/data/noaa/NOTICE-ISD-MERGE-ISSUE.TXT (  0MB)
/pub/data/noaa/country-list.tx

In [12]:
# List the objects in noaa-global-hourly-pds whose key contains "1950".
# NOTE: the match is a substring test applied client-side after enumerating every
# object, so it also hits station ids such as 12195099999 and is slow on a large
# bucket (the recorded run was interrupted by hand).
list_bucket_contents(bucket='noaa-global-hourly-pds', match = '1950')

1931/12195099999.csv (  0MB)
1932/12195099999.csv (  0MB)
1933/12195099999.csv (  0MB)
1934/12195099999.csv (  0MB)
1935/12195099999.csv (  0MB)


KeyboardInterrupt: 

In [13]:
# Start (or reuse) a Spark session named "noaa".
# findspark.init() is called before importing pyspark on purpose — presumably so
# the local Spark installation is discoverable; keep this ordering.
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("noaa").getOrCreate()

In [14]:
# Display the session object as the cell output to confirm it was created.
spark

In [15]:
# NOTE(review): SparkContext and SparkConf are imported but not used in the cells
# visible here; the context is taken from the session that already exists.
from pyspark import SparkContext, SparkConf
sc = spark.sparkContext

In [16]:
# Display the SparkContext as the cell output.
sc

In [17]:
# Read one station file from the noaa-global-hourly-pds bucket as an RDD of text lines.
# NOTE(review): whether the "s3://" scheme resolves depends on the cluster's Hadoop
# S3 connector setup (some environments need "s3a://") — confirm for the target runtime.
rdd = sc.textFile("s3://noaa-global-hourly-pds/1931/12195099999.csv")

In [18]:
# Peek at the first five raw lines (the CSV header row followed by records) to inspect the layout.
rdd.take(5)

['"STATION","DATE","SOURCE","LATITUDE","LONGITUDE","ELEVATION","NAME","REPORT_TYPE","CALL_SIGN","QUALITY_CONTROL","WND","CIG","VIS","TMP","DEW","SLP","AA1","AY1","GF1","IA1","KA1","MD1","MW1","EQD"',
 '"12195099999","1931-01-09T12:00:00","4","54.1333333","22.95","186.0","SUWALKI, PL","FM-12","99999","V020","999,1,C,0000,1","00450,1,C,N","002000,1,N,9","-0050,1","-0056,1","10266,1",,"2,1,99,9","99,99,9,08,1,05,1,00450,1,00,1,00,1",,,,,"Q01+000032APCTENQ02+000002APC3  "',
 '"12195099999","1931-01-10T12:00:00","4","54.1333333","22.95","186.0","SUWALKI, PL","FM-12","99999","V020","200,1,N,0010,1","00450,1,C,N","002000,1,N,9","-0061,1","-0067,1","10298,1",,"2,1,99,9","08,99,1,08,1,05,1,00450,1,00,1,00,1",,,"3,1,002,1,+999,9",,',
 '"12195099999","1931-01-13T12:00:00","4","54.1333333","22.95","186.0","SUWALKI, PL","FM-12","99999","V020","180,1,N,0010,1","00780,1,C,N","004000,1,N,9","-0022,1","-0028,1","10111,1",,"2,1,99,9","08,99,1,08,1,05,1,00800,1,00,1,00,1",,,"8,1,004,1,+999,9",,',
 '"1219