In [1]:
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
import configparser
import os

# common config

```python

# Spark downloads this dependency automatically, so there is no need to hunt for the package online. The hadoop-aws version should match the Hadoop version that Spark requires.
.config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.4')


###### config for self-host s3 compatible services.

# If we use Ceph or MinIO, we specify the endpoint URL with this config.
.config('spark.hadoop.fs.s3a.endpoint', 'http://192.168.46.220:9000')

# The default bucket URL pattern in S3 uses a long subdomain, like s3://<bucket>.amazon.com....
# In our S3-compatible services, we use the path to locate resources, e.g. s3://host/bucket/folder/file.parquet
.config('spark.hadoop.fs.s3a.path.style.access', True)
```

## Many different ways to access s3

`~/.aws/config` can define multiple profiles, so we can keep separate access keys for different accounts.

1. read credential with ini config parser

    We also have to add these settings if a session token is needed:
    ```python
    conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider')
    conf.set('spark.hadoop.fs.s3a.session.token', <token>)
    ```

2. use profile from `~/.aws/config`

    ```python
    os.environ['AWS_PROFILE'] = 'local'
    .config('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.profile.ProfileCredentialsProvider')
    ```

3. Anonymous

    ```python
    conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider')
    ```

P.S. We don't need to specify this setting explicitly:
`.config('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')`


*references*
- [hadoop-aws module doc](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Using_Named_Profile_Credentials_with_ProfileCredentialsProvider)
- [reading s3 data from a local pyspark session](https://davidlindelof.com/reading-s3-data-from-a-local-pyspark-session/)

### 1. Read credentials with the INI config parser

In [2]:
# Parse the AWS credentials file (INI format) so the next cell can look up keys.
# ConfigParser.read() silently skips files it cannot open, which would leave the
# parser empty and defer the failure to the first lookup. Opening the file
# ourselves and using read_file() fails right here if the file is missing.
ini_config = configparser.ConfigParser()
with open(os.path.expanduser('~/.aws/credentials')) as credentials_file:
    ini_config.read_file(credentials_file)

In [3]:
# Look up the key pair stored under the chosen profile section of the
# credentials file; both lookups raise if the section or option is absent.
profile = 'local'
access_id, access_key = [
    ini_config.get(profile, option)
    for option in ('aws_access_key_id', 'aws_secret_access_key')
]

In [4]:
# Variant 1: pass the access key and secret explicitly.
# The credentials were read from ~/.aws/credentials in the cells above, so the
# AWS_PROFILE environment variable (only needed by the profile-provider variant)
# is intentionally not set here.
spark = (SparkSession.builder
    .appName('Test S3 connection')
    # Spark resolves hadoop-aws and its transitive dependencies at startup.
    .config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.4')
    # Self-hosted S3-compatible service instead of the AWS default endpoint.
    .config('spark.hadoop.fs.s3a.endpoint', 'http://192.168.46.220:9000')
    .config('spark.hadoop.fs.s3a.access.key', access_id)
    .config('spark.hadoop.fs.s3a.secret.key', access_key)
    # Address buckets as http://host/bucket/... rather than via a bucket subdomain.
    .config('spark.hadoop.fs.s3a.path.style.access', True)
).getOrCreate()

:: loading settings :: url = jar:file:/opt/homebrew/Cellar/apache-spark/3.3.0/libexec/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/jchien/.ivy2/cache
The jars for the packages stored in: /Users/jchien/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-34adee95-20a3-4bde-90cb-76e944639219;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 112ms :: artifacts dl 4ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	-------------------------------

22/08/06 18:22:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Smoke test: read a CSV with a header row from the `test` bucket and preview 5 rows.
(spark.read
    .option('header', 'true')
    .csv('s3a://test/taxi+_zone_lookup.csv')
    .show(5))

22/08/06 18:22:59 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+----------+-------------+--------------------+------------+
|locationid|      borough|                zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
+----------+-------------+--------------------+------------+
only showing top 5 rows



### 2. Use the profile credentials provider

```python
.config('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.profile.ProfileCredentialsProvider')
```

The profile name has to be specified with the environment variable `AWS_PROFILE`.

In [None]:
# Variant 2: let the AWS SDK profile provider supply the credentials.
# The provider picks the profile named by the AWS_PROFILE environment variable.
os.environ['AWS_PROFILE'] = 'local'

# Settings are applied in this order, one builder.config() call per entry.
s3a_profile_options = {
    'spark.jars.packages': 'org.apache.hadoop:hadoop-aws:3.3.4',
    'spark.hadoop.fs.s3a.aws.credentials.provider': 'com.amazonaws.auth.profile.ProfileCredentialsProvider',
    'spark.hadoop.fs.s3a.endpoint': 'http://192.168.46.220:9000',
    'spark.hadoop.fs.s3a.path.style.access': True,
}

builder = SparkSession.builder.appName('Test S3 connection')
for option, setting in s3a_profile_options.items():
    builder = builder.config(option, setting)
spark = builder.getOrCreate()

### 3. Anonymous

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
 
# Variant 3: anonymous access, for publicly readable buckets.
conf = SparkConf().setAll([
    ('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.4'),
    ('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider'),
])

session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()
# The path below is a placeholder; substitute a real public object before running.
df = spark.read.csv('s3a://<some_public_dataset.csv>')

:: loading settings :: url = jar:file:/opt/homebrew/Cellar/apache-spark/3.3.0/libexec/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/jchien/.ivy2/cache
The jars for the packages stored in: /Users/jchien/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e6cdb1c1-883c-4ade-8a44-89853baad31c;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 109ms :: artifacts dl 4ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	-------------------------------

22/08/06 18:24:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/06 18:24:36 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


                                                                                

# Similar notes
- [spark on gcs](../gcs/spark.ipynb)