In [1]:
import sys, os
# Point both the Spark workers and the driver at the interpreter running this
# kernel, so they all use the same Python installation.
environment = ['PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON']
os.environ.update({name: sys.executable for name in environment})

In [2]:
from pyspark.sql import SparkSession
# getOrCreate() hands back the already-active SparkSession when there is one,
# and only builds a new session (with default settings here) otherwise.
session = SparkSession.builder.getOrCreate()

In [3]:
# Turn a small local Python list into an RDD.
rdd = session.sparkContext.parallelize([1,2,3])

In [4]:
# Fetch only the first two elements of the RDD.
rdd.take(num=2)

[1, 2]

In [5]:
# Number of elements in the RDD.
rdd.count()

3

In [6]:
# Return every element as a Python list; only sensible for small RDDs.
rdd.collect()

[1, 2, 3]

In [7]:
# Build a two-row dataframe from in-memory rows; the second argument supplies
# the column names.
df = session.createDataFrame(
    data=[[1, 2, 3], [4, 5, 6]],
    schema=['column1', 'column2', 'column3'],
)

In [8]:
# Render the dataframe as a text table, limited to at most the first 3 rows.
df.show(3)

+-------+-------+-------+
|column1|column2|column3|
+-------+-------+-------+
|      1|      2|      3|
|      4|      5|      6|
+-------+-------+-------+



In [9]:
# Note: RDDs are immutable. A transformation returns a new RDD instead of
# changing the existing one, so assign the result back to a name:
# my_rdd = my_rdd.map(lambda x: x * 100)

In [10]:
import pyspark.sql.functions as funcs
import pyspark.sql.types as types
def multiply_by_ten(number):
    """Multiply a value by 10.0, passing missing values through unchanged.

    Spark hands ``None`` to a Python UDF for null cells, so the function must
    tolerate it instead of raising ``TypeError`` on ``None * 10.0``.

    Args:
        number: A numeric value, or ``None`` for a null cell.

    Returns:
        ``number * 10.0``, or ``None`` when ``number`` is ``None``.
    """
    if number is None:
        return None
    return number * 10.0
# Wrap the plain Python function as a Spark UDF that yields doubles, then use
# it to derive a new 'multiplied' column from 'column1'.
multiply_udf = funcs.udf(multiply_by_ten, returnType=types.DoubleType())
transformed_df = df.withColumn('multiplied', multiply_udf(df['column1']))
transformed_df.show()

+-------+-------+-------+----------+
|column1|column2|column3|multiplied|
+-------+-------+-------+----------+
|      1|      2|      3|      10.0|
|      4|      5|      6|      40.0|
+-------+-------+-------+----------+



In [11]:
# RDD mapping

import pyspark.sql.types as types
import math
def take_log_in_all_columns(row: types.Row):
    """Return a new Row holding the natural logarithm of every field of ``row``.

    Each output field is named ``log(<original name>)``. Fields whose value is
    ``None`` or not strictly positive have no real logarithm; they become
    ``None`` so that one bad cell produces a null instead of failing the whole
    Spark job with a ``ValueError``/``TypeError`` raised inside ``map``.

    Args:
        row: A Row whose fields are numeric (or ``None``).

    Returns:
        A Row with one ``log(...)`` field per input field.
    """
    def _safe_log(value):
        # math.log raises on None, zero and negative numbers; map those to None.
        if value is None or value <= 0:
            return None
        return math.log(value)

    new_row = {
        f'log({column_name})': _safe_log(value)
        for column_name, value in row.asDict().items()
    }
    return types.Row(**new_row)

In [12]:
# Apply the row-wise function to the dataframe's underlying RDD, then convert
# the resulting rows back into a dataframe.
logarithmic_dataframe = df.rdd.map(take_log_in_all_columns).toDF()

In [14]:
# Display the log-transformed dataframe.
logarithmic_dataframe.show()

+------------------+------------------+------------------+
|      log(column1)|      log(column2)|      log(column3)|
+------------------+------------------+------------------+
|               0.0|0.6931471805599453|1.0986122886681098|
|1.3862943611198906|1.6094379124341003| 1.791759469228055|
+------------------+------------------+------------------+



In [20]:
# SQL operations

# Projection: keep only the first two columns.
df.select(df['column1'], df['column2']).show()

+-------+-------+
|column1|column2|
+-------+-------+
|      1|      2|
|      4|      5|
+-------+-------+



In [19]:
# Row filter; no row has column1 equal to 3, so the result is an empty table.
df.filter(df['column1'] == 3).show()

+-------+-------+-------+
|column1|column2|column3|
+-------+-------+-------+
+-------+-------+-------+



In [24]:
# Join template (df1 is a placeholder for another dataframe that also has a
# 'column1' column):
# df.join(df1, ['column1'], how='inner')

In [26]:
# Register df under the name 'table1' so it can be queried with SQL.
df.createOrReplaceTempView('table1')

In [27]:
# Query the temp view with plain SQL, selecting two columns under new names.
df2 = session.sql("SELECT column1 AS f1, column2 as f2 from table1")

In [28]:
# The source dataframe is unchanged by the SQL query.
df.show()

+-------+-------+-------+
|column1|column2|column3|
+-------+-------+-------+
|      1|      2|      3|
|      4|      5|      6|
+-------+-------+-------+



In [29]:
# The query result holds only the two renamed columns.
df2.show()

+---+---+
| f1| f2|
+---+---+
|  1|  2|
|  4|  5|
+---+---+



In [30]:
# Dataframe column operations

# Column arithmetic follows normal operator precedence:
# derived_column = column1 + (column2 * column3).
df3 = df.withColumn(
    'derived_column',
    df.column1 + df.column2 * df.column3,
)

In [32]:
# Aggregations and quick statistics

# Names applied, by position, to the columns of the headerless CSV read below.
ADULT_COLUMN_NAMES = [
    "age", "workclass", "fnlwgt",
    "education", "education_num",
    "marital_status", "occupation", "relationship",
    "race", "sex",
    "capital_gain", "capital_loss",
    "hours_per_week", "native_country",
    "income",
]

# The file has no header row, so the columns are named here by position.
# toDF() renames every column in one step and, unlike a zip()-based loop of
# withColumnRenamed calls, raises if the number of names does not match the
# number of columns instead of silently leaving some columns unrenamed.
csv_df = session.read.csv(
    'adult.data.csv', header=False, inferSchema=True
).toDF(*ADULT_COLUMN_NAMES)

In [33]:
# Summary statistics (count, mean, ...) for each column.
csv_df.describe().show()

+-------+------------------+------------+------------------+-------------+-----------------+--------------+-----------------+------------+-------------------+-------+------------------+----------------+------------------+--------------+------+
|summary|               age|   workclass|            fnlwgt|    education|    education_num|marital_status|       occupation|relationship|               race|    sex|      capital_gain|    capital_loss|    hours_per_week|native_country|income|
+-------+------------------+------------+------------------+-------------+-----------------+--------------+-----------------+------------+-------------------+-------+------------------+----------------+------------------+--------------+------+
|  count|             32561|       32561|             32561|        32561|            32561|         32561|            32561|       32561|              32561|  32561|             32561|           32561|             32561|         32561| 32561|
|   mean| 38.58164675532

In [34]:
# Mean and sample standard deviation of weekly working hours for each age,
# ordered by ascending age.
work_hours_df = (
    csv_df
    .groupBy('age')
    .agg(funcs.avg('hours_per_week'), funcs.stddev_samp('hours_per_week'))
    .orderBy('age')
)

In [35]:
# Display the per-age working-hours statistics.
work_hours_df.show()

+---+-------------------+---------------------------+
|age|avg(hours_per_week)|stddev_samp(hours_per_week)|
+---+-------------------+---------------------------+
| 17| 21.367088607594937|         10.021014993616216|
| 18| 25.912727272727274|         11.733362123434848|
| 19| 30.678370786516854|         12.119154493614719|
| 20|  32.28021248339974|         11.726599330994663|
| 21|  34.03472222222222|         12.040389374051912|
| 22|  35.17124183006536|         11.968466821743275|
| 23|  36.71835803876853|         10.916632739093428|
| 24|  39.08897243107769|         10.638975889466733|
| 25|  40.00713436385256|         10.452953398659348|
| 26|  41.06496815286624|          11.29552504314252|
| 27| 42.039520958083834|         10.755941741375546|
| 28|  42.02768166089965|         10.737113530868324|
| 29|  42.36531365313653|         10.206157095904361|
| 30| 42.167247386759584|         10.990266114829758|
| 31| 42.877252252252255|         11.008740019442087|
| 32| 42.878019323671495|   

In [37]:
# Connecting to databases

# Ask for a session that has the PostgreSQL JDBC driver jar on its classpath.
# NOTE(review): a session was already created near the top of this notebook,
# and getOrCreate() returns that existing session. Settings such as
# 'spark.jars' and 'spark.driver.extraClassPath' are presumably only honoured
# when the JVM/context starts, so they may have no effect here. Verify, and
# consider passing them to the very first builder call instead.
jdbc_driver_jar = 'postgresql-42.2.16.jar'
builder = SparkSession.builder
builder = builder.config('spark.jars', jdbc_driver_jar)
builder = builder.config('spark.driver.extraClassPath', jdbc_driver_jar)
session = builder.getOrCreate()

To connect to an RDBMS you need a JDBC driver: a JAR file that Spark uses to talk to the database. The example below uses PostgreSQL.

ref: https://jdbc.postgresql.org/download.html

In [38]:
# url = f"jdbc:postgresql://your_host_ip:5432/your_database"
# properties = {'user': 'your_user', 'password': 'your_password'}
# # read from a table into a dataframe
# df = session.read.jdbc(
#     url=url, table='your_table_name', properties=properties
# )

In [None]:
# transformed_df.write.jdbc(
#     url=url, table='new_table', mode='append', properties=properties
# )