# importing pyspark

PySpark is the Python API for Apache Spark. It lets you combine the simplicity of Python with the power of Spark to scale up analytical computations.


In [1]:
import pyspark as ps

In [2]:
# Print the version string of the installed pyspark package (imported above as `ps`).
print(ps.__version__)

4.0.0


# creating spark session

A SparkSession is the entry point for programming Spark with the Dataset and DataFrame API.


In [None]:
# Obtain the SparkSession for this notebook: getOrCreate() hands back the
# session named "pyspark_practice", creating it if none is active yet.

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("pyspark_practice")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/07/28 08:27:04 WARN Utils: Your hostname, codespaces-72158f, resolves to a loopback address: 127.0.0.1; using 10.0.0.169 instead (on interface eth0)
25/07/28 08:27:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/28 08:27:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# uploading the dataset and creating dataframe

In [4]:
# Sample employee records held in memory as a list of 8-field tuples.
# Field order inside every tuple:
#   (id, name, age, department, salary, joining_date, location, performance_rating)
# joining_date is stored as a "YYYY-MM-DD" string, not as a date object.

data = [
    (1, "Alice", 30, "Engineering", 85_000, "2018-03-15", "Bangalore", 4.5),
    (2, "Bob", 45, "Sales", 70_000, "2016-07-01", "Mumbai", 3.8),
    (3, "Charlie", 28, "Marketing", 65_000, "2019-11-23", "Delhi", 4.2),
    (4, "David", 35, "Engineering", 92_000, "2015-04-11", "Bangalore", 4.7),
    (5, "Eva", 40, "HR", 60_000, "2017-02-01", "Pune", 3.9),
    (6, "Frank", 29, "Sales", 68_000, "2020-09-20", "Chennai", 4.1),
    (7, "Grace", 33, "Engineering", 87_000, "2018-12-12", "Hyderabad", 4.6),
    (8, "Hannah", 31, "Marketing", 64_000, "2019-05-25", "Delhi", 4.0),
    (9, "Ian", 38, "Sales", 72_000, "2014-10-10", "Mumbai", 3.7),
    (10, "Jenny", 27, "HR", 59_000, "2021-01-01", "Bangalore", 4.3),
]

In [None]:
# Explicit schema for the employee DataFrame, built from StructType and StructField.
# There is one StructField per column, listed in the same order as the values
# inside each row tuple; every column is declared nullable.

from pyspark.sql.types import (
    DateType,  # imported here but not used by this schema
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

schema = StructType([
    StructField("id", IntegerType(), nullable=True),
    StructField("name", StringType(), nullable=True),
    StructField("age", IntegerType(), nullable=True),
    StructField("department", StringType(), nullable=True),
    StructField("salary", IntegerType(), nullable=True),
    # Declared as a string column: the rows carry the date as plain text.
    StructField("joining_date", StringType(), nullable=True),
    StructField("location", StringType(), nullable=True),
    StructField("performance_rating", FloatType(), nullable=True),
])

In [6]:
# Build the DataFrame from the in-memory row tuples using the explicit schema.

df = spark.createDataFrame(data=data, schema=schema)

In [7]:
df.show()  # print the DataFrame's rows as a text table

                                                                                

+---+-------+---+-----------+------+------------+---------+------------------+
| id|   name|age| department|salary|joining_date| location|performance_rating|
+---+-------+---+-----------+------+------------+---------+------------------+
|  1|  Alice| 30|Engineering| 85000|  2018-03-15|Bangalore|               4.5|
|  2|    Bob| 45|      Sales| 70000|  2016-07-01|   Mumbai|               3.8|
|  3|Charlie| 28|  Marketing| 65000|  2019-11-23|    Delhi|               4.2|
|  4|  David| 35|Engineering| 92000|  2015-04-11|Bangalore|               4.7|
|  5|    Eva| 40|         HR| 60000|  2017-02-01|     Pune|               3.9|
|  6|  Frank| 29|      Sales| 68000|  2020-09-20|  Chennai|               4.1|
|  7|  Grace| 33|Engineering| 87000|  2018-12-12|Hyderabad|               4.6|
|  8| Hannah| 31|  Marketing| 64000|  2019-05-25|    Delhi|               4.0|
|  9|    Ian| 38|      Sales| 72000|  2014-10-10|   Mumbai|               3.7|
| 10|  Jenny| 27|         HR| 59000|  2021-01-01|Ban

In [8]:
# StructType is the data type that defines the schema of a DataFrame.
# It describes the name, data type, and nullability of each column in a structured format.
# StructField is the component of a StructType that defines a single field (column) of the schema.

# In PySpark, nullability states whether a column of a DataFrame may contain null values.

# It is set per column, through the third argument (nullable) of each StructField in the StructType.
# NOTE: the triple-quoted block below is a plain string expression, not a comment,
# so Jupyter echoes its text as this cell's output.

'''
For example:
StructType([StructField("column_name", DataType(), nullable),])
'''

# If nullable is True, the column may contain null values; if it is False, it may not.

'\nFor example:\nStructType([StructField("column_name", DataType(), nullable),])\n'

In [9]:
# Load the employee CSV file into a second DataFrame and print it.
#   header=True      -> the first line of the file supplies the column names
#   inferSchema=True -> Spark derives each column's data type from the file contents
# NOTE(review): the path is absolute and specific to this workspace; adjust it when
# running the notebook elsewhere.

df2 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(r'/workspaces/pyspark/dataset/employee_data_v2.csv')
)
df2.show()

+------+---------------+----------+------+------------+
|emp_id|           name|department|salary|joining_date|
+------+---------------+----------+------+------------+
|  1036|    Lisa Thomas|   Support| 82227|  2023-07-17|
|  1099|  Holly Goodwin|   Finance| 97811|  2021-04-03|
|  1019|     Aaron Neal|        IT| 54948|  2020-12-02|
|  1003|    Jason Clark|   Support|108328|  2016-04-20|
|  1090|  Beverly Clark| Marketing| 48881|  2019-03-21|
|  1045|   Joel Roberts|   Support| 42300|  2023-09-08|
|  1074| Richard Miller| Marketing| 94625|  2023-07-29|
|  1056|      Ryan Hall|        IT| 54619|  2017-04-16|
|  1037|     Linda Lane|Operations| 42007|  2018-04-28|
|  1094|    April Young|   Support| 75014|  2021-12-10|
|  1050|   Amber Savage|Operations|105312|  2023-06-04|
|  1080|    Sandra King|   Finance| 42694|  2017-07-02|
|  1033|   Ryan Kim DVM|Operations| 63050|  2021-11-07|
|  1008|William Alvarez| Marketing|119709|  2020-03-05|
|  1011|  Andrea Wright| Marketing| 63399|  2019

In [10]:
# If inferSchema is True, Spark automatically infers the data type of each column from the data in the CSV file.
# If it is False, every column is read as a string.

In [11]:
# Reference-only alternative: the block below is a plain string literal, so none of
# the code inside it runs; Jupyter simply echoes the text as this cell's output.
# The example passes a list of column names to createDataFrame instead of a
# StructType schema.
# NOTE(review): the last two names in the example ("city", "performance_score") differ
# from the explicit schema used earlier ("location", "performance_rating") — confirm
# which names are intended.
'''
You can also create a DataFrame like this, as shown below:
data = [
    (1, "Alice", 30, "Engineering", 85000, "2018-03-15", "Bangalore", 4.5),
    (2, "Bob", 45, "Sales", 70000, "2016-07-01", "Mumbai", 3.8),
    (3, "Charlie", 28, "Marketing", 65000, "2019-11-23", "Delhi", 4.2),
    (4, "David", 35, "Engineering", 92000, "2015-04-11", "Bangalore", 4.7),
    (5, "Eva", 40, "HR", 60000, "2017-02-01", "Pune", 3.9),
    (6, "Frank", 29, "Sales", 68000, "2020-09-20", "Chennai", 4.1),
    (7, "Grace", 33, "Engineering", 87000, "2018-12-12", "Hyderabad", 4.6),
    (8, "Hannah", 31, "Marketing", 64000, "2019-05-25", "Delhi", 4.0),
    (9, "Ian", 38, "Sales", 72000, "2014-10-10", "Mumbai", 3.7),
    (10, "Jenny", 27, "HR", 59000, "2021-01-01", "Bangalore", 4.3)
]

columns = ["id", "name", "age", "department", "salary", "joining_date", "city", "performance_score"]

df = spark.createDataFrame(data, columns)
df.show()
'''

'\nYou can also create a DataFrame like this, as shown below:\ndata = [\n    (1, "Alice", 30, "Engineering", 85000, "2018-03-15", "Bangalore", 4.5),\n    (2, "Bob", 45, "Sales", 70000, "2016-07-01", "Mumbai", 3.8),\n    (3, "Charlie", 28, "Marketing", 65000, "2019-11-23", "Delhi", 4.2),\n    (4, "David", 35, "Engineering", 92000, "2015-04-11", "Bangalore", 4.7),\n    (5, "Eva", 40, "HR", 60000, "2017-02-01", "Pune", 3.9),\n    (6, "Frank", 29, "Sales", 68000, "2020-09-20", "Chennai", 4.1),\n    (7, "Grace", 33, "Engineering", 87000, "2018-12-12", "Hyderabad", 4.6),\n    (8, "Hannah", 31, "Marketing", 64000, "2019-05-25", "Delhi", 4.0),\n    (9, "Ian", 38, "Sales", 72000, "2014-10-10", "Mumbai", 3.7),\n    (10, "Jenny", 27, "HR", 59000, "2021-01-01", "Bangalore", 4.3)\n]\n\ncolumns = ["id", "name", "age", "department", "salary", "joining_date", "city", "performance_score"]\n\ndf = spark.createDataFrame(data, columns)\ndf.show()\n'