<a href="https://colab.research.google.com/github/JarekMaleszyk/data-science-project-sandbox/blob/main/pyspark_test4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [217]:
try:
  import pyspark
except:
  !pip install pyspark
  import pyspark
finally:
  from pyspark.sql import SparkSession
  spark = SparkSession.builder.appName('pyspark-notebook').getOrCreate()

In [218]:
spark

In [219]:
df_pyspark = spark.read.option('header', 'true').csv('/content/simple_data2.csv', inferSchema=True, sep=';')

In [220]:
spark.stop()

In [221]:
try:
  spark
except AttributeError as ae:
  print('spark is stopped')
else:
  spark.stop()
finally:
  from pyspark.sql import SparkSession
  spark = SparkSession.builder.appName('pyspark-notebook').getOrCreate()

In [222]:
df_pyspark = spark.read.option('header', 'true').csv('/content/simple_data2.csv', inferSchema=True, sep=';')

In [223]:
df_pyspark.show()

+-----+---+----------+-------+----------+
| name|age|experience| salary|department|
+-----+---+----------+-------+----------+
| Kris| 31|         6| 2230.3|        IT|
| Adam| 30|         5|2230.89|        HR|
|Sunny| 39|        14|3230.21|        HR|
|  Tom| 28|         5|1890.32|        IT|
| John| 40|        18| 3400.0|  HelpDesk|
| Mark| 36|        12|4730.04|        IT|
|  Bob| 35|         7|1930.05|        IT|
|Ellen| 34|         7|2230.96|  HelpDesk|
|  Jim| 46|        21|6230.99|        IT|
| Paul| 39|        11| 2710.5|        IT|
|  Tom| 38|        17|4280.45|        IT|
+-----+---+----------+-------+----------+



### Resilient distributed dataset (RDD)

In [224]:
rdd = spark.sparkContext.parallelize(df_pyspark.collect())

In [225]:
rdd.collect()

[Row(name='Kris', age=31, experience=6, salary=2230.3, department='IT'),
 Row(name='Adam', age=30, experience=5, salary=2230.89, department='HR'),
 Row(name='Sunny', age=39, experience=14, salary=3230.21, department='HR'),
 Row(name='Tom', age=28, experience=5, salary=1890.32, department='IT'),
 Row(name='John', age=40, experience=18, salary=3400.0, department='HelpDesk'),
 Row(name='Mark', age=36, experience=12, salary=4730.04, department='IT'),
 Row(name='Bob', age=35, experience=7, salary=1930.05, department='IT'),
 Row(name='Ellen', age=34, experience=7, salary=2230.96, department='HelpDesk'),
 Row(name='Jim', age=46, experience=21, salary=6230.99, department='IT'),
 Row(name='Paul', age=39, experience=11, salary=2710.5, department='IT'),
 Row(name='Tom', age=38, experience=17, salary=4280.45, department='IT')]

In [226]:
rdd.count()

11

In [227]:
rdd.take(1)

[Row(name='Kris', age=31, experience=6, salary=2230.3, department='IT')]

In [228]:
rdd.foreach(lambda x: print(x))

In [229]:
mapped_rdd = rdd.map(lambda x: (x[0].upper(), x[1]))
result = mapped_rdd.collect()
print(f'RDD with uppercase name')

RDD with uppercase name


In [230]:
filred_rdd = rdd.filter(lambda x: x[1] > 33)
result = filred_rdd.collect()
print(f'RDD with age > 30: {result}')

RDD with age > 30: [Row(name='Sunny', age=39, experience=14, salary=3230.21, department='HR'), Row(name='John', age=40, experience=18, salary=3400.0, department='HelpDesk'), Row(name='Mark', age=36, experience=12, salary=4730.04, department='IT'), Row(name='Bob', age=35, experience=7, salary=1930.05, department='IT'), Row(name='Ellen', age=34, experience=7, salary=2230.96, department='HelpDesk'), Row(name='Jim', age=46, experience=21, salary=6230.99, department='IT'), Row(name='Paul', age=39, experience=11, salary=2710.5, department='IT'), Row(name='Tom', age=38, experience=17, salary=4280.45, department='IT')]


In [231]:
reduced_rdd = rdd.map(lambda x: (x[0], x[1])).reduceByKey(lambda x, y: x + y)
# Map the RDD to ensure each element is a key-value pair (name, age) before applying reduceByKey.
result = reduced_rdd.collect()
print(f'RDD with age reduced: {result}')

RDD with age reduced: [('Sunny', 39), ('Tom', 66), ('Mark', 36), ('Ellen', 34), ('Jim', 46), ('Kris', 31), ('Adam', 30), ('John', 40), ('Bob', 35), ('Paul', 39)]


In [232]:
sorted_rdd = rdd.sortBy(lambda x: x[1], ascending=False)
result = sorted_rdd.collect()
print(f'RDD with sorted by age: {result}')

RDD with sorted by age: [Row(name='Jim', age=46, experience=21, salary=6230.99, department='IT'), Row(name='John', age=40, experience=18, salary=3400.0, department='HelpDesk'), Row(name='Sunny', age=39, experience=14, salary=3230.21, department='HR'), Row(name='Paul', age=39, experience=11, salary=2710.5, department='IT'), Row(name='Tom', age=38, experience=17, salary=4280.45, department='IT'), Row(name='Mark', age=36, experience=12, salary=4730.04, department='IT'), Row(name='Bob', age=35, experience=7, salary=1930.05, department='IT'), Row(name='Ellen', age=34, experience=7, salary=2230.96, department='HelpDesk'), Row(name='Kris', age=31, experience=6, salary=2230.3, department='IT'), Row(name='Adam', age=30, experience=5, salary=2230.89, department='HR'), Row(name='Tom', age=28, experience=5, salary=1890.32, department='IT')]


In [233]:
try:
  rdd.saveAsTextFile('output.txt')
except Exception as e:
  print("folder is not empty")
  !rm -Rf output.txt


In [234]:
rdd_text = spark.sparkContext.textFile('output.txt')

In [235]:
%%bash
head -5 /content/simple_data2.csv

name;age;experience;salary;department
Kris;31;6;2230.30;IT
Adam;30;5;2230.89;HR
Sunny;39;14;3230.21;HR
Tom;28;5;1890.32;IT


In [236]:
!head -5 /content/simple_data2.csv

name;age;experience;salary;department
Kris;31;6;2230.30;IT
Adam;30;5;2230.89;HR
Sunny;39;14;3230.21;HR
Tom;28;5;1890.32;IT


In [237]:
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: double (nullable = true)
 |-- department: string (nullable = true)



In [244]:
# Read single line JSON
# Each row is a JSON record, records are separated by new line
json_file_path = "/content/data.json"
df_json = spark.read.option("multiline","true").json(json_file_path) #In Spark 2.2+ you can read json file of multiline using following command.

In [245]:
df_json.printSchema()

root
 |-- category: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- quantity: long (nullable = true)

