<a href="https://colab.research.google.com/github/Kiran45181/Pyspark/blob/main/SPARK_STREAMING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Spark Streaming

In [1]:
import random
import csv

# Generate 30 records with random data
names = ["John", "Jane", "Mike", "Sara", "David", "Emily", "George", "Nina", "Tom", "Anna"]
departments = ["Sales", "IT", "HR", "Finance", "Marketing"]
salaries = [3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]

# Create and open a CSV file for writing
with open('employee_data.csv', mode='w', newline='') as file:
    writer = csv.writer(file)

    # Write the header
    writer.writerow(["ID", "Name", "Department", "Salary"])

    # Write the 30 records
    for i in range(1, 31):
        name = random.choice(names)
        department = random.choice(departments)
        salary = random.choice(salaries)
        writer.writerow([i, name, department, salary])

print("CSV file 'employee_data.csv' has been generated successfully.")

CSV file 'employee_data.csv' has been generated successfully.


In [2]:
!cat /content/employee_data.csv

ID,Name,Department,Salary
1,Emily,Sales,8000
2,Sara,Marketing,5000
3,John,HR,5000
4,Sara,HR,10000
5,Anna,Sales,8000
6,David,IT,9000
7,David,Marketing,10000
8,Mike,Finance,5000
9,Jane,Finance,5000
10,Jane,IT,5000
11,Tom,Sales,9000
12,Emily,Sales,10000
13,John,Marketing,10000
14,Sara,IT,7000
15,Emily,Sales,5000
16,David,IT,7000
17,Nina,Finance,9000
18,Tom,Finance,10000
19,David,HR,8000
20,Nina,IT,10000
21,John,IT,8000
22,John,Sales,6000
23,David,Finance,8000
24,David,Sales,5000
25,George,HR,5000
26,Jane,Marketing,6000
27,Emily,Marketing,10000
28,Tom,HR,3000
29,Jane,HR,3000
30,Sara,Marketing,4000


In [3]:
!ls /content/ -a

.  ..  .config	employee_data.csv  sample_data


In [4]:
!mkdir /content/empdata/

In [5]:
!mv /content/employee_data.csv /content/empdata/

In [6]:
!ls /content/empdata

employee_data.csv


In [7]:
!ls /content/empdata/employee_data.csv

/content/empdata/employee_data.csv


In [8]:
!cat /content/empdata/employee_data.csv

ID,Name,Department,Salary
1,Emily,Sales,8000
2,Sara,Marketing,5000
3,John,HR,5000
4,Sara,HR,10000
5,Anna,Sales,8000
6,David,IT,9000
7,David,Marketing,10000
8,Mike,Finance,5000
9,Jane,Finance,5000
10,Jane,IT,5000
11,Tom,Sales,9000
12,Emily,Sales,10000
13,John,Marketing,10000
14,Sara,IT,7000
15,Emily,Sales,5000
16,David,IT,7000
17,Nina,Finance,9000
18,Tom,Finance,10000
19,David,HR,8000
20,Nina,IT,10000
21,John,IT,8000
22,John,Sales,6000
23,David,Finance,8000
24,David,Sales,5000
25,George,HR,5000
26,Jane,Marketing,6000
27,Emily,Marketing,10000
28,Tom,HR,3000
29,Jane,HR,3000
30,Sara,Marketing,4000


In [10]:
from pyspark.sql.types import StructType, IntegerType, StringType

schema = StructType() \
    .add("EmpID", IntegerType()) \
    .add("Name", StringType()) \
    .add("Department", StringType()) \
    .add("Salary", IntegerType())

print(schema)

StructType([StructField('EmpID', IntegerType(), True), StructField('Name', StringType(), True), StructField('Department', StringType(), True), StructField('Salary', IntegerType(), True)])


In [11]:
from pyspark.sql  import SparkSession
spark = SparkSession.builder.appName("sparkSQLBasics").getOrCreate()

stream_df = spark.readStream \
  .option("sep", ",") \
  .schema(schema) \
  .csv("/content/empdata")

In [12]:
from pyspark.sql.functions import upper
transformed_df = stream_df.withColumn("NameUPPER", upper("Name"))

In [13]:
query = transformed_df.writeStream \
.outputMode("append") \
.format("console")\
.start()

In [14]:
query.stop()

In [15]:
%%bash
cat<<EOF > /content/empdata/employee_data2.csv
1,John,Sales,3000
2,Jane,IT,4000
3,Mike,Sales,5000
4,Sara,Finance,6000
5,David,HR,7000
6,Emily,Marketing,6000
7,George,HR,4000
8,Nina,Sales,5000
9,Tom,IT,8000
10,Anna,Marketing,3000
EOF

In [16]:
!cat /content/empdata/employee_data2.csv

1,John,Sales,3000
2,Jane,IT,4000
3,Mike,Sales,5000
4,Sara,Finance,6000
5,David,HR,7000
6,Emily,Marketing,6000
7,George,HR,4000
8,Nina,Sales,5000
9,Tom,IT,8000
10,Anna,Marketing,3000


In [17]:
!cat /content/empdata/

cat: /content/empdata/: Is a directory


In [18]:
!ls /content/empdata/

employee_data2.csv  employee_data.csv


In [22]:

# folder where JSON files will be saved
query = stream_df.writeStream \
    .format("json") \
    .option("path", "/content/json_output") \
    .option("checkpointLocation", "/content/chkpt") \
    .outputMode("append") \
    .start()


In [24]:
df = spark.read.json("/content/json_output")
df.show()

+----------+-----+-----+------+
|Department|EmpID| Name|Salary|
+----------+-----+-----+------+
|Department| NULL| Name|  NULL|
|     Sales|    1|Emily|  8000|
| Marketing|    2| Sara|  5000|
|        HR|    3| John|  5000|
|        HR|    4| Sara| 10000|
|     Sales|    5| Anna|  8000|
|        IT|    6|David|  9000|
| Marketing|    7|David| 10000|
|   Finance|    8| Mike|  5000|
|   Finance|    9| Jane|  5000|
|        IT|   10| Jane|  5000|
|     Sales|   11|  Tom|  9000|
|     Sales|   12|Emily| 10000|
| Marketing|   13| John| 10000|
|        IT|   14| Sara|  7000|
|     Sales|   15|Emily|  5000|
|        IT|   16|David|  7000|
|   Finance|   17| Nina|  9000|
|   Finance|   18|  Tom| 10000|
|        HR|   19|David|  8000|
+----------+-----+-----+------+
only showing top 20 rows

