Q1

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Create (or reuse) the Spark session for this exercise
spark = SparkSession.builder.appName("SquareIntegers").getOrCreate()

# Sample data - Replace this with your actual data
data = [(1,), (2,), (3,), (4,), (5,)]

# Create a single-column DataFrame from the sample data
df = spark.createDataFrame(data, ["integer"])


def _square(value):
    """Return ``value`` squared; a SQL NULL (Python ``None``) is passed through.

    Without the ``None`` guard, a NULL in the input column would make the
    UDF raise ``TypeError`` inside the Python worker and fail the whole job.
    """
    return None if value is None else value ** 2


# Register the function as a UDF (User Defined Function).
# The return type is given as the DDL string "bigint": Python ints are
# inferred as a 64-bit column by createDataFrame, and a 32-bit IntegerType
# result cannot hold the square of larger inputs.
square_udf = udf(_square, "bigint")

# Apply the UDF to the DataFrame, adding the result as a new column
result_df = df.withColumn("squared", square_udf(col("integer")))

# Show the result
result_df.show()

# Stop the Spark session
spark.stop()


                                                                                

+-------+-------+
|integer|squared|
+-------+-------+
|      1|      1|
|      2|      4|
|      3|      9|
|      4|     16|
|      5|     25|
+-------+-------+



Q2

In [14]:
from pyspark.sql import SparkSession
# Import Spark's aggregate under an alias: a bare `max` import would replace
# Python's built-in max() in the notebook namespace for every later cell.
from pyspark.sql.functions import col, max as spark_max

# Create a Spark session
spark = SparkSession.builder.appName("MaxOfNumbers").getOrCreate()

# Sample data - Replace this with your actual data
data = [(1,), (7,), (3,), (9,), (5,)]

# Create a DataFrame from the sample data
df = spark.createDataFrame(data, ["number"])

# Find the maximum value: the global aggregate yields a single row with a
# single column, so [0][0] extracts the scalar
max_value = df.agg(spark_max(col("number"))).collect()[0][0]

# Show the result
print("Maximum value:", max_value)

# Stop the Spark session
spark.stop()

Maximum value: 9


Q3

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg

# Session for the averaging example
session_builder = SparkSession.builder.appName("AverageOfNumbers")
spark = session_builder.getOrCreate()

# Sample data - swap in real values as needed (one single-element tuple per row)
data = [(n,) for n in range(1, 6)]

# One column named "number" holding the sample values
df = spark.createDataFrame(data, ["number"])

# Aggregate the whole column down to a single row, then pull out its only cell
average_row = df.select(avg(col("number"))).first()
average_value = average_row[0]

# Report the result
print("Average value:", average_value)

# Release the session now that the cell is done
spark.stop()

Average value: 3.0


Q4

In [16]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("ReadCSV").getOrCreate()

# Read the CSV file into a PySpark DataFrame
df = spark.read.csv("data.csv", header=True, inferSchema=True)

# Show the DataFrame
df.show()

# Stop the Spark session
spark.stop()

+----------------+-------+----------+----------+------+-------+---------+--------------------+--------------------+--------------------+--------------------+--------------+--------------+--------------+
|Series_reference| Period|Data_value|Suppressed|STATUS|  UNITS|Magnitude|             Subject|               Group|      Series_title_1|      Series_title_2|Series_title_3|Series_title_4|Series_title_5|
+----------------+-------+----------+----------+------+-------+---------+--------------------+--------------------+--------------------+--------------------+--------------+--------------+--------------+
|   BDCQ.SF1AA2CA|2016.06|  1116.386|      NULL|     F|Dollars|        6|Business Data Col...|Industry by finan...|Sales (operating ...|Forestry and Logging|Current prices|    Unadjusted|          NULL|
|   BDCQ.SF1AA2CA|2016.09|  1070.874|      NULL|     F|Dollars|        6|Business Data Col...|Industry by finan...|Sales (operating ...|Forestry and Logging|Current prices|    Unadjusted| 

Q5

In [17]:
# Imported here so the cell does not rely on an earlier cell having been run
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("DisplayDataFrame").getOrCreate()

# Define the path to your CSV file
csv_file_path = "data.csv"

# Read the CSV file into a PySpark DataFrame (first line is the header,
# column types are inferred from the data)
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Display the first few rows of the DataFrame
print("First few rows:")
df.show()

# Display the schema of the DataFrame
print("Schema:")
df.printSchema()

# Stop the Spark session
spark.stop()

First few rows:
+----------------+-------+----------+----------+------+-------+---------+--------------------+--------------------+--------------------+--------------------+--------------+--------------+--------------+
|Series_reference| Period|Data_value|Suppressed|STATUS|  UNITS|Magnitude|             Subject|               Group|      Series_title_1|      Series_title_2|Series_title_3|Series_title_4|Series_title_5|
+----------------+-------+----------+----------+------+-------+---------+--------------------+--------------------+--------------------+--------------------+--------------+--------------+--------------+
|   BDCQ.SF1AA2CA|2016.06|  1116.386|      NULL|     F|Dollars|        6|Business Data Col...|Industry by finan...|Sales (operating ...|Forestry and Logging|Current prices|    Unadjusted|          NULL|
|   BDCQ.SF1AA2CA|2016.09|  1070.874|      NULL|     F|Dollars|        6|Business Data Col...|Industry by finan...|Sales (operating ...|Forestry and Logging|Current prices|

Q6

In [18]:
# Imported here so the cell does not rely on an earlier cell having been run
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("SummaryStatistics").getOrCreate()

# Define the path to your CSV file
csv_file_path = "data.csv"  # Replace with the actual path to your CSV file

# Read the CSV file into a PySpark DataFrame (first line is the header,
# column types are inferred from the data)
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Choose the column for which you want to calculate summary statistics
selected_column = "Data_value"  # Replace with the actual column name

# Calculate summary statistics for the selected column only
column_summary = df.describe([selected_column])

# Show the summary statistics
column_summary.show()

# Stop the Spark session
spark.stop()

+-------+-----------------+
|summary|       Data_value|
+-------+-----------------+
|  count|             6520|
|   mean|4878.963066564418|
| stddev|7248.031249367176|
|    min|         -398.194|
|    max|        41541.633|
+-------+-----------------+



ADDITIONAL QUESTION

In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split

# Create a Spark session
spark = SparkSession.builder.appName("WordCount").getOrCreate()

# Define the path to your folder containing text files
text_folder_path = "resources/"  # Replace with the actual path to your text files

# Read text files into a PySpark DataFrame (one row per line, column "value")
df = spark.read.text(text_folder_path)

# Tokenize on runs of whitespace and drop empty tokens. Splitting on a single
# space would turn blank lines and repeated spaces into "" entries that are
# then counted as if they were a word.
words_df = (
    df.select(explode(split(df.value, r"\s+")).alias("word"))
    .filter("word != ''")
)

# Calculate the count of each word
word_counts = words_df.groupBy("word").count()

# Show the word counts
word_counts.show()

# Find the word with the maximum occurrence.
# The sorted result is fetched once so the word and its count come from the
# same row; the secondary sort on "word" makes the winner deterministic when
# several words share the highest count.
top_row = word_counts.orderBy(["count", "word"], ascending=[False, True]).first()

# first() returns None when there are no rows (e.g. the folder has no words)
max_occurrence_word = top_row["word"] if top_row is not None else None
max_occurrence_count = top_row["count"] if top_row is not None else 0

# Display the word with the maximum occurrence
print("Word with maximum occurrence:", max_occurrence_word)
print("Count of maximum occurrence:", max_occurrence_count)

# Stop the Spark session
spark.stop()

+-------------+-----+
|         word|count|
+-------------+-----+
|        Vide,|    1|
|          lex|    1|
|institutionis|    1|
|       vidit,|    1|
|          Sed|    6|
|         isti|    1|
|         nova|    1|
|         vera|    1|
|          rei|    1|
|      homini,|    1|
|       dicam,|    1|
|       quanta|    1|
|        dixti|    1|
|  voluptates?|    1|
|       cogit.|    1|
|        puto.|    3|
|           ne|    3|
|      inpune.|    1|
|       quoque|    1|
|      audivi,|    1|
+-------------+-----+
only showing top 20 rows

Word with maximum occurrence: non
Count of maximum occurrence: 17
