In [1]:
!apt-get install openjdk-11-jdk -y
!pip install -q pyspark


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxt-dev libxtst6 libxxf86dga1 openjdk-11-jre
  x11-utils
Suggested packages:
  libxt-doc openjdk-11-demo openjdk-11-source visualvm mesa-utils
The following NEW packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxt-dev libxtst6 libxxf86dga1 openjdk-11-jdk
  openjdk-11-jre x11-utils
0 upgraded, 10 newly installed, 0 to remove and 34 not upgraded.
Need to get 5,366 kB of archives.
After this operation, 15.2 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 fonts-dejavu-core all 2.37-2build1 [1,041 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 fonts-dejavu-extra all 2.37-2build1 [2,041 kB]
Get:3 http://archive.ubuntu.com/ubuntu jam

In [2]:
import os

# Point JAVA_HOME at the OpenJDK 11 directory so that processes started from
# this kernel see it in their environment.
# NOTE(review): the path is assumed to be where apt-get put openjdk-11-jdk — confirm.
os.environ.update({"JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64"})


In [3]:
from pyspark.sql import SparkSession

# Obtain the notebook-wide Spark session: `getOrCreate()` returns the active
# session when one already exists, otherwise it starts a new one with this
# application name.
spark = (
    SparkSession.builder
    .appName("Large Dataset Analysis")
    .getOrCreate()
)


In [10]:
# Generate a synthetic dataset with pandas/NumPy (1 million rows), write it to
# CSV, and load it back through Spark.
import pandas as pd
import numpy as np

N_ROWS = 1_000_000

# Dedicated file name for the synthetic data. Writing it to
# 'CHOCOLATE DATA SET.csv' would clobber the uploaded chocolate sales file that
# another cell reads under exactly that name.
SYNTHETIC_CSV = 'synthetic_values.csv'

# Seeded generator so the statistics computed from this data are reproducible.
rng = np.random.default_rng(42)

# Named `synthetic_pdf` (a pandas frame) rather than `df`, because `df` is the
# name used for the Spark DataFrame holding the chocolate data.
synthetic_pdf = pd.DataFrame({
    'id': range(1, N_ROWS + 1),
    'value': rng.standard_normal(N_ROWS),
})

# Save to CSV (without the pandas index column)
synthetic_pdf.to_csv(SYNTHETIC_CSV, index=False)

# Load with PySpark, using the header row for column names and inferring types
df_spark = spark.read.csv(SYNTHETIC_CSV, header=True, inferSchema=True)
df_spark.show(5)


+---+--------------------+
| id|               value|
+---+--------------------+
|  1|  0.4844342082019394|
|  2|-0.39758207185867667|
|  3|   0.598270081563896|
|  4|  0.8801434218712032|
|  5|  0.9821190409789695|
+---+--------------------+
only showing top 5 rows



In [11]:
from pyspark.sql.functions import floor

# Summary statistics (count, mean, stddev, min, max) for the `value` column.
df_spark.describe("value").show()

# Assign each row to the integer bucket floor(value), then count rows per
# bucket and list the buckets in ascending order.
df_buckets = df_spark.withColumn("bucket", floor(df_spark["value"]))
(
    df_buckets
    .groupBy("bucket")
    .count()
    .orderBy("bucket")
    .show()
)


+-------+--------------------+
|summary|               value|
+-------+--------------------+
|  count|             1000000|
|   mean|0.002727575939625...|
| stddev|  0.9987539950284879|
|    min|  -4.737757151519014|
|    max|  5.3552068524492356|
+-------+--------------------+

+------+------+
|bucket| count|
+------+------+
|    -5|    33|
|    -4|  1268|
|    -3| 21220|
|    -2|135146|
|    -1|341037|
|     0|342308|
|     1|136253|
|     2| 21405|
|     3|  1300|
|     4|    29|
|     5|     1|
+------+------+



In [12]:
# Optional larger-scale demo: build 10 million rows directly in Spark
# (ids 0 .. 9,999,999) instead of going through pandas.
rows = spark.range(10_000_000)

# `id % 1000` cycles through 0..999; subtracting 500 shifts that to -500..499.
rows = rows.withColumn("value", rows["id"] % 1000 - 500)

# Count rows per distinct value; show() prints the first 20 by default.
(
    rows
    .groupBy("value")
    .count()
    .orderBy("value")
    .show()
)


+-----+-----+
|value|count|
+-----+-----+
| -500|10000|
| -499|10000|
| -498|10000|
| -497|10000|
| -496|10000|
| -495|10000|
| -494|10000|
| -493|10000|
| -492|10000|
| -491|10000|
| -490|10000|
| -489|10000|
| -488|10000|
| -487|10000|
| -486|10000|
| -485|10000|
| -484|10000|
| -483|10000|
| -482|10000|
| -481|10000|
+-----+-----+
only showing top 20 rows



In [7]:
# Ask the user to upload a file via google.colab's `files.upload()` and keep
# the call's return value in `uploaded`. The recorded output of this cell shows
# 'CHOCOLATE DATA SET.csv' being saved.
# NOTE(review): this import only exists inside Google Colab, and the cell needs
# manual interaction, so it cannot be re-run unattended.
from google.colab import files
uploaded = files.upload()


Saving CHOCOLATE DATA SET.csv to CHOCOLATE DATA SET.csv


In [8]:
# Read the chocolate sales CSV into a Spark DataFrame, taking column names from
# the header row and letting Spark infer the column types.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("CHOCOLATE DATA SET.csv")
)

# Preview the first five rows.
df.show(n=5)


+--------------+---------+-------------------+---------+--------+-------------+
|  Sales Person|  Country|            Product|     Date|  Amount|Boxes Shipped|
+--------------+---------+-------------------+---------+--------+-------------+
|Jehu Rudeforth|       UK|    Mint Chip Choco|04-Jan-22| $5,320 |          180|
|   Van Tuxwell|    India|      85% Dark Bars|01-Aug-22| $7,896 |           94|
|  Gigi Bohling|    India|Peanut Butter Cubes|07-Jul-22| $4,501 |           91|
|  Jan Morforth|Australia|Peanut Butter Cubes|27-Apr-22|$12,726 |          342|
|Jehu Rudeforth|       UK|Peanut Butter Cubes|24-Feb-22|$13,685 |          184|
+--------------+---------+-------------------+---------+--------+-------------+
only showing top 5 rows

