In [1]:
import pandas as pd

# Sample Superstore orders, one CSV record per list entry (header first).
csv_rows = [
    "OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit",
    "CA-1001,2023-01-15,Ravi,Consumer,South,Laptop,Technology,Computers,1,55000,0.10,5000",
    "CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800",
    "CA-1003,2023-01-25,Amit,Consumer,East,Notebook,Office Supplies,Paper,3,200,0.05,150",
    "CA-1004,2023-03-01,Anita,Home Office,West,Table,Furniture,Tables,1,18000,0.20,-1500",
    "CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.00,3000",
]
# Every record, including the last one, is newline-terminated.
csv_data = "\n".join(csv_rows) + "\n"

# Persist the text to disk as superstore.csv
with open("superstore.csv", "w") as csv_file:
    csv_file.write(csv_data)

# Parse the file back into a pandas DataFrame and display it
df = pd.read_csv("superstore.csv")
print(df)


   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        Furniture       Tables         1      18000      0.20   -1500  
4       Technology       Phones         2      20000      0.00    3000  


In [2]:
# Structural overview: first rows, (rows, columns) shape, and per-column dtypes.
print(" Head:", df.head(), sep="\n")
print("\n Shape:", df.shape)
print("\n Data types:", df.dtypes, sep="\n")


 Head:
   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        Furniture       Tables         1      18000      0.20   -1500  
4       Technology       Phones         2      20000      0.00    3000  

 Shape: (5, 12)

 Data types:
OrderID         object
OrderDate       object
Customer        object
Segment         object
Region          object
Product         ob

In [3]:
# Project the frame down to the three columns of interest.
selected_columns = ["Customer", "Product", "Profit"]
print("\n Selected Columns:")
print(df[selected_columns])



 Selected Columns:
  Customer   Product  Profit
0     Ravi    Laptop    5000
1    Priya   Printer    1800
2     Amit  Notebook     150
3    Anita     Table   -1500
4    Divya     Phone    3000


In [4]:
# Keep only orders that earned more than 2000 in profit and carried no discount.
is_high_profit = df["Profit"] > 2000
is_undiscounted = df["Discount"] == 0.0
filtered_df = df[is_high_profit & is_undiscounted]

print("\n Orders with Profit > 2000 and Discount = 0:")
print(filtered_df)



 Orders with Profit > 2000 and Discount = 0:
   OrderID   OrderDate Customer   Segment Region Product    Category  \
4  CA-1005  2023-02-05    Divya  Consumer  South   Phone  Technology   

  SubCategory  Quantity  UnitPrice  Discount  Profit  
4      Phones         2      20000       0.0    3000  


In [5]:
# Order the rows from the most to the least profitable order.
sorted_df = df.sort_values("Profit", ascending=False)
print("\n Sorted by Profit (Descending):", sorted_df, sep="\n")



 Sorted by Profit (Descending):
   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
4       Technology       Phones         2      20000      0.00    3000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        Furniture       Tables         1      18000      0.20   -1500  


In [7]:
# Per-category summary: total profit and mean discount, with Category kept as
# a regular column rather than the index.
category_groups = df.groupby("Category")
grouped_df = category_groups.agg(
    Total_Profit=pd.NamedAgg(column="Profit", aggfunc="sum"),
    Avg_Discount=pd.NamedAgg(column="Discount", aggfunc="mean"),
).reset_index()

print("\n Grouped by Category:")
print(grouped_df)



 Grouped by Category:
          Category  Total_Profit  Avg_Discount
0        Furniture         -1500      0.200000
1  Office Supplies           150      0.050000
2       Technology          9800      0.083333


In [8]:
# Line total = quantity ordered x unit price (the discount is not applied here).
df["TotalPrice"] = df["Quantity"].mul(df["UnitPrice"])

total_price_view = ["OrderID", "Quantity", "UnitPrice", "TotalPrice"]
print("\n DataFrame with TotalPrice:")
print(df[total_price_view])



 DataFrame with TotalPrice:
   OrderID  Quantity  UnitPrice  TotalPrice
0  CA-1001         1      55000       55000
1  CA-1002         2      12000       24000
2  CA-1003         3        200         600
3  CA-1004         1      18000       18000
4  CA-1005         2      20000       40000


In [9]:
# Remove the SubCategory column. errors="ignore" keeps this cell re-runnable:
# because it rebinds `df`, a second execution would otherwise raise KeyError
# when the column is already gone.
df = df.drop(columns=["SubCategory"], errors="ignore")
print("\n DataFrame after dropping 'SubCategory':")
print(df.columns)



 DataFrame after dropping 'SubCategory':
Index(['OrderID', 'OrderDate', 'Customer', 'Segment', 'Region', 'Product',
       'Category', 'Quantity', 'UnitPrice', 'Discount', 'Profit',
       'TotalPrice'],
      dtype='object')


In [10]:
# Blank out the first row's Discount so there is a missing value to repair
df.loc[0, "Discount"] = None

# Replace every missing Discount with the default rate of 0.10
default_discount = 0.10
df["Discount"] = df["Discount"].fillna(value=default_discount)

print("\n Discount after filling nulls:", df["Discount"], sep="\n")



 Discount after filling nulls:
0    0.10
1    0.15
2    0.05
3    0.20
4    0.00
Name: Discount, dtype: float64


In [11]:
def classify(row):
    """Bucket an order by its 'Profit' value.

    Args:
        row: Any mapping-like object indexable by the key 'Profit'.

    Returns:
        'High' when profit exceeds 4000, 'Medium' for any other positive
        profit, and 'Low' otherwise (zero or negative).
    """
    if row['Profit'] > 4000:
        return 'High'
    if row['Profit'] > 0:
        return 'Medium'
    return 'Low'

# Run classify once per row (axis="columns" hands each row to the function)
df["ProfitLevel"] = df.apply(classify, axis="columns")

profit_level_view = ["OrderID", "Profit", "ProfitLevel"]
print("\n DataFrame with Profit Level:")
print(df[profit_level_view])



 DataFrame with Profit Level:
   OrderID  Profit ProfitLevel
0  CA-1001    5000        High
1  CA-1002    1800      Medium
2  CA-1003     150      Medium
3  CA-1004   -1500         Low
4  CA-1005    3000      Medium


In [12]:
!pip install pyspark




In [13]:
from pyspark.sql import SparkSession

# Step 1: Obtain (or reuse) a Spark session for this notebook
session_builder = SparkSession.builder.appName("SuperstoreAssignment")
spark = session_builder.getOrCreate()

# Step 2: Read the CSV, taking column names from the header row and letting
# Spark infer each column's type
csv_reader = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
)
df = csv_reader.csv("superstore.csv")

# Step 3: Print the inferred schema, then the first 5 rows
df.printSchema()
df.show(5)


root
 |-- OrderID: string (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- SubCategory: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Profit: integer (nullable = true)

+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|
|CA-1002|2023-02-20|   Priya|  

In [14]:
# Keep five columns, exposing Customer under the name Client.
df_selected = df.select(
    "OrderID",
    "OrderDate",
    df["Customer"].alias("Client"),
    "Product",
    "Profit",
)
df_selected.show()


+-------+----------+------+--------+------+
|OrderID| OrderDate|Client| Product|Profit|
+-------+----------+------+--------+------+
|CA-1001|2023-01-15|  Ravi|  Laptop|  5000|
|CA-1002|2023-02-20| Priya| Printer|  1800|
|CA-1003|2023-01-25|  Amit|Notebook|   150|
|CA-1004|2023-03-01| Anita|   Table| -1500|
|CA-1005|2023-02-05| Divya|   Phone|  3000|
+-------+----------+------+--------+------+



In [15]:
# Consumer-segment orders that earned less than 1000 in profit.
is_consumer = df["Segment"] == "Consumer"
is_low_profit = df["Profit"] < 1000
filtered_df = df.filter(is_consumer & is_low_profit)
filtered_df.show()


+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer| Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1003|2023-01-25|    Amit|Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+



In [16]:
from pyspark.sql.functions import avg

# Mean profit per region, reported under the column name Average_Profit.
average_profit = avg("Profit").alias("Average_Profit")
df.groupBy("Region").agg(average_profit).show()


+------+--------------+
|Region|Average_Profit|
+------+--------------+
| South|        4000.0|
|  East|         150.0|
|  West|       -1500.0|
| North|        1800.0|
+------+--------------+



In [17]:
from pyspark.sql.functions import col

# Line total = quantity ordered x unit price (the discount is not applied here).
total_price = col("Quantity") * col("UnitPrice")
df = df.withColumn("TotalPrice", total_price)
df.select("OrderID", "Quantity", "UnitPrice", "TotalPrice").show()


+-------+--------+---------+----------+
|OrderID|Quantity|UnitPrice|TotalPrice|
+-------+--------+---------+----------+
|CA-1001|       1|    55000|     55000|
|CA-1002|       2|    12000|     24000|
|CA-1003|       3|      200|       600|
|CA-1004|       1|    18000|     18000|
|CA-1005|       2|    20000|     40000|
+-------+--------+---------+----------+



In [18]:
from pyspark.sql.functions import when

# Label each order: profit above 2000 is "High", zero or negative is "Loss",
# and anything in between is "Medium". Conditions are checked in this order.
profit = col("Profit")
profit_level = (
    when(profit > 2000, "High")
    .when(profit <= 0, "Loss")
    .otherwise("Medium")
)
df = df.withColumn("ProfitLevel", profit_level)
df.select("OrderID", "Profit", "ProfitLevel").show()


+-------+------+-----------+
|OrderID|Profit|ProfitLevel|
+-------+------+-----------+
|CA-1001|  5000|       High|
|CA-1002|  1800|     Medium|
|CA-1003|   150|     Medium|
|CA-1004| -1500|       Loss|
|CA-1005|  3000|       High|
+-------+------+-----------+



In [19]:
# Remove the SubCategory column and list what is left.
df = df.drop("SubCategory")
print(" SubCategory dropped. Remaining columns:", df.columns, sep="\n")


 SubCategory dropped. Remaining columns:
['OrderID', 'OrderDate', 'Customer', 'Segment', 'Region', 'Product', 'Category', 'Quantity', 'UnitPrice', 'Discount', 'Profit', 'TotalPrice', 'ProfitLevel']


In [20]:
# Replace missing Discount values with the default rate of 0.10; no other
# column is touched.
df = df.fillna(0.10, subset=["Discount"])
df.select("OrderID", "Discount").show()


+-------+--------+
|OrderID|Discount|
+-------+--------+
|CA-1001|     0.1|
|CA-1002|    0.15|
|CA-1003|    0.05|
|CA-1004|     0.2|
|CA-1005|     0.0|
+-------+--------+



In [21]:
from pyspark.sql.functions import to_date, year, month

# Normalise OrderDate to a date using the yyyy-MM-dd pattern, then derive the
# calendar year and month from it.
df = (
    df
    .withColumn("OrderDate", to_date("OrderDate", "yyyy-MM-dd"))
    .withColumn("Year", year("OrderDate"))
    .withColumn("Month", month("OrderDate"))
)

df.select("OrderID", "OrderDate", "Year", "Month").show()


+-------+----------+----+-----+
|OrderID| OrderDate|Year|Month|
+-------+----------+----+-----+
|CA-1001|2023-01-15|2023|    1|
|CA-1002|2023-02-20|2023|    2|
|CA-1003|2023-01-25|2023|    1|
|CA-1004|2023-03-01|2023|    3|
|CA-1005|2023-02-05|2023|    2|
+-------+----------+----+-----+



In [22]:
!pip install dask




In [23]:
import dask.dataframe as dd

# Open superstore.csv as a Dask DataFrame
df = dd.read_csv("superstore.csv")

# Preview the first five rows; as the cell's last expression the result is
# displayed as the cell output
df.head(n=5)


Unnamed: 0,OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit
0,CA-1001,2023-01-15,Ravi,Consumer,South,Laptop,Technology,Computers,1,55000,0.1,5000
1,CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800
2,CA-1003,2023-01-25,Amit,Consumer,East,Notebook,Office Supplies,Paper,3,200,0.05,150
3,CA-1004,2023-03-01,Anita,Home Office,West,Table,Furniture,Tables,1,18000,0.2,-1500
4,CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.0,3000


In [24]:
# Describe the per-category mean discount, then materialise it with compute().
mean_discount_by_category = df.groupby("Category")["Discount"].mean()
avg_discount = mean_discount_by_category.compute()

print(" Average Discount by Category:\n", avg_discount, sep="\n")


 Average Discount by Category:

Category
Furniture          0.200000
Office Supplies    0.050000
Technology         0.083333
Name: Discount, dtype: float64


In [25]:
# Multi-unit orders that earned more than 2000 in profit.
is_multi_unit = df["Quantity"] > 1
is_high_profit = df["Profit"] > 2000
filtered = df[is_multi_unit & is_high_profit]

# compute() turns the Dask selection into a concrete result
filtered_df = filtered.compute()

print(" Filtered Orders (Quantity > 1 and Profit > 2000):\n", filtered_df, sep="\n")


 Filtered Orders (Quantity > 1 and Profit > 2000):

   OrderID   OrderDate Customer   Segment Region Product    Category  \
4  CA-1005  2023-02-05    Divya  Consumer  South   Phone  Technology   

  SubCategory  Quantity  UnitPrice  Discount  Profit  
4      Phones         2      20000       0.0    3000  


In [26]:
# Export the filtered rows without the index column.
output_name = "filtered_superstore.csv"
filtered_df.to_csv(output_name, index=False)
print(f" Saved as {output_name}")


 Saved as filtered_superstore.csv


In [27]:
# google.colab is only importable inside a Colab runtime. Guard the import so
# the notebook still runs top to bottom in any other Jupyter environment.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # Ask the browser to download the exported CSV.
    files.download("filtered_superstore.csv")
else:
    print(" google.colab not available; skipping download of filtered_superstore.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
# Step 1: Two sample orders as a JSON array; each order nests its customer and
# detail fields inside sub-objects
json_data = """
[
  {
    "OrderID": "CA-1001",
    "Customer": {"Name": "Ravi", "Segment": "Consumer"},
    "Details": {"Region": "South", "Profit": 5000}
  },
  {
    "OrderID": "CA-1002",
    "Customer": {"Name": "Priya", "Segment": "Corporate"},
    "Details": {"Region": "North", "Profit": 1800}
  }
]
"""

# Write the text verbatim to orders.json
with open("orders.json", "w") as json_file:
    json_file.write(json_data)


In [29]:
# Step 2: Read orders.json with Spark; the multiLine option is enabled because
# the JSON document spans several lines
json_reader = spark.read.option("multiLine", True)
df_json = json_reader.json("orders.json")

# Show how the nested objects were mapped into the schema
df_json.printSchema()

# Pull nested fields out with dotted paths
nested_fields = ["OrderID", "Customer.Name", "Details.Profit"]
df_json.select(*nested_fields).show()


root
 |-- Customer: struct (nullable = true)
 |    |-- Name: string (nullable = true)
 |    |-- Segment: string (nullable = true)
 |-- Details: struct (nullable = true)
 |    |-- Profit: long (nullable = true)
 |    |-- Region: string (nullable = true)
 |-- OrderID: string (nullable = true)

+-------+-----+------+
|OrderID| Name|Profit|
+-------+-----+------+
|CA-1001| Ravi|  5000|
|CA-1002|Priya|  1800|
+-------+-----+------+

