In [1]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

In [2]:
spark = SparkSession.builder.master("local[*]").appName("Datamanipulation").getOrCreate()

In [3]:
# read our data - lives in a csv file

df = spark.read.option("header","true").csv("Sample - EU Superstore.csv")

# how many rows of the EU Superstore dataset have the country being France


In [7]:
df.columns

['Row ID',
 'Order ID',
 'Order Date',
 'Ship Date',
 'Ship Mode',
 'Customer ID',
 'Customer Name',
 'Segment',
 'City',
 'State',
 'Country',
 'Region',
 'Product ID',
 'Category',
 'Sub-Category',
 'Product Name',
 'Sales',
 'Quantity',
 'Discount',
 'Profit']

In [10]:
france_rows=df['country']=='France'

In [15]:
france_rows=df.filter(df['country']=='France').count()

In [18]:
print(f'there are {france_rows} rows for france')

there are 2827 rows for france


# of those, how many are profitable?



In [4]:
df.filter((df["Country"].isin("France")) & (df["Profit"] > 0.0)).count()

2329

# how any different discount brackets exist? what are they?


In [9]:
df.select("Discount").distinct().count()


14

In [10]:
df.select("Discount").distinct().show()

+--------+
|Discount|
+--------+
|     0.3|
|     0.7|
|       0|
|     0.2|
|    0.15|
|    0.35|
|     0.8|
|    0.45|
|     0.5|
|    0.65|
|     0.6|
|     0.1|
|    0.85|
|     0.4|
+--------+



# let's see the totl profit by discount bracket, make sure they are ordered by 


In [14]:
profit_discount = df.groupby("Discount").agg({"Profit":"sum"})

profit_discount = profit_discount.withColumn("sum(Profit)", f.round(profit_discount["sum(Profit)"], 2)).\
orderBy("sum(Profit)", ascending=False).withColumnRenamed("sum(Profit)", "totalProfit")
profit_discount.show()


+--------+-----------+
|Discount|totalProfit|
+--------+-----------+
|       0|  383806.53|
|     0.1|  126884.03|
|    0.15|   24677.56|
|     0.2|    2189.55|
|     0.8|    -460.28|
|     0.3|    -758.42|
|    0.45|   -1103.19|
|    0.85|   -3068.66|
|     0.7|   -5496.77|
|    0.65|   -6221.97|
|    0.35|   -9122.65|
|     0.6|  -20517.46|
|     0.4|  -21346.43|
|     0.5|  -96632.12|
+--------+-----------+



# what is the value after which we should stop offering discount?



In [16]:
profit_discount.filter(profit_discount["totalProfit"] > 0).orderBy("totalProfit", ascending=True).select("Discount").collect()[0][0]

'0.2'

# who are the top 5 most profitable customers



In [4]:
 top_customers = df.groupBy("Customer ID", "Customer Name").agg({"Profit":"sum"})

top_customers = top_customers.orderBy("sum(Profit)", ascending=False).limit(5)
top_customers.show()


+-----------+-----------------+------------------+
|Customer ID|    Customer Name|       sum(Profit)|
+-----------+-----------------+------------------+
|   SP-20920|     Susan Pistek| 4974.512999999999|
|   PJ-18835|    Patrick Jones|3986.0039999999995|
|   PO-18865|Patrick O'Donnell|          3778.197|
|   EB-13840|    Ellis Ballard|           3459.66|
|   MG-18145|  Mike Gockenbach|3144.4439999999995|
+-----------+-----------------+------------------+



In [45]:
top_customers=top_customers.withColumnRenamed('Customer Name','customer name').drop('sum(Profit)')

In [9]:
top_5 = top_customers.collect()[:5]

In [11]:
top_names=[i[1] for i in top_5]

In [12]:
top_names

['Susan Pistek',
 'Patrick Jones',
 "Patrick O'Donnell",
 'Ellis Ballard',
 'Mike Gockenbach']

# get all the rows belonging to those 5 customer names: hint, you may need the collect method - how many rows are they?



In [13]:
df.filter(df["Customer Name"].isin(top_names)).count()


76

# create a new column which is the value of the sale were there not discount applied. Hint: orginal = sales/(1-d)

In [16]:
df = df.withColumn("original", f.round(df["Sales"] / (1-df["Discount"]), 2))

In [17]:
df.select('original').show()

+--------+
|original|
+--------+
|    79.2|
|  388.92|
|   35.19|
|   50.94|
|  307.44|
|   122.4|
|  413.82|
|  428.22|
| 3979.29|
|   43.56|
|   25.26|
| 2715.45|
|   12.21|
| 2549.76|
|  153.45|
|  142.65|
|  690.12|
|    8.16|
|  347.88|
|  639.45|
+--------+
only showing top 20 rows



# calculate the difference between sales and discount value


In [18]:
df.select('Sales','original','Discount').show()

+--------+--------+--------+
|   Sales|original|Discount|
+--------+--------+--------+
|    79.2|    79.2|       0|
|  388.92|  388.92|       0|
|   35.19|   35.19|       0|
|   50.94|   50.94|       0|
|  307.44|  307.44|       0|
|   122.4|   122.4|       0|
|  413.82|  413.82|       0|
|  428.22|  428.22|       0|
| 3979.29| 3979.29|       0|
|   43.56|   43.56|       0|
|   25.26|   25.26|       0|
|2443.905| 2715.45|     0.1|
|   12.21|   12.21|       0|
|2167.296| 2549.76|    0.15|
| 138.105|  153.45|     0.1|
| 128.385|  142.65|     0.1|
|  690.12|  690.12|       0|
|    8.16|    8.16|       0|
|  347.88|  347.88|       0|
| 575.505|  639.45|     0.1|
+--------+--------+--------+
only showing top 20 rows



In [19]:
df = df.withColumn("Discount_Values", f.round(df["Sales"] - (df["Sales"] * df["Discount"]), 2))


In [20]:
df.select('Sales','original','Discount','discount_values').show()

+--------+--------+--------+---------------+
|   Sales|original|Discount|discount_values|
+--------+--------+--------+---------------+
|    79.2|    79.2|       0|           79.2|
|  388.92|  388.92|       0|         388.92|
|   35.19|   35.19|       0|          35.19|
|   50.94|   50.94|       0|          50.94|
|  307.44|  307.44|       0|         307.44|
|   122.4|   122.4|       0|          122.4|
|  413.82|  413.82|       0|         413.82|
|  428.22|  428.22|       0|         428.22|
| 3979.29| 3979.29|       0|        3979.29|
|   43.56|   43.56|       0|          43.56|
|   25.26|   25.26|       0|          25.26|
|2443.905| 2715.45|     0.1|        2199.51|
|   12.21|   12.21|       0|          12.21|
|2167.296| 2549.76|    0.15|         1842.2|
| 138.105|  153.45|     0.1|         124.29|
| 128.385|  142.65|     0.1|         115.55|
|  690.12|  690.12|       0|         690.12|
|    8.16|    8.16|       0|           8.16|
|  347.88|  347.88|       0|         347.88|
| 575.505|

# how much money did we not gain due to the discounts - per discount bracket?


In [22]:
df = df.withColumn("losses", f.round(df["Sales"] - df["Discount_Values"],2))

In [29]:
df.select('Discount','losses').groupBy('Discount').\
sum().withColumn('sum(losses)',f.round('sum(losses)')).orderBy('sum(losses)',ascending=False).show()

+--------+-----------+
|Discount|sum(losses)|
+--------+-----------+
|     0.5|    91866.0|
|     0.1|    76241.0|
|    0.15|    38448.0|
|     0.4|    28035.0|
|    0.35|    18956.0|
|     0.6|    15858.0|
|     0.2|     8522.0|
|    0.65|     4277.0|
|     0.7|     2560.0|
|     0.3|     1841.0|
|    0.45|     1146.0|
|    0.85|      677.0|
|     0.8|      127.0|
|       0|        0.0|
+--------+-----------+



# find the discount bracket which made us not gain the most (dynamically)



In [39]:
bad_discount=df.select('Discount','losses').groupBy('Discount').\
sum().withColumn('sum(losses)',f.round('sum(losses)')).orderBy('sum(losses)',ascending=False).collect()[0][0]

In [40]:
bad_discount

'0.5'

# what would have been the total profit if we removed all orders from that discount group? 


In [36]:
df.columns

['Row ID',
 'Order ID',
 'Order Date',
 'Ship Date',
 'Ship Mode',
 'Customer ID',
 'Customer Name',
 'Segment',
 'City',
 'State',
 'Country',
 'Region',
 'Product ID',
 'Category',
 'Sub-Category',
 'Product Name',
 'Sales',
 'Quantity',
 'Discount',
 'Profit',
 'original',
 'Discount_Values',
 'losses']

In [37]:
df = df.withColumn("Profit", df["Profit"].cast("float"))

In [63]:
profit_without_discount = df.filter(df["Discount"] != bad_discount).agg({'profit':'sum'})

In [64]:
profit_without_discount.show()

+------------------+
|       sum(profit)|
+------------------+
|469461.85667362204|
+------------------+



# how much more (or less) profit is that?


In [68]:
profit= df.select('Profit').agg({'Profit':'sum'})


In [81]:
 def more():
        if profit_without_discount.collect()[0][0] > profit.collect()[0][0]:
            print("Total Profit without bad discount is more : ")
            return profit_without_discount.collect()[0][0] - profit.collect()[0][0]
        else:
        	print("Total Profit with bad discount is more : ")
        	return profit.collect()[0][0]- profit_without_discount.collect()[0][0]
more()

Total Profit without bad discount is more : 


96632.11525454745

# create a temporary table for our superstore table in sql


In [98]:
df.createOrReplaceTempView("sdf")


# use an SQL query to count the number of rows


In [100]:
spark.sql("SELECT COUNT(sdf.Sales) FROM sdf").show()


+------------+
|count(Sales)|
+------------+
|       10000|
+------------+



# Use an SQL query to calculate the profit ratio for each country: hint, ratio is sum(profit)/sum(sales)


In [102]:
spark.sql("SELECT sdf.country, sum(sdf.profit) / sum(sdf.sales) as ratio FROM sdf GROUP BY country ORDER BY ratio desc").show()

+--------------+--------------------+
|       country|               ratio|
+--------------+--------------------+
|   Switzerland|  0.2909201193350232|
|       Austria|  0.2641908775042505|
|        Norway|  0.2517747548521659|
|       Belgium| 0.23508766583987942|
|United Kingdom| 0.21170103540397134|
|         Spain| 0.18941580658358978|
|       Finland| 0.18864296633316185|
|       Germany| 0.17066792076621765|
|        France| 0.12693568221933804|
|         Italy| 0.06844355185424991|
|       Ireland|-0.44426677493909256|
|       Denmark| -0.4957190005664471|
|   Netherlands| -0.5298342790541865|
|        Sweden| -0.5745674280714466|
|      Portugal| -0.5761662270806188|
+--------------+--------------------+



# is the country with the largest profit ratio, the country with the largest profit?



In [104]:

highest_ratio = spark.sql("SELECT country, sum(profit) / sum(sales) as ratio FROM sdf GROUP BY country ORDER BY ratio desc").take(1)[0][0]

highest_profit = spark.sql("SELECT country, sum(sdf.Profit) as profit FROM sdf GROUP BY country ORDER BY profit desc").take(1)[0][0]

print(f"The country has highest ratio is {highest_ratio} and largest profit is {highest_profit}")


The country has highest ratio is Switzerland and largest profit is United Kingdom
