In [2]:
import pyspark.sql
import pyspark.sql.functions as sf

from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook: it connects to the
# standalone master at spark-master:7077 and asks for 512 MB per executor.
spark = (
    SparkSession.builder
    .appName("best_one")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

In [9]:
# Load the sales CSV: the first row holds the column names, column types are
# inferred from the data, and fields are comma-separated.
data = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True, sep=',')
    .load("WA_Sales_Products_2012-14_Updated.csv")
)

In [12]:
# Print the first row vertically (one field per line), truncating long values.
data.show(n=1, truncate=True, vertical=True)

-RECORD 0---------------------------------
 Country           | United States        
 Order method type | Fax                  
 Retailer type     | Outdoors Shop        
 Product line      | Camping Equipment    
 Item Type         | Cooking Gear         
 Product           | TrailChef Deluxe ... 
 Year              | 2012                 
 Quarter           | Q1 2012              
 Total Revenue     | 59628.66             
 Units Sold        | 489                  
 Unit Cost         | 121.94               
 Gross margin      | 0.34754797           
 Total Profit      | -18233.4345          
only showing top 1 row



In [13]:
# Pivot function (is this a pivot?)
# Count the distinct values of each categorical column in a single pass to see
# how many columns a pivot on that column would produce.
# NOTE(review): the dataset shows this column as "Item Type"; the lowercase
# "Item type" below presumably relies on Spark's case-insensitive column
# resolution - confirm before changing the session's case-sensitivity setting.
result = data.select(
    *[
        sf.countDistinct(column_name)
        for column_name in (
            "Country",
            "Retailer type",
            "Product line",
            "Item type",
            "Quarter",
        )
    ]
)

result.toPandas()

Unnamed: 0,count(DISTINCT Country),count(DISTINCT Retailer type),count(DISTINCT Product line),count(DISTINCT Item type),count(DISTINCT Quarter)
0,21,8,5,21,11


In [17]:
# Spark
# Pivot: one row per (Quarter, Country), one column per distinct "Product line"
# value, each cell holding the summed "Total Revenue".
revenue_per_product_line = (
    data
    .groupBy("Quarter", "Country")
    .pivot("Product line")
    .agg(sf.sum("Total Revenue"))
)
revenue_per_product_line.toPandas()

Unnamed: 0,Quarter,Country,Camping Equipment,Golf Equipment,Mountaineering Equipment,Outdoor Protection,Personal Accessories
0,Q3 2013,Sweden,1433530.62,1250510.97,794786.44,48039.27,3577140.77
1,Q4 2012,Spain,3264717.34,1593436.54,954726.68,211146.20,3991933.57
2,Q2 2013,Italy,5873795.00,2924732.03,1966086.80,111329.82,5921051.85
3,Q3 2012,United States,15847275.46,6085923.58,4055966.08,914399.41,22332252.19
4,Q1 2014,Switzerland,3966205.47,2157061.01,1640871.99,53438.20,5615068.69
...,...,...,...,...,...,...,...
226,Q2 2014,Sweden,2886658.67,1917255.46,1539344.09,37180.62,4531302.06
227,Q2 2014,Belgium,3880015.73,1722833.33,1505551.13,47924.34,5111504.00
228,Q2 2014,United States,24552601.92,11032775.44,9602092.49,302668.32,36187901.13
229,Q2 2013,Belgium,3575529.48,1627402.32,1223276.89,72646.57,4154780.66


In [None]:
# Let's build an aggregate keyed by Country and Product Line, with the quantity per quarter in columns

# UnPivot

In [18]:
# Per product line: unpivot by hand. Each pivoted column becomes its own
# long-format frame (Quarter, Country, Product line, Total Revenue), and the
# frames are then stacked with union.

revenue_camping = revenue_per_product_line.select(
    "Quarter",
    "Country",
    sf.lit("Camping Equipment").alias("Product line"),
    revenue_per_product_line["Camping Equipment"].alias("Total Revenue"),
)

revenue_golf = revenue_per_product_line.select(
    "Quarter",
    "Country",
    sf.lit("Golf Equipment").alias("Product line"),
    revenue_per_product_line["Golf Equipment"].alias("Total Revenue"),
)

result = revenue_camping.union(revenue_golf)

result.limit(10).toPandas()

Unnamed: 0,Quarter,Country,Product line,Total Revenue
0,Q3 2013,Sweden,Camping Equipment,1433530.62
1,Q4 2012,Spain,Camping Equipment,3264717.34
2,Q2 2013,Italy,Camping Equipment,5873795.0
3,Q3 2012,United States,Camping Equipment,15847275.46
4,Q1 2014,Switzerland,Camping Equipment,3966205.47
5,Q1 2013,China,Camping Equipment,7745789.4
6,Q2 2012,Germany,Camping Equipment,5315912.78
7,Q2 2014,Austria,Camping Equipment,4221897.72
8,Q1 2014,China,Camping Equipment,9213181.55
9,Q4 2012,France,Camping Equipment,6286894.57


In [20]:
import functools

# Unpivot a single column
def unpivot_column(df, other, pivot_column, pivot_value, result_column):
    """
    Turn one pivoted column of ``df`` into long format.

    df - input data frame
    other - names of the columns to carry through unchanged
    pivot_column - name of the new column that holds ``pivot_value`` as a literal
    pivot_value - name of the pivoted column being unpivoted
    result_column - name given to the values taken from the ``pivot_value`` column

    Returns a frame with the ``other`` columns followed by ``pivot_column`` and
    ``result_column``, in that order.
    """
    carried = [df[name] for name in other]
    label = sf.lit(pivot_value).alias(pivot_column)
    values = df[pivot_value].alias(result_column)
    return df.select(*carried, label, values)

# Unpivot multiple columns
def unpivot(df, pivot_column, pivot_values, result_column):
    """
    Unpivot several columns of ``df`` into long format.

    df - input data frame
    pivot_column - the name of the new column containing each pivot column name
    pivot_values - the pivoted column names to unpivot (any iterable); must
        contain at least one name, otherwise functools.reduce raises TypeError
    result_column - the name of the column containing the values of the pivot columns

    Every column of ``df`` that is not listed in ``pivot_values`` is treated as
    a common column and is carried through unchanged on every output row.

    Returns the union of one frame per pivot value, each produced by
    ``unpivot_column`` with the same column order.
    """
    # Materialise once: the values are needed both for the membership test and
    # for the loop below, and a one-shot iterable (e.g. a generator) would be
    # exhausted by the membership test.
    pivot_values = list(pivot_values)
    common_columns = [f.name for f in df.schema.fields if f.name not in pivot_values]
    unpivot_dfs = [
        unpivot_column(df, common_columns, pivot_column, v, result_column)
        for v in pivot_values
    ]
    return functools.reduce(lambda x, y: x.union(y), unpivot_dfs)

In [21]:
# Unpivot only these two product lines. unpivot() treats every column that is
# not listed here as a common column, so the remaining pivoted columns are
# carried through unchanged on each row.
product_lines = ["Camping Equipment", "Golf Equipment"]
result_per_product_line = unpivot(
    df=revenue_per_product_line,
    pivot_column="Product Line",
    pivot_values=product_lines,
    result_column="Revenue",
)
result_per_product_line.toPandas()

Unnamed: 0,Quarter,Country,Mountaineering Equipment,Outdoor Protection,Personal Accessories,Product Line,Revenue
0,Q3 2013,Sweden,794786.44,48039.27,3577140.77,Camping Equipment,1433530.62
1,Q4 2012,Spain,954726.68,211146.20,3991933.57,Camping Equipment,3264717.34
2,Q2 2013,Italy,1966086.80,111329.82,5921051.85,Camping Equipment,5873795.00
3,Q3 2012,United States,4055966.08,914399.41,22332252.19,Camping Equipment,15847275.46
4,Q1 2014,Switzerland,1640871.99,53438.20,5615068.69,Camping Equipment,3966205.47
...,...,...,...,...,...,...,...
457,Q2 2014,Sweden,1539344.09,37180.62,4531302.06,Golf Equipment,1917255.46
458,Q2 2014,Belgium,1505551.13,47924.34,5111504.00,Golf Equipment,1722833.33
459,Q2 2014,United States,9602092.49,302668.32,36187901.13,Golf Equipment,11032775.44
460,Q2 2013,Belgium,1223276.89,72646.57,4154780.66,Golf Equipment,1627402.32
