In [1]:
import pandas as pd
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, lag, first, count, max as spark_max, min as spark_min

In [2]:
import OpSpark

In [3]:
# Obtain the notebook's Spark session: getOrCreate() returns the already-active
# session when there is one, otherwise it starts a new one under this app name.
spark = (
    SparkSession.builder
    .appName("Correlation Matrix Spark")
    .getOrCreate()
)

In [4]:
# Toy panel data set: three products (A, B, C) observed in 2010, 2011 and 2012,
# stored in long format with one row per (Product, Year) pair.
dic = dict(
    Product=['A', 'B', 'C'] * 3,
    Year=[2010, 2010, 2010, 2011, 2011, 2011, 2012, 2012, 2012],
    Revenue=[100, 200, 300, 110, 190, 320, 120, 220, 350],
)

# Go through pandas so Spark picks up the column names and dtypes from the frame.
dic_to_df = spark.createDataFrame(pd.DataFrame(dic))

# `.shape` is not available on a stock PySpark DataFrame; presumably it is
# provided by the OpSpark import -- TODO confirm.
print(dic_to_df.shape)
dic_to_df.show()

(9, 3)
+-------+----+-------+
|Product|Year|Revenue|
+-------+----+-------+
|      A|2010|    100|
|      B|2010|    200|
|      C|2010|    300|
|      A|2011|    110|
|      B|2011|    190|
|      C|2011|    320|
|      A|2012|    120|
|      B|2012|    220|
|      C|2012|    350|
+-------+----+-------+



In [18]:
# NOTE(review): this cell has execution count 18 but sits between cells 4 and 5,
# i.e. it was run after the rest of the notebook. It repeats the shape already
# printed when `dic_to_df` was created, so a fresh "Restart & Run All" will
# renumber it. `.shape` is presumably supplied by OpSpark -- TODO confirm.
dic_to_df.shape

(9, 3)

In [5]:
# One-period percentage change of Revenue, computed within each Product and
# ordered by Year. `.na.fill(0)` then replaces nulls with 0 (the 2010 rows are
# displayed as 0.0). Show the first 5 rows without truncating cell contents.
(
    dic_to_df
    .pct_change(
        periods=1,
        partition_by="Product",
        order_by="Year",
        pct_cols="Revenue",
    )
    .na.fill(0)
    .show(5, truncate=False)
)

+-------+----+-------+---------------------+
|Product|Year|Revenue|pct_change           |
+-------+----+-------+---------------------+
|B      |2010|200    |0.0                  |
|B      |2011|190    |-0.050000000000000044|
|B      |2012|220    |0.1578947368421053   |
|C      |2010|300    |0.0                  |
|C      |2011|320    |0.06666666666666665  |
+-------+----+-------+---------------------+
only showing top 5 rows



In [6]:
# Same call, but with a list of columns: the result carries one
# "<column>_pct_change" column per entry instead of a single "pct_change".
(
    dic_to_df
    .pct_change(
        periods=1,
        partition_by="Product",
        order_by="Year",
        pct_cols=["Year", "Revenue"],
    )
    .na.fill(0)
    .show(5, truncate=False)
)

+-------+----+-------+--------------------+---------------------+
|Product|Year|Revenue|Year_pct_change     |Revenue_pct_change   |
+-------+----+-------+--------------------+---------------------+
|B      |2010|200    |0.0                 |0.0                  |
|B      |2011|190    |4.975124378110429E-4|-0.050000000000000044|
|B      |2012|220    |4.972650422674363E-4|0.1578947368421053   |
|C      |2010|300    |0.0                 |0.0                  |
|C      |2011|320    |4.975124378110429E-4|0.06666666666666665  |
+-------+----+-------+--------------------+---------------------+
only showing top 5 rows



In [7]:
# Call pct_change with no arguments to exercise its defaults; on this data set
# the displayed rows match the explicit Product/Year/Revenue call.
(
    dic_to_df
    .pct_change()
    .na.fill(0)
    .show(5, truncate=False)
)

+-------+----+-------+---------------------+
|Product|Year|Revenue|pct_change           |
+-------+----+-------+---------------------+
|B      |2010|200    |0.0                  |
|B      |2011|190    |-0.050000000000000044|
|B      |2012|220    |0.1578947368421053   |
|C      |2010|300    |0.0                  |
|C      |2011|320    |0.06666666666666665  |
+-------+----+-------+---------------------+
only showing top 5 rows



In [8]:
# Pass 2 positionally (presumably the `periods` argument -- TODO confirm):
# only the 2012 rows get a non-zero value, e.g. B goes 200 -> 220 = +10%.
(
    dic_to_df
    .pct_change(2)
    .na.fill(0)
    .show(5, truncate=False)
)

+-------+----+-------+-------------------+
|Product|Year|Revenue|pct_change         |
+-------+----+-------+-------------------+
|B      |2010|200    |0.0                |
|B      |2011|190    |0.0                |
|B      |2012|220    |0.10000000000000009|
|C      |2010|300    |0.0                |
|C      |2011|320    |0.0                |
+-------+----+-------+-------------------+
only showing top 5 rows



In [10]:
# Keep the Revenue percentage-change frame for the pivot/correlation cells
# below; nulls are replaced by 0 so the "pct_change" column is non-nullable.
pct_change_df = (
    dic_to_df
    .pct_change(partition_by="Product", order_by="Year", pct_cols="Revenue")
    .na.fill(0)
)

# Inspect the result: dimensions, a sample of rows, and the column types.
print(pct_change_df.shape)
pct_change_df.show(5, truncate=False)
pct_change_df.printSchema()

(9, 4)
+-------+----+-------+---------------------+
|Product|Year|Revenue|pct_change           |
+-------+----+-------+---------------------+
|B      |2010|200    |0.0                  |
|B      |2011|190    |-0.050000000000000044|
|B      |2012|220    |0.1578947368421053   |
|C      |2010|300    |0.0                  |
|C      |2011|320    |0.06666666666666665  |
+-------+----+-------+---------------------+
only showing top 5 rows

root
 |-- Product: string (nullable = true)
 |-- Year: long (nullable = true)
 |-- Revenue: long (nullable = true)
 |-- pct_change: double (nullable = false)



In [11]:
# Aggregation spec for the pivot: take the first "pct_change" value found for
# each (Product, Year) cell.
agg_dict_test = dict(pct_change='first')

# Long -> wide: one row per Product, one column per Year, sorted by Product.
to_matrix_df = (
    pct_change_df
    .toMatrix(groupBy_col="Product", pivot_col="Year", agg_dict=agg_dict_test)
    .orderBy("Product")
)
print(to_matrix_df.shape)
to_matrix_df.show(5, truncate=False)

(3, 4)
+-------+----+---------------------+-------------------+
|Product|2010|2011                 |2012               |
+-------+----+---------------------+-------------------+
|A      |0.0 |0.10000000000000009  |0.09090909090909083|
|B      |0.0 |-0.050000000000000044|0.1578947368421053 |
|C      |0.0 |0.06666666666666665  |0.09375            |
+-------+----+---------------------+-------------------+



In [12]:
# Wide -> long with toPanel: the year columns are unpivoted into the two
# columns named here, keeping "Product" as the identifier column.
new_col_list = ["Year", "pct_change"]
(
    to_matrix_df
    .toPanel(pivot_col="Product", new_columns=new_col_list)
    .show(7, truncate=False)
)

+-------+----+---------------------+
|Product|Year|pct_change           |
+-------+----+---------------------+
|A      |2010|0.0                  |
|A      |2011|0.10000000000000009  |
|A      |2012|0.09090909090909083  |
|B      |2010|0.0                  |
|B      |2011|-0.050000000000000044|
|B      |2012|0.1578947368421053   |
|C      |2010|0.0                  |
+-------+----+---------------------+
only showing top 7 rows



In [13]:
# `transpose` called with the same arguments as `toPanel`; the displayed rows
# are the same long-format (Product, Year, pct_change) table.
new_col_list = ["Year", "pct_change"]
(
    to_matrix_df
    .transpose(pivot_col="Product", new_columns=new_col_list)
    .show(7, truncate=False)
)

+-------+----+---------------------+
|Product|Year|pct_change           |
+-------+----+---------------------+
|A      |2010|0.0                  |
|A      |2011|0.10000000000000009  |
|A      |2012|0.09090909090909083  |
|B      |2010|0.0                  |
|B      |2011|-0.050000000000000044|
|B      |2012|0.1578947368421053   |
|C      |2010|0.0                  |
+-------+----+---------------------+
only showing top 7 rows



In [14]:
# `T` called with the same arguments again; this time the long-format result is
# kept so it can be pivoted back into a matrix in a later cell.
new_col_list = ["Year", "pct_change"]
to_panel_df = to_matrix_df.T(
    pivot_col="Product",
    new_columns=new_col_list,
)
print(to_panel_df.shape)
to_panel_df.show(7, truncate=False)

(9, 3)
+-------+----+---------------------+
|Product|Year|pct_change           |
+-------+----+---------------------+
|A      |2010|0.0                  |
|A      |2011|0.10000000000000009  |
|A      |2012|0.09090909090909083  |
|B      |2010|0.0                  |
|B      |2011|-0.050000000000000044|
|B      |2012|0.1578947368421053   |
|C      |2010|0.0                  |
+-------+----+---------------------+
only showing top 7 rows



In [15]:
# Round trip: pivot the long-format panel back to the wide Product x Year
# layout, again taking the first "pct_change" value per cell.
agg_dict_test2 = dict(pct_change='first')
(
    to_panel_df
    .toMatrix(groupBy_col="Product", pivot_col="Year", agg_dict=agg_dict_test2)
    .show(7, truncate=False)
)

+-------+----+---------------------+-------------------+
|Product|2010|2011                 |2012               |
+-------+----+---------------------+-------------------+
|A      |0.0 |0.10000000000000009  |0.09090909090909083|
|B      |0.0 |-0.050000000000000044|0.1578947368421053 |
|C      |0.0 |0.06666666666666665  |0.09375            |
+-------+----+---------------------+-------------------+



In [16]:
# Pairwise Pearson coefficients between products, returned in long form as
# (Product_m_dim, Product_n_dim, pearson_coefficient) rows.
# NOTE(review): the `pivot_col` keyword is not part of stock PySpark's
# DataFrame.corr(col1, col2); presumably OpSpark overrides it -- TODO confirm.
(
    to_matrix_df
    .corr(pivot_col="Product")
    .show()
)

+-------------+-------------+-------------------+
|Product_m_dim|Product_n_dim|pearson_coefficient|
+-------------+-------------+-------------------+
|            A|            A|                1.0|
|            A|            B|0.20731725371079707|
|            A|            C| 0.9334853274873337|
|            B|            A|0.20731725371079707|
|            B|            B|                1.0|
|            B|            C| 0.5443518400395578|
|            C|            A| 0.9334853274873337|
|            C|            B| 0.5443518400395579|
|            C|            C| 0.9999999999999999|
+-------------+-------------+-------------------+



In [17]:
# toCorrelationMatrix() with no arguments; on this frame it displays the same
# product-by-product Pearson table as corr(pivot_col="Product").
(
    to_matrix_df
    .toCorrelationMatrix()
    .show()
)

+-------------+-------------+-------------------+
|Product_m_dim|Product_n_dim|pearson_coefficient|
+-------------+-------------+-------------------+
|            A|            A|                1.0|
|            A|            B|0.20731725371079707|
|            A|            C| 0.9334853274873337|
|            B|            A|0.20731725371079707|
|            B|            B|                1.0|
|            B|            C| 0.5443518400395578|
|            C|            A| 0.9334853274873337|
|            C|            B| 0.5443518400395579|
|            C|            C| 0.9999999999999999|
+-------------+-------------+-------------------+

