In [None]:
%%html
<style>
/* Let plain-text stdout output take the full width of its content instead of
   being constrained by the cell width (presumably so wide show() tables do not wrap).
   The two selectors appear to target classic Notebook and JupyterLab output areas. */
.output_subarea.output_text.output_stream.output_stdout > pre,
.p-Widget.jp-RenderedText.jp-OutputArea-output > pre {
    width: max-content;
}
</style>

In [None]:
import time

import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag, first, count, max as spark_max, min as spark_min

In [None]:
# Reuse the active Spark session if there is one; otherwise create it under the demo's app name.
spark = (
    SparkSession.builder
    .appName("Spark Wrapper Demo")
    .getOrCreate()
)

In [None]:
from wrapper import SparkWrapper, string_match, union_all, regex_expr

# Toy dataset: one Revenue figure per (Product, Year) for products A-C over 2010-2012.
dic = dict(
    Product=['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
    Year=[2010, 2010, 2010, 2011, 2011, 2011, 2012, 2012, 2012],
    Revenue=[100, 200, 300, 110, 190, 320, 120, 220, 350],
)

# Go through pandas so Spark derives the schema from the pandas dtypes.
dic_to_df = spark.createDataFrame(pd.DataFrame(dic))

print(dic_to_df.Shape)
dic_to_df.show()
dic_to_df.printSchema()

# *Shape* SparkWrapper:
### The shape wrapper is added to the Spark DataFrame class to provide one of the most commonly used attributes of Pandas and NumPy objects. It is useful when you want to track the dimensions of a Spark DataFrame at a specific transformation stage and see how the number of rows and columns changes from one stage to the next.
> Important note: *Shape* is accessed as an attribute (no parentheses), like the traditional **.shape** of Pandas and NumPy objects; note the capital **S**, i.e. `df.Shape`.

### It returns: 
- Tuple with: (total row number, total column number), *such as (n, m) matrix dimension*

In [None]:
# Attribute access, not a call: the cell displays the (row count, column count) tuple.
dic_to_df.Shape

# *pctChange* from SparkWrapper:
### The *pctChange* wrapper is added to the Spark DataFrame class to provide one of the most commonly used methods of Pandas objects: it computes the relative percentage change from one observation to another, with the observations ordered by a sortable (date-like) column and the change computed over a lagged numeric column.
> Important note: you can call the *pctChange* method with no arguments, like the traditional **.pct_change** of Pandas DataFrame objects, or you can specify the parameters of the function explicitly. If no parameter is specified, the method will infer which column to sort by and which column to lag in order to get the **relative percentage change**.

### It returns: 
- A Spark DataFrame (the examples below call `.show()` and `.printSchema()` on it) holding the **relative percentage change** of the selected column(s)

### One way to use the *.pctChange* method is with no parameters specified, in which case it will infer which column to sort by and which column to lag in order to get the **relative percentage change**.

In [None]:
# No arguments: the wrapper chooses the sort and lag columns itself.
# show(5, False) prints at most 5 rows without truncating cell values.
dic_to_df.pctChange().show(5, False)

### Another option is to configure the parameters accepted by the function, which are the following:
- **periods**; this parameter controls the offset of the lag, in periods. The default value is 1, so unless it is overridden the result is computed against the previous (lag-1) observation
- **partition_by**; the partition parameter fixes the partition column of the DataFrame, e.g. _"bank_segment", "assurance_product_type"_
- **order_by**; the order-by parameter is the specific column used to order the sequential observations, e.g. _"balance_date", "trade_close_date", "contract_date"_
- **pct_cols**; the percentage change column(s) (pct_cols) are the specific column(s) to lag over, giving back the relative change from one element to the other, e.g. *$(x_{t} \div{x_{t-1}})$*

### In this case we will specify only the **periods** parameter to yield a lag of 2 periods over the DataFrame

In [None]:
# Lag by two observations instead of the default one; null results are replaced with 0 before display.
dic_to_df.pctChange(periods=2).na.fill(0).show(5, False)

### With parameters **partition_by, order_by & pct_cols**

In [None]:
# Fully explicit call: partition by Product, order by Year, percentage change on Revenue.
dic_to_df.pctChange(partition_by="Product", order_by="Year", pct_cols="Revenue").na.fill(0).show(5, False)

In [None]:
# pct_cols is given a list here (Year and Revenue) instead of a single column name.
dic_to_df.pctChange(partition_by="Product", order_by="Year", pct_cols=["Year", "Revenue"]).na.fill(0).show(5, False)

In [None]:
# Keep the per-product Revenue percentage change (nulls replaced with 0) for inspection.
pct_change_df = (
    dic_to_df
    .pctChange(partition_by="Product", order_by="Year", pct_cols="Revenue")
    .na.fill(0)
)

# Dimensions, a 5-row untruncated preview, and the resulting schema.
print(pct_change_df.Shape)
pct_change_df.show(5, False)
pct_change_df.printSchema()

# *Matrix* from SparkWrapper

In [None]:
# Input frame (dic_to_df), reproduced here for reference:
#+-------+----+-------+
#|Product|Year|Revenue|
#+-------+----+-------+
#|      A|2010|    100|
#|      B|2010|    200|
#|      C|2010|    300|
#|      A|2011|    110|
#|      B|2011|    190|
#|      C|2011|    320|
#|      A|2012|    120|
#|      B|2012|    220|
#|      C|2012|    350|
#+-------+----+-------+

# Aggregation spec: column name -> aggregation to apply.
agg_dict_test = dict(Revenue='max')

# Group by Product, pivot on Year, then sort on the resulting Product_Year column.
to_matrix_df = (
    dic_to_df
    .toMatrix(group_by="Product", pivot_col="Year", agg_dict=agg_dict_test)
    .orderBy("Product_Year")
)
print(to_matrix_df.Shape)
to_matrix_df.show(5, False)

# *Panel* from SparkWrapper

In [None]:
# presumably reshapes the pivoted matrix back into long (panel) form — confirm against wrapper.toPanel.
# NOTE(review): to_matrix_df was built with a 'max' aggregation, yet the new column is
# named 'mean_Revenue'; the label looks misleading — confirm the intended name.
to_matrix_df.toPanel(pivot_col='Product_Year', new_col='mean_Revenue').show(10, False)

# *Cartesian* from SparkWrapper

In [None]:
# Fetch the first five elements of what cartRDD returns (presumably an RDD, given the name and .take).
to_matrix_df.cartRDD('Product_Year').take(5)

# *CorrMat* from SparkWrapper

In [None]:
# Time how long it takes to compute and display the correlation matrix.
# `time` comes from the notebook's import cell. perf_counter() is used instead of
# time() because it is monotonic, so system clock adjustments cannot distort the result.
start = time.perf_counter()
to_matrix_df.corrMatrix().show()
end = time.perf_counter()

# Whole seconds only; the sub-second part is truncated by int().
print('elapsed:', int(end - start), 'segundos')

In [None]:
# Same call with offset=0.9; presumably a correlation threshold — confirm against wrapper.corrMatrix.
to_matrix_df.corrMatrix(offset=0.9).show()

# *corrStat* from SparkWrapper

In [None]:
# Reuses agg_dict_test ({'Revenue': 'max'}) defined in the Matrix example cell.
dic_to_df.corrStat('Year', 'Product', agg_dict_test).show()

# *uniqueRow* from SparkWrapper

In [None]:
# Bare expression: the notebook renders whatever uniqueRow returns for the 'Product' column.
dic_to_df.uniqueRow('Product')

In [None]:
# Same call on the numeric 'Year' column.
dic_to_df.uniqueRow('Year')

# *vecAssembler* from SparkWrapper

In [None]:
# Pass every column except the first one (presumably the Product_Year key — confirm) to the assembler.
to_matrix_df.vecAssembler(to_matrix_df.columns[1:]).show()

# *Join Small* from SparkWrapper

In [None]:
# NOTE(review): joinSmall is only referenced here, not called, so this cell just displays
# the attribute's repr. Presumably an actual call (with the DataFrame to join) was intended —
# confirm the signature in wrapper and complete the example.
to_matrix_df.joinSmall

# Test con más datos:

### Para fines de ejemplos con más variables se usa el csv de bank el cual contiene información de las campañas de marketing de una institución financiera portuguesa (nota: descargar el comprimido [bank.zip](https://archive.ics.uci.edu/ml/machine-learning-databases/00222/)) que pertenece al compendio de datasets para ML que dispone el repositorio de la universidad de UCI donde se puede encontrar el [diccionario de datos](https://archive.ics.uci.edu/ml/datasets/bank+marketing) para investigación en el campo de modelos de aprendizaje automático.

In [None]:
# Load the bank marketing CSV from the working directory; the first row holds the
# column names and Spark infers column types from the data.
bank_csv = spark.read.csv(
    'bank.csv',
    header=True,
    inferSchema=True,
)
bank_csv.show(5, False)

# *Sort Columns Asc* from SparkWrapper

In [None]:
# presumably returns the frame with its columns reordered in ascending name order — confirm against wrapper.sortColAsc.
bank_csv.sortColAsc().show(5, False)

------------------------------------------------------------------------------------------------------------------------------------------------------------

# FreqItems from Pyspark Vs freq_items from SparkWrapper

-----------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Frequent items for the three categorical columns with a minimum support of 0.5.
# NOTE(review): freqItems is also a built-in PySpark DataFrame method — confirm whether the wrapper overrides it.
bank_csv.freqItems(['job', 'marital', 'education'], support=0.5).show(5, False)

In [None]:
# NOTE(review): disabled cell — stratified sampleBy on 'marital' with a 0.5 fraction per stratum.
# Presumably kept for comparison with the wrapper's sampleN; re-enable or remove.
#bank_csv.sampleBy('marital', fractions={'married':0.5, 'single':0.5, 'divorced':0.5}).show(5, False)

In [None]:
# NOTE(review): disabled cell — sample() with replacement at fraction 0.5.
# Presumably kept for comparison with the wrapper's sampleN; re-enable or remove.
#bank_csv.sample(withReplacement=True, fraction=0.5).show(5, False)

-----------------------------------------------------------------------------------------------------------------------------------------------------------

# *Sample N* from SparkWrapper

In [None]:
# sampleN(20) presumably draws 20 rows (confirm against wrapper.sampleN); show() allows up to 30
# rows, so every sampled row is printed.
bank_csv.sampleN(20).show(30, False)

----------------------------------------------------------------------------------------------------------------------------------------------------------------

# *Select Regex, Select Contains & Regex Expr* from SparkWrapper

In [None]:
# Two ways of selecting columns from the same name fragments: regex_expr builds the
# pattern argument for selectRegex, while selectContains takes the fragments directly.
bank_csv.selectRegex(regex_expr(['day', 'ous', 'pr'])).show(5, False)
bank_csv.selectContains(['day', 'ous', 'pr']).show(5, False)

In [None]:
# Aggregation spec: column -> one aggregation name, or several in a comma-separated string.
agg_dict_ = dict(
    balance='mean, count, sum, stddev, var, min',
    housing='count, sum',
    loan='count',
    age='mean',
)

embedded_rep_test = bank_csv.toMatrix('education', 'deposit', agg_dict_)
embedded_rep_test.show()

# *Crosstab* from PySpark vs *Matrix* from SparkWrapper

In [None]:
# PySpark crosstab of education vs campaign on the rows selected by the
# 'contact == telephone' filter expression, sorted by the education_campaign column.
bank_csv.where(string_match('contact == telephone')).crosstab('education', 'campaign').orderBy('education_campaign').show()

In [None]:
# Counting 'balance' per (education, campaign) on the same filtered rows as the crosstab example.
test_agg_dict_ = dict(balance='count')

(
    bank_csv
    .where(string_match('contact == telephone'))
    .toMatrix('education', 'campaign', test_agg_dict_)
    .orderBy('education_campaign')
    .show()
)

# *crossPct* from SparkWrapper

In [None]:
# Aggregation spec passed to crossPct: column -> aggregation name.
_agg_dict_ = dict(
    balance='mean',
    age='mean',
    loan='count',
)

test = bank_csv.crossPct(
    'education',
    'deposit',
    _agg_dict_,
    cols='all',
)
test.show()

In [None]:
# Reshape the crossPct result (`test`) with toPanel; the new column names are given as ['label', 'value'].
tra = test.toPanel('education_deposit', ['label', 'value'])
# Up to 50 rows, untruncated, followed by the resulting schema.
tra.show(50, False)
tra.printSchema()

# *Select StartsWith & EndsWith* from SparkWrapper

In [None]:
# Both selectors are called with a single string and with a list of strings.
bank_csv.selectStartswith('m').show(5, False)
bank_csv.selectStartswith(['m', 'd']).show(5, False)

bank_csv.selectEndswith('y').show(5, False)
bank_csv.selectEndswith(['y', 'h']).show(5, False)

In [None]:
# Call sampleN(5) three times in a row — presumably to show that each call draws a different sample.
for i in range(3):
    bank_csv.sampleN(5).show()

# *ForEach Col* from SparkWrapper

In [None]:
# The same spec drives both the matrix construction and the per-column pass below.
test_3_dict = dict(
    balance='mean',
    housing='count',
    age='mean',
)

test_3 = bank_csv.toMatrix('education', 'deposit', test_3_dict)

# Apply 'sum' through foreachCol and display the result.
test_3.foreachCol(test_3_dict, 'sum').show()

# *ResumeDF & tabularTable* from SparkWrapper

In [None]:
# Spec shared by the toMatrix and foreachCol calls below.
test_4_dict = dict(
    balance='mean',
    age='mean',
)

# Build the education x deposit matrix, then apply 'sum' through foreachCol.
test_4 = bank_csv.toMatrix('education', 'deposit', test_4_dict)
sum_df = test_4.foreachCol(test_4_dict, 'sum')

# Pass the result through resumeDF, naming the new column 'education_deposit'.
sum_df.resumeDF(new_col='education_deposit').show()

In [None]:
# Same three arguments as the toMatrix call that built test_4 (education, deposit, test_4_dict).
bank_csv.tabularTable('education', 'deposit', test_4_dict).show()

# *Empty Scan* from SparkWrapper

In [None]:
# presumably reports empty/null values per column — confirm against wrapper.emptyScan.
bank_csv.emptyScan().show()

# *Union All* from SparkWrapper

In [None]:
# Row count of the single frame, for comparison with the union.
print('before union:', bank_csv.count())

# Union five references to the same DataFrame.
u = union_all([bank_csv] * 5)

print('after union:', u.count())
u.show(5, False)

In [None]:
# Rolling 'mean' over 'balance' with window=20; 'month' is presumably the ordering
# (or partition) column — confirm against wrapper.rollingDown.
bank_csv.rollingDown('balance', 'month', method='mean', window=20).show(25, False)