In [8]:
%%html
<style>
/* Let the <pre> of stdout stream outputs take the width of its content
   (presumably so wide DataFrame .show() tables are not wrapped; this looks
   like the classic Notebook selector — confirm). */
.output_subarea.output_text.output_stream.output_stdout > pre {
    width:max-content;
}
/* Same rule for rendered text outputs (the jp- prefixed classes look like
   the JupyterLab selector — confirm). */
.p-Widget.jp-RenderedText.jp-OutputArea-output > pre {
   width:max-content;
}
</style>

In [9]:
import pandas as pd
from pyspark.sql.functions import col, lag, first, count, max as spark_max, min as spark_min

In [10]:
import os
import visualize
from com.ophelia.OpheliaMain import Ophelia
module_path = os.path.abspath(os.path.join('..'))

In [11]:
# Create the Ophelia entry point for this project and build its Spark context;
# the log lines below this cell are emitted by these two calls.
ophelia = Ophelia("Risk Classification Demo")
sc = ophelia.Spark.build_spark_context()


21:00:51.464 Ophelia [INFO] ¡Hi! My name is Ophelia Vendata
21:00:51.465 Ophelia [INFO] I am an artificial assistant for data mining & ML engine with spark
21:00:51.465 Ophelia [INFO] Welcome to Ophelia spark miner engine
21:00:51.465 Ophelia [INFO] Lib Version Ophelia.dev1.0
21:00:51.465 Ophelia [WARN] V for Vendata...

21:00:51.465 Ophelia [WARN] Initializing Spark Session
21:00:51.490 Ophelia [INFO] Spark Version: 3.0.0
21:00:51.490 Ophelia [INFO] This Is: 'Risk Classification Demo' Project
21:00:51.490 Ophelia [INFO] Spark Context Initialized Success


In [12]:
spark = ophelia.SparkSession

In [13]:
from com.ophelia.wrapper import SparkWrapper, string_match, union_all
from com.ophelia.utils import regex_expr

# Toy dataset: one Revenue value per (Product, Year) pair, three products over three years.
dic = {
    'Product': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
    'Year': [2010, 2010, 2010, 2011, 2011, 2011, 2012, 2012, 2012],
    'Revenue': [100, 200, 300, 110, 190, 320, 120, 220, 350]
}
# The dictionary goes through a pandas DataFrame before becoming a Spark DataFrame.
dic_to_df = spark.createDataFrame(pd.DataFrame(data=dic))
# `Shape` is read as an attribute (no call parentheses).
print(dic_to_df.Shape)
dic_to_df.show()
dic_to_df.printSchema()

(9, 3)
+-------+----+-------+
|Product|Year|Revenue|
+-------+----+-------+
|      A|2010|    100|
|      B|2010|    200|
|      C|2010|    300|
|      A|2011|    110|
|      B|2011|    190|
|      C|2011|    320|
|      A|2012|    120|
|      B|2012|    220|
|      C|2012|    350|
+-------+----+-------+

root
 |-- Product: string (nullable = true)
 |-- Year: long (nullable = true)
 |-- Revenue: long (nullable = true)



# *Shape* SparkWrapper:
### The *Shape* wrapper is added to the Spark DataFrame class to provide one of the most commonly used attributes of Pandas and NumPy objects. It is useful when you want to track the dimensions of a Spark DataFrame at a specific transformation stage and see how the number of rows and columns changes along the way.
> Important note: *Shape* is accessed as an attribute (`df.Shape`, without parentheses), like the traditional **.shape** of Pandas and NumPy objects.

### It returns: 
- Tuple with: (total row number, total column number), *such as (n, m) matrix dimension*

In [14]:
dic_to_df.Shape

(9, 3)

# *pctChange* from SparkWrapper:
### The *pctChange* wrapper is added to the Spark DataFrame class to provide one of the most commonly used Pandas methods: it computes the relative percentage change from one observation to the next, ordered by a sortable (e.g. date-type) column and lagged over a numeric column.
> Important note: you can call the *pctChange* method with no arguments, like the traditional **.pct_change** of Pandas DataFrame objects, or you can specify the parameters of the function explicitly. If no parameter is specified, the method will infer which column to sort by and which column to lag in order to get the **relative percentage change**.

### It returns: 
- A Spark DataFrame holding the relative percentage change of the lagged column(s)

### One of the many ways to use the *.pctChange* method is with no parameters specified, in which case it will infer which column to sort by and which column to lag in order to get the **relative percentage change**.

In [8]:
dic_to_df.pctChange().show(5, False)

+-------------------+
|Revenue            |
+-------------------+
|null               |
|1.0                |
|0.5                |
|-0.6333333333333333|
|0.7272727272727273 |
+-------------------+
only showing top 5 rows



### Another option is to configure all the parameters the function accepts, which are:
- **periods**: controls the offset of the lag; since the default value is 1, the result is a lag-1 DataFrame unless another value is given
- **partition_by**: fixes the partition column over the DataFrame, e.g. _"bank_segment", "assurance_product_type"_
- **order_by**: the specific column used to order the sequential observations, e.g. _"balance_date", "trade_close_date", "contract_date"_
- **pct_cols**: the percentage-change column(s), i.e. the specific column(s) to lag over, giving back the relative change from one element to the next, e.g. *$(x_{t} - x_{t-1}) \div x_{t-1}$*

### In this case we will specify only the **periods** parameter to yield a lag of 2 periods over the DataFrame

In [None]:
dic_to_df.pctChange(periods=2).na.fill(0).show(5, False)

### With parameters **partition_by, order_by & pct_cols**

In [None]:
dic_to_df.pctChange(partition_by="Product", order_by="Year", pct_cols="Revenue").na.fill(0).show(5, False)

In [None]:
dic_to_df.pctChange(partition_by="Product", order_by="Year", pct_cols=["Year", "Revenue"]).na.fill(0).show(5, False)

In [None]:
# Percentage change of Revenue, partitioned by Product and ordered by Year;
# nulls in the result are replaced with 0 before it is inspected.
pct_change_df = dic_to_df.pctChange(partition_by="Product", order_by="Year", pct_cols="Revenue").na.fill(0)
print(pct_change_df.Shape)
pct_change_df.show(5, False)
pct_change_df.printSchema()

# *Matrix* from SparkWrapper

In [9]:
#+-------+----+-------+
#|Product|Year|Revenue|
#+-------+----+-------+
#|      A|2010|    100|
#|      B|2010|    200|
#|      C|2010|    300|
#|      A|2011|    110|
#|      B|2011|    190|
#|      C|2011|    320|
#|      A|2012|    120|
#|      B|2012|    220|
#|      C|2012|    350|
#+-------+----+-------+

# Aggregation spec: column name -> aggregation to apply.
agg_dict_test = {'Revenue': 'max'}
# Group by Product, pivot on Year and aggregate Revenue with max; the output
# below shows one row per product and one `<year>_Revenue_max` column per year.
to_matrix_df = dic_to_df.toMatrix(group_by="Product", pivot_col="Year", agg_dict=agg_dict_test).orderBy("Product_Year")
print(to_matrix_df.Shape)
to_matrix_df.show(5, False)

(3, 4)
+------------+----------------+----------------+----------------+
|Product_Year|2010_Revenue_max|2011_Revenue_max|2012_Revenue_max|
+------------+----------------+----------------+----------------+
|A           |100             |110             |120             |
|B           |200             |190             |220             |
|C           |300             |320             |350             |
+------------+----------------+----------------+----------------+



# *Panel* from SparkWrapper

In [7]:
to_matrix_df.toPanel(pivot_col='Product_Year', new_col='mean_Revenue').show(10, False)

NameError: name 'to_matrix_df' is not defined

# *Cartesian* from SparkWrapper

In [None]:
to_matrix_df.cartRDD('Product_Year').take(5)

# *CorrMat* from SparkWrapper

In [None]:
import time
# Wall-clock timing around computing and displaying the correlation matrix.
start = time.time()
to_matrix_df.corrMatrix().show()
end = time.time()
# The elapsed time is truncated to whole seconds before printing.
print('elapsed:', int(end - start), 'segundos')

In [None]:
to_matrix_df.corrMatrix(offset=0.9).show()

# *corrStat* from SparkWrapper

In [None]:
dic_to_df.corrStat('Year', 'Product', agg_dict_test).show()

# *uniqueRow* from SparkWrapper

In [None]:
dic_to_df.uniqueRow('Product')

In [None]:
dic_to_df.uniqueRow('Year')

# *vecAssembler* from SparkWrapper

In [None]:
to_matrix_df.vecAssembler(to_matrix_df.columns[1:]).show()

# *Join Small* from SparkWrapper

In [None]:
# NOTE(review): `joinSmall` is referenced here without being called, so this
# cell only evaluates the attribute itself — confirm the intended arguments.
to_matrix_df.joinSmall

In [95]:
#from pyspark.ml.linalg import Vectors
#from pyspark.ml.stat import ChiSquareTest
#dataset = [[0, Vectors.dense([0, 0, 1])],
#           [0, Vectors.dense([1, 0, 1])],
#           [1, Vectors.dense([2, 1, 1])],
#           [1, Vectors.dense([3, 1, 1])]]
#dataset = spark.createDataFrame(dataset, ["label", "features"])
#dataset.show(5, False)
#chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
#chiSqResult.show(5, False)
#chiSqResult.select("degreesOfFreedom").show()

# Test with more data

In [15]:
# Load the bank dataset from a relative CSV path, using the first row as the
# header and letting Spark infer the column types.
bank_csv = spark.read.csv('data/raw/csv/bank.csv', header=True, inferSchema=True)
bank_csv.show(5, False)

+---+----------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
|age|job       |marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|
+---+----------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
|59 |admin.    |married|secondary|no     |2343   |yes    |no  |unknown|5  |may  |1042    |1       |-1   |0       |unknown |yes    |
|56 |admin.    |married|secondary|no     |45     |no     |no  |unknown|5  |may  |1467    |1       |-1   |0       |unknown |yes    |
|41 |technician|married|secondary|no     |1270   |yes    |no  |unknown|5  |may  |1389    |1       |-1   |0       |unknown |yes    |
|55 |services  |married|secondary|no     |2476   |yes    |no  |unknown|5  |may  |579     |1       |-1   |0       |unknown |yes    |
|54 |admin.    |married|tertiary |no     |184    |no     |no  |unknown|5  |m

In [230]:
# Value to search for; per this cell's output it is the element at index 713037.
target = 127106153880.0

# Search input: ((i + 2) / 4) * i for i = 1 .. 999999, which grows with i.
array = [((i+2)/4)*i for i in range(1, 1000000)]
array[713037]

127106153880.0

In [231]:
def binaryHelperSearch(array, target, left_p, right_p):
    """Binary-search ``array[left_p:right_p + 1]`` for ``target``.

    Args:
        array: indexable sequence, expected to be sorted in ascending order.
        target: value to look for.
        left_p: lowest index still under consideration (inclusive).
        right_p: highest index still under consideration (inclusive).

    Returns:
        The index at which ``target`` was found, or -1 when the window
        becomes empty without a match.
    """
    low, high = left_p, right_p
    while low <= high:
        middle = (low + high) // 2
        candidate = array[middle]
        if target == candidate:
            return middle
        if target < candidate:
            # The match, if any, lies strictly left of the middle.
            high = middle - 1
        else:
            # The match, if any, lies strictly right of the middle.
            low = middle + 1
    return -1


def binarySearch(array, target):
    """Return the index of ``target`` in the sorted ``array``, or -1 if absent."""
    last_index = len(array) - 1
    return binaryHelperSearch(array, target, 0, last_index)

In [232]:
binarySearch(array, target)

713037

In [157]:
from pyspark.sql.types import StringType

def __rand_int_suffix(col, n_rows):
    """Append a random integer suffix to a string value.

    Args:
        col: string value to be suffixed (the per-row value when this
            function is wrapped in a UDF).
        n_rows: upper bound (inclusive) of the random suffix; the lower
            bound is 1.

    Returns:
        ``col`` followed by ``'_'`` and a random integer in ``[1, n_rows]``.
    """
    # Imported locally because no visible cell of the notebook imports
    # `randint`; without it the function only works through leftover kernel
    # state and fails with NameError on a fresh kernel.
    from random import randint

    random_suffix = str(randint(1, n_rows))
    return col + '_' + random_suffix

def skewness_reducer(col_to_rand, n_rows):
    """Build a column expression that appends a random suffix to ``col_to_rand``.

    Args:
        col_to_rand: column (name or Column) whose values receive the suffix.
        n_rows: inclusive upper bound of the random suffix, passed to the
            UDF as a literal.

    Returns:
        The UDF column expression, aliased as ``'rand_id'``.
    """
    # Imported locally because no visible cell of the notebook imports `udf`
    # or `lit`; without them the function fails with NameError on a fresh
    # kernel.
    from pyspark.sql.functions import lit, udf

    udf_rand_suffix = udf(__rand_int_suffix, StringType())
    return udf_rand_suffix(col_to_rand, lit(n_rows)).alias('rand_id')

def join_skewness(self, df, on, how):
    """Join ``self`` with ``df``.

    Args:
        self: left-hand DataFrame (the function is written in method style).
        df: right-hand DataFrame.
        on: join key(s), forwarded to ``join``.
        how: join type, forwarded to ``join``.

    Returns:
        The result of ``self.join(df, on, how)``.
    """
    # The draft of this function contained two unfinished statements
    # (`skewness_reducer(on, df.)` and a bare `remove_skewness_self`) that
    # stopped the whole cell from parsing. They never contributed to the
    # returned value, so they are removed here.
    # TODO: apply the salted key from `skewness_reducer` to both sides before
    # joining, once the salting strategy is defined.
    return self.join(df, on, how)

In [158]:
bank_csv.select('*', skewness_reducer('education', bank_csv.Shape[0])).show()

+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+--------------+
|age|        job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|       rand_id|
+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+--------------+
| 59|     admin.| married|secondary|     no|   2343|    yes|  no|unknown|  5|  may|    1042|       1|   -1|       0| unknown|    yes|secondary_6916|
| 56|     admin.| married|secondary|     no|     45|     no|  no|unknown|  5|  may|    1467|       1|   -1|       0| unknown|    yes|secondary_6308|
| 41| technician| married|secondary|     no|   1270|    yes|  no|unknown|  5|  may|    1389|       1|   -1|       0| unknown|    yes|secondary_8919|
| 55|   services| married|secondary|     no|   2476|    yes|  no|unknown|  5|  may|     579|       1|   -1

# *Sort Columns Asc* from SparkWrapper

In [None]:
bank_csv.sortColAsc().show(5, False)

------------------------------------------------------------------------------------------------------------------------------------------------------------

# FreqItems from Pyspark Vs freq_items from SparkWrapper

-----------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
bank_csv.freqItems(['job', 'marital', 'education'], support=0.5).show(5, False)

In [None]:
bank_csv.sampleBy('marital', fractions={'married':0.5, 'single':0.5, 'divorced':0.5}).show(5, False)

In [None]:
bank_csv.sample(withReplacement=True, fraction=0.5).show(5, False)

-----------------------------------------------------------------------------------------------------------------------------------------------------------

# *Sample N* from SparkWrapper

In [None]:
bank_csv.sampleN(20).show(30, False)

----------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
#from itertools import chain
#from pyspark.sql.functions import create_map, lit
#
#map_values = {
#    'no': '0',
#    'yes': '1'
#}
#mapping_expr = create_map([lit(x) for x in chain(*map_values.items())])
#test = bank_csv.select('*', (mapping_expr[bank_csv['deposit']]).alias('bin_deposit'))

# *Select Regex, Select Contains & Regex Expr* from SparkWrapper

In [None]:
bank_csv.selectRegex(regex_expr(['day', 'ous', 'pr'])).show(5, False)
bank_csv.selectContains(['day', 'ous', 'pr']).show(5, False)

In [None]:
# Aggregation spec: each key is a column and each value is a comma-separated
# list of aggregation names to apply to it.
agg_dict_ = {
    'balance': 'mean, count, sum, stddev, var, min',
    'housing': 'count, sum',
    'loan': 'count',
    'age': 'mean'
}
# Positional arguments follow the (group_by, pivot_col, agg_dict) order used
# with keywords in the earlier toMatrix cell.
embedded_rep_test = bank_csv.toMatrix('education', 'deposit', agg_dict_)
embedded_rep_test.show()

# *Crosstab* from Pyspark Vs Matrix from SparkWrapper

In [None]:
bank_csv.where(string_match('contact == telephone')).crosstab('education', 'campaign').orderBy('education_campaign').show()

In [None]:
test_agg_dict_ = {
    'balance': 'count'
}
bank_csv.where(string_match('contact == telephone')).toMatrix('education', 'campaign', test_agg_dict_).orderBy('education_campaign').show()

# *crossPct* from SparkWrapper

In [None]:
# Aggregations requested per column: mean of balance, mean of age and a count over loan.
_agg_dict_ = {
    'balance': 'mean',
    'age': 'mean',
    'loan': 'count'
}
# NOTE(review): the positional arguments presumably follow toMatrix's
# (group_by, pivot_col, agg_dict) order — confirm against crossPct's signature.
test = bank_csv.crossPct('education', 'deposit', _agg_dict_, cols='all')
test.show()

In [None]:
tra = test.toPanel('education_deposit', ['label', 'value'])
tra.show(50, False)
tra.printSchema()

# *Select StartsWith & EndsWith* from SparkWrapper

In [None]:
bank_csv.selectStartswith('m').show(5, False)
bank_csv.selectStartswith(['m', 'd']).show(5, False)

bank_csv.selectEndswith('y').show(5, False)
bank_csv.selectEndswith(['y', 'h']).show(5, False)

In [None]:
for i in range(3):
    bank_csv.sampleN(5).show()

# *ForEach Col* from SparkWrapper

In [None]:
test_3_dict = {
    'balance': 'mean',
    'housing': 'count',
    'age': 'mean'
}
bank_csv.foreachCol('education', 'deposit', test_3_dict, 'sum').show()

In [None]:
1247.3143 + 2232.0278

# *ResumeDF & tabularTable* from SparkWrapper

In [None]:
from pyspark.sql import DataFrame, functions as f

def resume_dataframe(self, group_by=None, new_col=None):
    """Sum every non-string column, either per group or over the whole frame.

    Args:
        self: DataFrame to summarize (the function is attached to
            ``DataFrame`` as ``resumeDF``).
        group_by: column(s) to group by. When given, the sums are computed
            per group and ``new_col`` is ignored.
        new_col: name of the label column added to the single totals row.
            Required when ``group_by`` is None.

    Returns:
        With ``group_by``: the grouped frame with one summed column per
        non-string column. Without it: a one-row frame whose first column
        ``new_col`` holds ``'+++ total'``, followed by the summed columns.

    Raises:
        AssertionError: when ``group_by`` and ``new_col`` are both None, or
            when building the totals row fails (the underlying error is
            chained as the cause).
    """
    non_string_cols = [name for name, dtype in self.dtypes if dtype != 'string']
    if group_by is not None:
        return self.groupBy(group_by).agg(*[f.sum(c).alias(c) for c in non_string_cols])
    if new_col is None:
        # Previously this case failed inside the try block below and was
        # reported as "empty expression found", which hid the real cause.
        raise AssertionError("new_col is required when group_by is None")
    try:
        agg_df = self.agg(*[f.sum(c).alias(c) for c in non_string_cols])
        return agg_df.withColumn(new_col, f.lit('+++ total')).select(new_col, *non_string_cols)
    except Exception as e:
        # Same exception type and message as before; the original error is
        # now chained so the traceback keeps it.
        raise AssertionError(f"empty expression found. {e}") from e
DataFrame.resumeDF = resume_dataframe

In [None]:
def __expression(cols_list, expr):
    """Join column names with the arithmetic symbol selected by ``expr``.

    Args:
        cols_list: column names to combine.
        expr: one of ``'sum'``, ``'sub'``, ``'mul'`` or ``'div'``.

    Returns:
        The column names joined by ``+``, ``-``, ``*`` or ``/`` respectively.

    Raises:
        KeyError: if ``expr`` is not one of the supported operation names.
    """
    symbols = (('sum', '+'), ('sub', '-'), ('mul', '*'), ('div', '/'))
    # All four strings are built up front, then the requested one is picked.
    joined_by_name = {name: symbol.join(cols_list) for name, symbol in symbols}
    return joined_by_name[expr]

def foreach_col(self, group_by, pivot_col, agg_dict, oper):
    """Pivot with ``toMatrix`` and add one combined column per ``agg_dict`` key.

    Args:
        self: DataFrame to pivot.
        group_by: forwarded to ``toMatrix``.
        pivot_col: forwarded to ``toMatrix``.
        agg_dict: mapping of column name to aggregation spec, forwarded to
            ``toMatrix``; its keys also select the pivoted columns to combine.
        oper: operation name understood by ``__expression``.

    Returns:
        The pivoted frame with every original column plus, per key, a column
        named ``'<key>_<value>_<oper>'`` combining the columns matched by the
        key's regex.
    """
    # Snapshot the pairs before calling toMatrix, as the keys and values were
    # read up front.
    key_value_pairs = list(agg_dict.items())
    matrix_df = self.toMatrix(group_by, pivot_col, agg_dict)
    combined_cols = []
    for source_col, agg_name in key_value_pairs:
        matched_cols = matrix_df.selectRegex(regex_expr(source_col)).columns
        combined = f.expr(__expression(matched_cols, oper))
        combined_cols.append(combined.alias(f'{source_col}_{agg_name}_{oper}'))
    return matrix_df.select('*', *combined_cols)

In [None]:
def tab_table(self, group_by, pivot_col, agg_dict, oper='sum'):
    """Build the ``foreach_col`` table and append a totals row to it.

    Args:
        self: DataFrame to tabulate.
        group_by: forwarded to ``foreach_col``.
        pivot_col: forwarded to ``foreach_col``.
        agg_dict: forwarded to ``foreach_col``.
        oper: operation name forwarded to ``foreach_col`` (default ``'sum'``).

    Returns:
        The per-group table unioned with the single totals row produced by
        ``resume_dataframe``; the totals label column is named after the
        first column of ``self``.
    """
    per_group_df = foreach_col(self, group_by, pivot_col, agg_dict, oper)
    label_col = self.columns[0]
    totals_df = resume_dataframe(per_group_df, new_col=label_col)
    return per_group_df.union(totals_df)
DataFrame.tabularTable = tab_table

In [None]:
bank_csv.resumeDF(group_by='education').show()
bank_csv.resumeDF(new_col='education').show()

In [None]:
test_4_dict = {
    'balance': 'mean, sum',
    'age': 'mean',
    'duration': 'mean'
}
bank_csv.tabularTable('education', 'deposit', test_4_dict).show()

# *Empty Scan* from SparkWrapper

In [None]:
bank_csv.emptyScan().show()

# *Union All* from SparkWrapper

In [None]:
# Row count of the single frame, then of five copies of it combined with union_all.
print('before union:', bank_csv.count())
u = union_all([bank_csv, bank_csv, bank_csv, bank_csv, bank_csv])
print('after union:', u.count())
u.show(5, False)

In [None]:
bank_csv.rollingDown('balance', 'month', method='mean', window=20).show(25, False)

In [None]:
bank_csv.rollingDown('marital', 'month', method='mean', window=5).show(30, False)

In [None]:
from pyspark.sql.functions import isnan, when, count, col
# Per column, count the values that are NaN or null.
# NOTE(review): `df_orders` is not defined in any visible cell of this
# notebook — confirm which DataFrame was intended before running.
df_orders.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_orders.columns]).show()

In [None]:
#from functools import reduce
#
#
#list_comp = [test.groupBy(i).count().sort(f.col("count").desc()).limit(1).select(f.lit(i).alias("col"), f.col(i).alias("mode")) for i in test.columns]
#mode = reduce(lambda a, b: a.union(b), list_comp)

In [None]:
#mode.show(20, False)

In [None]:
# multiplication of square DataFrames
# multiplication of df x df_vector
# multiplication of scalar x df and df_vector
# all of them will always use the transpose method to do the multiplications column-wise
# train/test split method that computes the sample size and builds the train and test datasets from it
# stratified random sampling
# toMatrix producing the median by cut points [0.5, 0.25, 0.75]
# toMatrix producing the mode
# splitter of strings into columns