<h1 align="center">Pyspark Sql</h1>

- [**Librerias**](#Librerias)
- [**Funciones**](#Funciones)
- [**Codigo**](#Codigo)

<html><h1 align="center" style="background: #4777d1; color:white">Librerias</h1></html>

[**Volver al Inicio**](#Pyspark-Sql) 

In [None]:
import os
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.functions import size, lit, explode, col, round, dense_rank, rank, desc
from pyspark.sql.window import Window
import json

In [2]:
# Build (or reuse) the Spark session shared by every cell in this notebook.
# The driver memory is requested as 6 GB through the builder configuration.
pyspark_session_sql = (
    SparkSession.builder
    .appName("spark_session_sql")
    .config('spark.driver.memory', '6g')
    .getOrCreate()
)

# SQLContext's constructor is (sparkContext, sparkSession=None, ...).
# Passing the session itself as the first argument only worked because
# SparkSession happens to expose the same private attributes as SparkContext;
# hand over the real SparkContext and bind the context to this session explicitly.
sqlcontext = SQLContext(pyspark_session_sql.sparkContext, pyspark_session_sql)



<html><h1 align="center" style="background: #4777d1; color:white">Funciones</h1></html>

[**Volver al Inicio**](#Pyspark-Sql) 

In [3]:
def get_csv_file(file, sep=None):
    """Read a CSV file into a Spark DataFrame using the notebook's session.

    A highlighted progress message is printed before the read. The first
    line of the file is treated as the header. No schema inference is
    requested, so the reader's default column typing applies.

    Args:
        file: Path of the CSV file to read.
        sep: Field delimiter forwarded to the reader; ``None`` leaves the
            reader's default in place.

    Returns:
        The DataFrame produced by ``pyspark_session_sql.read.csv``.
    """
    # ANSI escape codes colour the message in the notebook output.
    banner = f'\n\x1b[1;33;40mRead {file} Data...\x1b[0m\n'
    print(banner)
    reader = pyspark_session_sql.read
    return reader.csv(file, header=True, sep=sep)

def get_dataframe_table(context, data, table_name):
    """Register a DataFrame as a temporary view so SQL queries can use it by name.

    Args:
        context: Kept for backward compatibility with existing callers; it is
            no longer used. The deprecated
            ``SQLContext.registerDataFrameAsTable`` simply delegated to
            ``DataFrame.createOrReplaceTempView``, which is now called directly.
        data: DataFrame to expose to SQL.
        table_name: Name under which the view is registered; an existing view
            with the same name is replaced.

    Returns:
        None.
    """
    data.createOrReplaceTempView(table_name)

<html><h1 align="center" style="background: #4777d1; color:white">Codigo</h1></html>

[**Volver al Inicio**](#Pyspark-Sql) 

In [4]:
# Load the province and sales-tax CSVs; both are read with ';' as the delimiter.
data_per, data_sal = (
    get_csv_file(csv_path, sep=';')
    for csv_path in (
        'csvfiles/Person_StateProvince.csv',
        'csvfiles/Sales_SalesTaxRate.csv',
    )
)


[1;33;40mRead csvfiles/Person_StateProvince.csv Csv Data...[0m


[1;33;40mRead csvfiles/Sales_SalesTaxRate.csv Csv Data...[0m



In [5]:
# Expose both DataFrames to Spark SQL under the names used by the queries below.
get_dataframe_table(context=sqlcontext, data=data_per, table_name="data_per")
get_dataframe_table(context=sqlcontext, data=data_sal, table_name="data_sal")

In [6]:
# Average tax rate per CountryRegionCode, rounded to 4 decimal places.
# The CTE left-joins data_sal to data_per on StateProvinceID, so data_sal rows
# without a matching data_per row are kept and end up in a NULL
# CountryRegionCode group.
# REPLACE swaps ',' for '.' in TaxRate before AVG -- presumably the CSV stores
# decimals with a comma (TODO confirm against the file).
table_avg_tax = sqlcontext.sql('''WITH table_avg_tax AS (
                                   SELECT data_sal.StateProvinceID, data_sal.TaxRate, data_per.StateProvinceID, data_per.CountryRegionCode
                                   FROM data_sal 
                                   LEFT JOIN data_per 
                                   ON data_sal.StateProvinceID = data_per.StateProvinceID)
                                SELECT table_avg_tax.CountryRegionCode, ROUND(AVG(REPLACE(table_avg_tax.TaxRate,',','.')),4) AS average_taxRate 
                                FROM table_avg_tax 
                                GROUP BY table_avg_tax.CountryRegionCode
                                ORDER BY table_avg_tax.CountryRegionCode''')
# Print the aggregated result as a text table.
table_avg_tax.show()

+-----------------+---------------+
|CountryRegionCode|average_taxRate|
+-----------------+---------------+
|               AU|           10.0|
|               CA|         8.4333|
|               DE|           16.0|
|               FR|           19.6|
|               GB|           17.5|
|               US|          7.405|
+-----------------+---------------+



In [7]:
# Load the country, currency and exchange-rate CSVs; all are read with ';' as the delimiter.
(
    data_country,
    data_country_reg,
    data_currency_rate,
    data_currency,
) = (
    get_csv_file(csv_path, sep=';')
    for csv_path in (
        'csvfiles/Sales_CountryRegionCurrency.csv',
        'csvfiles/Person_CountryRegion.csv',
        'csvfiles/Sales_CurrencyRate.csv',
        'csvfiles/Sales_Currency.csv',
    )
)


[1;33;40mRead csvfiles/Sales_CountryRegionCurrency.csv Csv Data...[0m


[1;33;40mRead csvfiles/Person_CountryRegion.csv Csv Data...[0m


[1;33;40mRead csvfiles/Sales_CurrencyRate.csv Csv Data...[0m


[1;33;40mRead csvfiles/Sales_Currency.csv Csv Data...[0m



In [8]:
# Expose the country/currency DataFrames to Spark SQL under the names the query uses.
get_dataframe_table(context=sqlcontext, data=data_country_reg, table_name="data_country_reg")
get_dataframe_table(context=sqlcontext, data=data_country, table_name="data_country")
get_dataframe_table(context=sqlcontext, data=data_currency, table_name="data_currency")
get_dataframe_table(context=sqlcontext, data=data_currency_rate, table_name="data_currency_rate")

# Per country name and currency name: the maximum AverageRate (each value
# rounded to 2 places first) and the average TaxRate (rounded to 2 places).
# The query also reads the data_per and data_sal tables, which are registered
# in an earlier cell of this notebook.
# Every join is an INNER JOIN (country -> country/currency mapping -> currency
# name -> currency rates matched on ToCurrencyCode -> provinces -> tax rates),
# so a country is listed only if it has matching rows in all of those tables.
# AverageRate and TaxRate have ',' replaced by '.' before use -- presumably
# they are decimal-comma text in the CSVs (TODO confirm against the files).
querypy = sqlcontext.sql('''WITH tablefinal AS (
                                SELECT data_country_reg.Name, 
                                      data_country_reg.CountryRegionCode, 
                                      data_country.CurrencyCode,
                                      data_currency.Name AS currency_name,
                                      data_currency_rate.AverageRate,
                                      data_per.StateProvinceID,
                                      data_sal.TaxRate 
                               FROM data_country_reg 
                               INNER JOIN data_country 
                               ON data_country_reg.CountryRegionCode = data_country.CountryRegionCode
                               INNER JOIN data_currency
                               ON data_country.CurrencyCode = data_currency.CurrencyCode
                               INNER JOIN data_currency_rate
                               ON data_country.CurrencyCode = data_currency_rate.ToCurrencyCode
                               INNER JOIN data_per
                               ON data_country.CountryRegionCode = data_per.CountryRegionCode
                               INNER JOIN data_sal
                               ON data_per.StateProvinceID = data_sal.StateProvinceID)
                            SELECT tablefinal.Name AS country_name,
                                   tablefinal.currency_name AS currency_name,
                                   MAX(ROUND(REPLACE(tablefinal.AverageRate,',','.'),2)) AS currency_rate,
                                   ROUND(AVG(REPLACE(tablefinal.TaxRate,',','.')),2) AS average_tax_rate
                            FROM tablefinal
                            GROUP BY tablefinal.Name, tablefinal.currency_name
                            ORDER BY tablefinal.Name''')
# Print at most 15 rows of the result.
querypy.show(15)

+--------------+--------------------+-------------+----------------+
|  country_name|       currency_name|currency_rate|average_tax_rate|
+--------------+--------------------+-------------+----------------+
|     Australia|   Australian Dollar|         2.09|            10.0|
|        Canada|     Canadian Dollar|         1.62|            8.43|
|        France|        French Franc|         7.37|            19.6|
|        France|                EURO|         1.21|            19.6|
|       Germany|       Deutsche Mark|          2.2|            16.0|
|       Germany|                EURO|         1.21|            16.0|
|United Kingdom|United Kingdom Pound|         0.73|            17.5|
| United States|           US Dollar|          1.0|            7.41|
+--------------+--------------------+-------------+----------------+



In [9]:
# Shut down the Spark session now that every query has run; cells above must be
# re-run (recreating the session) before any DataFrame here can be used again.
pyspark_session_sql.stop()