In [None]:
from pyspark.sql.session import SparkSession, SparkConf
import pyspark.sql.functions as f
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Spark settings for this session: 4g per executor, 2 cores per executor,
# at most 2 cores overall, and 4g for the driver.
config = SparkConf().setAll(
    [
        ('spark.executor.memory', '4g'),
        ('spark.executor.cores', '2'),
        ('spark.cores.max', '2'),
        ('spark.driver.memory', '4g'),
    ]
)

In [None]:
# Reuse an existing session if one is running, otherwise start one with `config`.
spark = (
    SparkSession.builder
    .config(conf=config)
    .getOrCreate()
)

In [None]:
# Load the input dataset from the "Dados_tratados" parquet location.
data = spark.read.format("parquet").load("Dados_tratados")

In [None]:
# Preview the first 25 rows without truncating long cell values.
data.show(n=25, truncate=False)

In [None]:
# Number of rows for each value of `flag_ativa`.
(
    data
    .groupBy("flag_ativa")
    .count()
    .show()
)

In [None]:
# Age of each record in days and in years.
#   - flag_ativa == 1: days from `inicio_data` to today's date
#   - otherwise:       days from `inicio_data` to `situacao_data`
# `idade_anos` divides the day count by 365.25 and rounds to 2 decimals.
data = data.withColumn(
    "idade_dias",
    f.when(
        f.col("flag_ativa") == 1,
        f.datediff(f.current_date(), f.col("inicio_data")),
    ).otherwise(
        f.datediff(f.col("situacao_data"), f.col("inicio_data"))
    ),
).withColumn(
    "idade_anos",
    f.round(f.col("idade_dias") / 365.25, 2),
)

In [None]:
# Preview the first 25 rows, now including the age columns, untruncated.
data.show(n=25, truncate=False)

In [None]:
# Show the 250 rows with the largest `idade_anos`, untruncated.
(
    data
    .orderBy(f.col('idade_anos').desc())
    .show(n=250, truncate=False)
)

In [None]:
# Collect only the `capital_social` column into pandas for plotting.
df = (
    data
    .select(f.col("capital_social"))
    .toPandas()
)

In [None]:
# Histogram of the collected values: 10 bins, log-scaled y axis.
df.plot(kind='hist', bins=10, logy=True)

In [None]:
# Bucket `capital_social` into four ordered bands:
#   < 1000 -> "0_1k", < 10000 -> "1k_10k", < 100000 -> "10k_100k", >= 100000 -> "100k_mais".
# Branches are evaluated top to bottom, so each one only needs its upper bound.
# The top band is an explicit `>= 100000` test rather than `.otherwise(...)`:
# a null `capital_social` makes every comparison null, so a catch-all
# `.otherwise` would label missing values as "100k_mais". With no `.otherwise`,
# rows that match no branch get a null category instead.
data = data.withColumn(
    "capital_social_cat",
    f.when(f.col('capital_social') < 1000, "0_1k")
    .when(f.col('capital_social') < 10000, "1k_10k")
    .when(f.col('capital_social') < 100000, "10k_100k")
    .when(f.col('capital_social') >= 100000, "100k_mais"),
)

In [None]:
# Number of rows in each `capital_social_cat` band.
(
    data
    .groupBy("capital_social_cat")
    .count()
    .show()
)

In [None]:
# Per-band row counts, largest first, collected into pandas for plotting.
df = (
    data
    .groupBy("capital_social_cat")
    .count()
    .orderBy(f.col('count').desc())
    .toPandas()
)

In [None]:
# Horizontal bar chart: one bar per `capital_social_cat`, length = row count.
sns.catplot(
    data=df,
    x='count',
    y='capital_social_cat',
    kind='bar',
    palette="ch:.25",
)

In [None]:
# Only `natureza_juridica` is plotted from this frame, so collect just that
# column instead of pulling every column of the Spark DataFrame into driver memory.
df = data.select("natureza_juridica").toPandas()

In [None]:
# Count plot: number of rows for each `natureza_juridica` value.
sns.catplot(
    data=df,
    x='natureza_juridica',
    kind='count',
)

In [None]:
# Rows with flag_ativa == 0 whose `situacao_data` falls in a year after 2010.
fechadas = data.where(
    (f.col('flag_ativa') == 0)
    & (f.year(f.col("situacao_data")) > 2010)
)

In [None]:
# Rows with flag_ativa == 1.
ativas = data.where(f.col("flag_ativa") == 1)

In [None]:
# Number of rows in `fechadas` (triggers a Spark job).
fechadas.count()

In [None]:
# Number of rows in `ativas` (triggers a Spark job).
ativas.count()

In [None]:
# Rows in `fechadas` per year of `situacao_data`, most frequent year first.
(
    fechadas
    .withColumn("ano_fechamento", f.year(f.col("situacao_data")))
    .groupBy("ano_fechamento")
    .count()
    .orderBy(f.col("count").desc())
    .show(n=25, truncate=False)
)

In [None]:
# Read the CNAE lookup CSV: semicolon-separated, first line is the header.
cnae = spark.read.csv(
    "CNAESECAO_UNIDECODE.csv",
    sep=';',
    header=True,
)

In [None]:
# Preview the first 25 rows of `ativas`, untruncated.
ativas.show(n=25, truncate=False)

In [None]:
# Preview the first 25 rows of `fechadas`, untruncated.
fechadas.show(n=25, truncate=False)

In [None]:
# Rebuild `data` as the union of the two subsets, matching columns by name.
# Rows that are in neither `fechadas` nor `ativas` are no longer in `data`
# after this cell.
data = fechadas.unionByName(ativas)

In [None]:
# Number of rows in the rebuilt `data` (triggers a Spark job).
data.count()

In [None]:
# Keep only the join key and the `grande_area` column from the lookup.
cnae = cnae.select(['cod_cnae', 'grande_area'])

In [None]:
# Attach `grande_area` by matching `cnae_principal` against `cod_cnae`.
# Left join: rows with no match keep a null `grande_area`. The lookup key
# column is dropped afterwards.
data = (
    data
    .join(cnae, on=cnae.cod_cnae == data.cnae_principal, how='left')
    .drop('cod_cnae')
)

In [None]:
# Preview the first 25 rows after the CNAE join, untruncated.
data.show(n=25, truncate=False)

In [None]:
# Row counts per `grande_area`, largest first, collected into pandas.
df = (
    data
    .groupBy("grande_area")
    .count()
    .orderBy(f.col('count').desc())
    .toPandas()
)
# Horizontal bar chart: one bar per `grande_area`, length = row count.
sns.catplot(
    data=df,
    x='count',
    y='grande_area',
    kind='bar',
    palette="ch:.25",
)

In [None]:
# Load the CEP table from "ceps_h3.parquet".
ceps = spark.read.format("parquet").load("ceps_h3.parquet")

In [None]:
# Preview the first 25 rows of `ceps`, untruncated.
ceps.show(n=25, truncate=False)

In [None]:
# Row count before the join, printed for reference.
print(data.count())
# Left-join the CEP table on `cep`, then keep a single row per `cnpj`.
# NOTE(review): dropDuplicates keeps an arbitrary row per key; if `ceps` holds
# several rows for one cep, which of them survives is not guaranteed — confirm
# that `ceps` is unique on `cep`.
data = (
    data
    .join(ceps, on='cep', how='left')
    .dropDuplicates(['cnpj'])
)

In [None]:
# Preview the first 25 rows after the CEP join, untruncated.
data.show(n=25, truncate=False)

In [None]:
# Number of rows whose `latitude` is null.
data.where(f.col('latitude').isNull()).count()

In [None]:
# Number of rows whose `latitude` is not null.
data.where(f.col('latitude').isNotNull()).count()

In [None]:
# List the column names currently present in `data`.
data.columns

In [None]:
# Add `fechada` as the inverse of `flag_ativa`: 0 when flag_ativa == 1, else 1.
data = data.withColumn(
    'fechada',
    f.when(f.col('flag_ativa') == 1, 0).otherwise(1),
)

In [None]:
# Write the selected columns to "dados_modelo.parquet", replacing any
# existing output at that path.
(
    data
    .select(
        'id_matriz',
        'fechada',
        'idade_anos',
        'latitude',
        'longitude',
        'natureza_juridica',
        'capital_social_cat',
        'porte',
        'grande_area',
    )
    .write
    .mode('overwrite')
    .parquet("dados_modelo.parquet")
)