### Imports

In [1]:
import pandas as pd
import altair as alt
from scipy.stats import kruskal
from scipy.stats import mannwhitneyu
import scikit_posthocs as sp

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import plotly.graph_objects as go

# Route chart data through the "vegafusion" data transformer.
alt.data_transformers.enable("vegafusion")
# Turn off Altair's maximum-row check for chart data.
alt.data_transformers.disable_max_rows()

# Load the sample dataset; the CSV's first column is used as the DataFrame index.
df = pd.read_csv("amostra.csv", index_col=0)

  df = pd.read_csv("amostra.csv", index_col=0)


In [None]:
def set_global_chart_config():
    """Return the theme dictionary used to style every chart in the notebook.

    The result has a single top-level ``'config'`` key holding the title,
    axis and legend settings. A fresh dictionary is built on each call.
    """
    font_family = 'Arial'
    label_size = 12
    heading_size = 14

    title_style = {
        'fontSize': 18,
        'font': font_family,
        'anchor': 'start',
        'color': 'black',
    }
    axis_style = {
        'labelFontSize': label_size,
        'titleFontSize': heading_size,
        'labelFont': font_family,
        'titleFont': font_family,
    }
    legend_style = {
        'titleFontSize': heading_size,
        'labelFontSize': label_size,
        'labelFont': font_family,
        'titleFont': font_family,
    }

    theme = {'title': title_style, 'axis': axis_style, 'legend': legend_style}
    return {'config': theme}
    


# Apply the global configuration: register it as an Altair theme named
# 'custom_theme' and make that theme the active one.
alt.themes.register('custom_theme', lambda: set_global_chart_config())
alt.themes.enable('custom_theme')

ThemeRegistry.enable('custom_theme')

### Very high cardinality variables

In [43]:
def plot_top_categories(df, column, top_n=12):
    """Build an interactive bar chart of the most frequent values in a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``column``.
    column : str
        Name of the column whose values are counted.
    top_n : int, default 12
        Number of most frequent categories to keep.

    Returns
    -------
    alt.Chart
        An 800x300 bar chart sorted by descending count. The bar under the
        mouse pointer is drawn in its palette colour; all other bars are
        light gray.
    """
    color_palette = ["#0b132b", "#152139", "#202e47",
                     "#2a3c56", "#354964", "#3a506b",
                     "#467787", "#4c8a95", "#57b1b1",
                     "#5dc5bf", "#63d8cd", "#69ecdb", "#6fffe9"]

    # Name the index and the counts explicitly so the frame always has the
    # columns `column` and "count"; a bare reset_index() only produces those
    # names on pandas >= 2.0.
    vis = (
        df[column]
        .value_counts()
        .rename_axis(column)
        .reset_index(name="count")
        .head(top_n)
    )

    color_scale = alt.Scale(
        domain=vis[column].unique().tolist(),
        range=color_palette,
    )

    # Hover selection keyed on the category; empty='none' keeps every bar
    # gray until one is hovered.
    highlight = alt.selection_point(on='mouseover', fields=[column], empty='none')

    chart = alt.Chart(vis).mark_bar().encode(
        x=alt.X(f"{column}:O", title=column.capitalize(), sort='-y'),
        y=alt.Y("count:Q", title="Contagem das classes"),
        tooltip=[
            alt.Tooltip(f"{column}:O"),
            alt.Tooltip("count:Q", title="Contagem da classe -"),
        ],
        color=alt.condition(
            highlight,
            alt.Color(f"{column}:O", scale=color_scale, title=f'{column}'),
            alt.value('lightgray'),
        ),
    ).properties(
        width=800,
        height=300,
    ).add_params(
        highlight
    ).configure(
        background="rgba(0, 0, 0, 0)"
    ).configure_view(
        fill='white'
    ).configure_axis(
        # One call on purpose: every configure_axis() call replaces the whole
        # axis configuration, so chaining a second call would drop grid=False.
        grid=False,
        labelAngle=-45,
    )

    return chart
# Chart the most frequent values of the 'occupation' column and render it.
vis1 = plot_top_categories(df, 'occupation')
vis1.display()

In [46]:
def cumulative_percentage_lineplot(df, column):
    """Plot the cumulative percentage of rows covered by a column's categories.

    Categories are ordered from least to most frequent. For each one the
    chart shows the running share of rows, with a tooltip that also reports
    the category's own share.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``column``.
    column : str
        Name of the column whose value frequencies are accumulated.

    Returns
    -------
    alt.LayerChart
        A 600x300 layered chart: a line plus point markers.
    """
    value_counts = df[column].value_counts(ascending=True)
    share = value_counts / value_counts.sum()
    individual_percentage = share * 100
    cumulative_percentage = share.cumsum() * 100

    cumulative_df = pd.DataFrame({
        column: value_counts.index,
        'individual_percentage': individual_percentage.values,
        'cumulative_percentage': cumulative_percentage.values,
    }).reset_index(drop=True)

    def _tooltips():
        # Both layers show the same three fields; build a fresh list per layer.
        return [
            alt.Tooltip(f"{column}:O", title=column.capitalize()),
            alt.Tooltip("individual_percentage:Q", title="Individual Percentage (%)", format=".2f"),
            alt.Tooltip("cumulative_percentage:Q", title="Cumulative Percentage (%)", format='.2f'),
        ]

    line = alt.Chart(cumulative_df).mark_line().encode(
        # Tick labels are hidden on the x axis; the category is in the tooltip.
        x=alt.X(f'{column}:O', title=column.capitalize(), sort='y', axis=alt.Axis(labels=False)),
        y=alt.Y('cumulative_percentage:Q', title='Cumulative Percentage (%)'),
        tooltip=_tooltips(),
    )

    points = alt.Chart(cumulative_df).mark_point(size=50).encode(
        x=alt.X(f'{column}:O', sort='y'),
        y=alt.Y('cumulative_percentage:Q'),
        tooltip=_tooltips(),
    )

    chart = (line + points).properties(
        title=f'Cumulative Percentage Distribution of {column.capitalize()}',
        width=600,
        height=300,
    ).configure(
        background="rgba(0, 0, 0, 0)"
    ).configure_view(
        fill='white'
    ).configure_axis(
        # One call on purpose: every configure_axis() call replaces the whole
        # axis configuration, so chaining a second call would drop grid=False.
        grid=False,
        labelAngle=-45,
    )

    return chart

# Chart the cumulative share of rows across 'cmte_id' values and render it.
vis2 = cumulative_percentage_lineplot(df, 'cmte_id')
vis2.display()

### Geospatial analysis

In [None]:
# NOTE(review): the original statement stopped after `df.`, which is a
# SyntaxError. Grouping by state and city is an assumed completion (the same
# city name can occur in several states) — TODO confirm the intended grouping
# and add the aggregation that should follow.
grouby_city = df.groupby(["state", "city"])

Unnamed: 0,zip_code,latitude,longitude,cmte_id,amndt_ind,rpt_tp,transaction_pgi,transaction_tp,entity_tp,city,state,employer,occupation,transaction_dt,transaction_amt,file_num
0,22303,38.792127,-77.081296,C00030718,N,M4,Desconhecido,15,IND,ALEXANDRIA,VA,COLDWELL BANKER RESIDENTIAL BK,REAL ESTATE BROKER,3092011.0,5.703782,724491.0
1,22303,38.792127,-77.081296,C00075820,N,M8,Desconhecido,15,IND,ALEXANDRIA,VA,INSTITUTE OF DEFENSE ANALYSES,INSTITUTE OF DEFENSE ANALYSES,7132011.0,6.907755,742738.0
2,22303,38.792127,-77.081296,C00287045,A,Q2,P,15,IND,ALEXANDRIA,VA,CAPTIAL STRATEGIES DC,PRINCIPAL,5312011.0,5.521461,743781.0
3,22303,38.792127,-77.081296,C00384818,N,M9,Desconhecido,15,IND,ALEXANDRIA,VA,"CAREMARK, L.L.C",SVP GOVNMT RELATIONS CVS CMK,8122011.0,6.030685,745409.0
4,22303,38.792127,-77.081296,C00193433,N,YE,Desconhecido,15,IND,ALEXANDRIA,VA,FINANCIAL ADVISOR,,12142012.0,5.521461,848470.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80788,66610,38.986972,-95.768752,C00433730,N,12G,G,15,IND,TOPEKA,KS,BRB CONTRACTORS,COMMERCIAL CONSTRUCTION,10042012.0,5.521461,827557.0
80789,66610,38.986972,-95.768752,C00001636,N,30G,Desconhecido,15,IND,TOPEKA,KS,UP RR,TRAINMAN,10202012.0,5.337538,836434.0
80790,66610,38.986972,-95.768752,C00518282,A,30G,P,15,IND,TOPEKA,KS,STORMONTVAIL HEALTH CARE,PHYSICIAN,10242012.0,5.521461,890355.0
80791,66610,38.986972,-95.768752,C00518282,A,Q3,P,15,IND,TOPEKA,KS,,,7162012.0,6.214608,889722.0


In [None]:
Oi chat, tudo bem? Gostaria de fazer uma análise espacial da variável "transaction_amt" agrupada por estado. Para isso, deve-se criar uma forma modularizada de selecionar um estado dos EUA e, a partir dessa seleção, realizar as análises que aqui serão feitas.

As variáveis estado e cidade são respectivamente "state" e "city".

Baseie-se nesses códigos:

