In [1]:
# Requirements: pandas, numpy, plotly, seaborn, matplotlib
# pip install pandas numpy plotly seaborn matplotlib

import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

# Input location. The default is the machine-specific folder this notebook was
# written against; set the MATKA_DATA_DIR environment variable to read the same
# file from a different folder without editing the notebook.
DEFAULT_DATA_DIR = "/Users/user/Downloads/Matka-Aduna-Bots/init_stats_data/matka aduna bots"
data_dir = Path(os.environ.get("MATKA_DATA_DIR", DEFAULT_DATA_DIR))
path = str(data_dir / "profession_income_by_year.csv")  # kept as a plain str, as before
df = pd.read_csv(path)

# Fail early, with a readable message, if a column used by the metrics and
# plots (year / profession / avg_income) is absent from the file.
required_columns = {"year", "profession", "avg_income"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(f"{path} is missing required column(s): {sorted(missing_columns)}")

# Optional: clip negatives to zero (toggle if desired).
# `assign` returns a new frame, so `df` keeps the raw (possibly negative) values.
df_clipped = df.assign(avg_income=df["avg_income"].clip(lower=0))

# Choose which to use for metrics/plots
use_clipped = False
d = df_clipped if use_clipped else df

# 1) Income trajectories: one line per profession, avg_income against year
fig = px.line(
    data_frame=d,
    x="year",
    y="avg_income",
    color="profession",
    title="Average income by profession over time",
)
fig.show()

# 2) Heatmap: professions as rows, years as columns, cell = avg_income
#    (pivot_table averages duplicates of the same profession/year pair)
heat = pd.pivot_table(d, values="avg_income", index="profession", columns="year")
fig_hm = px.imshow(
    img=heat,
    title="Income heatmap (profession x year)",
    color_continuous_scale="Viridis",
    aspect="auto",
)
fig_hm.update_xaxes(title="year")
fig_hm.update_yaxes(title="")
fig_hm.show()

# Gini utility
def gini(x):
    """Gini coefficient of a sample, every observation weighted equally.

    Parameters
    ----------
    x : array-like of numbers
        Any shape; the input is flattened. NaN entries are ignored.

    Returns
    -------
    float
        ``nan`` when there are no (non-NaN) observations; ``0.0`` when all
        observations are equal within ``np.allclose`` tolerance or when the
        (shifted) total is zero; otherwise half the relative mean absolute
        difference, i.e. ``sum_ij |x_i - x_j| / (2 * n**2 * mean(x))``.

    Notes
    -----
    If the sample contains negative values it is shifted so that its minimum
    is zero before the coefficient is computed. This keeps the result in a
    stable range, but it means the value is not shift-invariant for such
    samples.
    """
    # Flatten so that 2-D input is treated as one sample instead of being
    # broadcast row-against-row, and drop NaNs so one gap does not poison the result.
    x = np.asarray(x, dtype=float).ravel()
    x = x[~np.isnan(x)]
    if x.size == 0:
        return np.nan
    # If all equal
    if np.allclose(x, x[0]):
        return 0.0
    # Allow negatives; shift so minimum is zero to keep definition stable
    min_x = x.min()
    if min_x < 0:
        x = x - min_x
    total = x.sum()
    if total == 0:
        return 0.0
    # Closed form of the mean absolute difference on the sorted sample:
    #   sum_ij |x_i - x_j| = 2 * sum_i (2*i - n - 1) * x_(i),  i = 1..n
    # which avoids materialising the n-by-n matrix of pairwise differences.
    x = np.sort(x)
    n = x.size
    rank_weights = 2 * np.arange(1, n + 1) - n - 1
    return float(np.dot(rank_weights, x) / (n * total))

# 3) Between-profession Gini for every year; each row of `d` in a year
#    counts once (professions are not weighted by head-count)
income_by_year = d.groupby("year")["avg_income"]
gini_per_year = income_by_year.apply(lambda incomes: gini(incomes.values))
gini_by_year = gini_per_year.reset_index(name="gini")

fig_g = px.line(
    data_frame=gini_by_year,
    x="year",
    y="gini",
    title="Between-profession Gini coefficient over time",
)
fig_g.show()

# 4) Lorenz curves for selected years
def lorenz_points(incomes):
    """Return ``(cum_pop, cum_income)`` for the Lorenz curve of ``incomes``.

    Parameters
    ----------
    incomes : array-like of numbers
        Flattened before use; NaN entries are ignored.

    Returns
    -------
    tuple of numpy.ndarray, or None
        Two arrays of length ``n + 1`` starting at 0 and ending at 1: the
        cumulative share of observations and the cumulative share of income
        (observations sorted ascending). ``None`` when there is nothing to plot.

    Notes
    -----
    Negative values are handled the same way as in ``gini``: the sample is
    shifted so its minimum is zero. When the (shifted) total is zero there is
    no income to distribute, so the equality line is returned; ``gini``
    reports 0.0 for such a sample and the curve should agree with it.
    """
    values = np.asarray(incomes, dtype=float).ravel()
    values = values[~np.isnan(values)]
    if values.size == 0:
        return None
    lowest = values.min()
    if lowest < 0:
        values = values - lowest
    cum_pop = np.arange(values.size + 1) / values.size
    cum_income = np.concatenate(([0.0], np.cumsum(np.sort(values))))
    if cum_income[-1] == 0:
        return cum_pop, cum_pop.copy()
    # Normalise by the last cumulative value so the curve ends exactly at 1.
    return cum_pop, cum_income / cum_income[-1]


years_to_show = [0, 25, 50, 75, 100]
lorenz_fig = go.Figure()
for y in years_to_show:
    points = lorenz_points(d.loc[d["year"] == y, "avg_income"].values)
    if points is None:
        continue  # year not present in the data (or no usable values): no trace
    cum_pop, cum_income = points
    lorenz_fig.add_trace(go.Scatter(x=cum_pop, y=cum_income, mode="lines", name=f"year {y}"))
# Reference diagonal: the curve of a perfectly equal distribution.
lorenz_fig.add_trace(go.Scatter(x=[0,1], y=[0,1], mode="lines", name="equality", line=dict(dash="dash", color="gray")))
lorenz_fig.update_layout(title="Lorenz curves across professions (selected years)",
                         xaxis_title="Cumulative share of professions",
                         yaxis_title="Cumulative share of income")
lorenz_fig.show()

# 5) Top/bottom profession ratio (max/mean of bottom 3)
def top_bottom_ratio(s, bottom_k=3):
    """Ratio of the largest value to the mean of the ``bottom_k`` smallest values.

    Parameters
    ----------
    s : pandas.Series or array-like of numbers
        Flattened before use; NaN entries are ignored.
    bottom_k : int, default 3
        How many of the smallest values form the denominator. When fewer than
        ``bottom_k`` values are available, the mean of all values is used.
        Expected to be a positive integer.

    Returns
    -------
    float
        ``max / mean(bottom_k smallest)``; ``nan`` when there are no usable
        values or when the denominator is exactly zero.

    Notes
    -----
    A negative denominator is not special-cased: the ratio is returned as
    computed, so its sign is not meaningful when the smallest values are
    negative.
    """
    # np.asarray accepts a Series as well as plain sequences; NaNs are removed
    # first because np.sort would place them last and make NaN the "top" value.
    v = np.asarray(s, dtype=float).ravel()
    v = np.sort(v[~np.isnan(v)])
    if v.size == 0:
        return np.nan
    top = v[-1]
    bottom = v[:bottom_k].mean() if v.size >= bottom_k else v.mean()
    return np.nan if bottom == 0 else top / bottom

# Per-year ratio of the highest avg_income to the mean of the three lowest
yearly_income = d.groupby("year")["avg_income"]
ratio_per_year = yearly_income.apply(lambda incomes: top_bottom_ratio(incomes, bottom_k=3))
ratio = ratio_per_year.reset_index(name="top_to_bottom3_ratio")

fig_r = px.line(
    data_frame=ratio,
    x="year",
    y="top_to_bottom3_ratio",
    title="Top profession to bottom-3 mean ratio over time",
)
fig_r.show()

In [2]:
# Read policies_years.csv into `policies`. The path is relative, so it is
# resolved against the kernel's current working directory.
policies = pd.read_csv("policies_years.csv")


In [5]:
# Read statistics_matka_bots_year115.csv into `gdp`. The path is relative, so
# it is resolved against the kernel's current working directory.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError for
# this file — confirm where the file lives before relying on `gdp`.
gdp = pd.read_csv("statistics_matka_bots_year115.csv")


FileNotFoundError: [Errno 2] No such file or directory: 'statistics_matka_bots_year115.csv'