In [59]:
import pandas as pd
import numpy as np
import statsmodels.stats.api as sms
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the region statistics (first CSV column is the row index), fill missing
# English names from the local 'name' column, keep only the filled column under
# the label 'name', then flip the table and move the old column labels into an
# ordinary column.
df = (
    pd.read_csv('region_stats.csv', index_col=0)
    .assign(name_en=lambda frame: frame['name_en'].fillna(frame['name']))
    .drop(columns='name')
    .rename(columns={'name_en': 'name'})
    .transpose()
    .reset_index()
)

# Promote the first row of the transposed frame to the header and remove it from
# the data (this assumes 'name' was the first column before the transpose —
# TODO confirm against the CSV layout).
df.columns = df.iloc[0]
df = df.drop(index=0).rename(columns={'name': 'level'})

# Reduce each label to a number: take the text before the first '_' and, within
# it, the piece that follows the first 'V'.
level_tokens = df['level'].str.split('_').str[0].str.split('V').str[1]
df['level'] = pd.to_numeric(level_tokens)
df

In [None]:
# Per-level summary statistics (mean, median, sample standard deviation) taken
# across the non-'level' columns of `df`.
#
# 'level' is an identifier, not an observation: aggregating the whole transposed
# frame would feed the level number itself into every mean/median/std, so it is
# dropped before the statistics are computed.
# The values are cast to numeric first because they may be stored as generic
# objects after the earlier transpose.
country_counts = df.drop(columns='level').apply(pd.to_numeric)
agg = pd.DataFrame({
    'mean': country_counts.mean(axis=1),
    'median': country_counts.median(axis=1),
    'std': country_counts.std(axis=1),
})

# Row labels are shared with `df`, so the level numbers align by index.
agg['level'] = df['level']
agg

In [62]:
# Every column other than 'level' is treated as a per-country value column.
countries = df.columns.difference(['level'])

# Long format of the raw table: one row per (level, country) pair.
melted = df.melt(
    id_vars=['level'],
    value_vars=countries,
    var_name='country',
    value_name='count',
)

# Long format of the summary table: one row per (level, metric) pair.
summary_metrics = ['mean', 'median', 'std']
melted_agg = agg.melt(
    id_vars=['level'],
    value_vars=summary_metrics,
    var_name='metric',
    value_name='value',
)

In [None]:
# Line plot of the mean count per administrative level, with confidence-interval
# error bars.
# The 'ci' error bars come from a bootstrap, which is random; a fixed seed keeps
# the figure identical across Restart & Run All.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    data=melted,
    x='level',
    y='count',
    errorbar='ci',
    err_style='bars',
    seed=0,
    ax=ax,
)
ax.set_xlabel('Administrative level')
ax.set_ylabel('Count')
ax.set_title('Regions per administrative level (mean and confidence intervals)')
fig.tight_layout()
plt.show()

In [None]:
# Bar plot of the mean count per administrative level, with confidence-interval
# error bars.
# The 'ci' error bars come from a bootstrap, which is random; a fixed seed keeps
# the figure identical across Restart & Run All.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=melted, x='level', y='count', errorbar='ci', seed=0, ax=ax)
ax.set_xlabel('Administrative level')
ax.set_ylabel('Count')
ax.set_title('Regions per administrative level (mean and confidence intervals)')
fig.tight_layout()
plt.show()

In [None]:
# Grouped bars of the summary metrics per level; rows whose metric is 'std' are
# filtered out before plotting.
fig, ax = plt.subplots(figsize=(10, 6))
central_metrics = melted_agg.query('metric!="std"')
sns.barplot(data=central_metrics, x='level', y='value', hue='metric', ax=ax)

# Write the bar heights (two decimals) on the first two bar containers.
for container_idx in (0, 1):
    ax.bar_label(ax.containers[container_idx], fmt='%.2f')

ax.set_xlabel('Administrative level')
ax.set_ylabel('Value')
ax.set_title('Regions per administrative level')
# Anchor the legend just outside the right edge of the axes.
ax.legend(title='Metric', bbox_to_anchor=(1.01, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# Display the long-format frame (columns: level, country, count).
melted

In [None]:
# Polynomial regressions (orders 2 to 4) of count on administrative level, drawn
# over the per-level mean points.
# Both inputs are cast explicitly so the fits run on numeric data.
level = pd.to_numeric(melted['level'])
count = pd.to_numeric(melted['count'])

# regplot bootstraps its confidence intervals; the fixed seed keeps the point
# intervals and regression bands identical across Restart & Run All.
fig, ax = plt.subplots(figsize=(10, 6))

# Per-level mean points only (no regression line on this layer).
sns.regplot(
    x=level,
    y=count,
    fit_reg=False,
    x_estimator=np.mean,
    label='Points',
    color='.3',
    marker='o',
    seed=0,
    ax=ax,
)

# One fitted curve per polynomial order, without re-drawing the points.
for order in range(2, 5):
    sns.regplot(
        x=level,
        y=count,
        x_estimator=np.mean,
        order=order,
        label=f'Order {order}',
        scatter=False,
        seed=0,
        ax=ax,
    )

ax.set_xlabel('Administrative level')
ax.set_ylabel('Count')
ax.set_title('Regions per administrative level (polynomial regression)')
# Anchor the legend just outside the right edge of the axes.
ax.legend(title='Polynomial order', bbox_to_anchor=(1.01, 1), loc='upper left')
fig.tight_layout()
plt.show()