#### Import
**pandas** for data analysis;
**matplotlib** for data visualization.

In [8]:
import pandas as pd
import matplotlib.pyplot as plt

#### Load 
the **dataset** of climate sentiment data

In [None]:
# Read the sentiment CSV into a DataFrame, then preview its first five rows.
data = pd.read_csv(filepath_or_buffer='/drive/data/climatebert-climate-sentiment.csv')
print(data.head(n=5))

#### Summarize and plot
the **distribution** of sentiment labels

In [None]:
# Share of rows per sentiment label. value_counts() orders its result by
# descending frequency, so re-sort by label value: otherwise the bar order
# depends on the data and the tick labels below can end up on the wrong bars.
label_distribution = data['label'].value_counts(normalize=True).sort_index()

# Display names for the known label codes; any other code falls back to str().
# NOTE(review): the 0/1/2 -> Risk/Neutral/Opportunity mapping is carried over
# from the original hard-coded tick labels -- confirm against the dataset card.
label_names = {0: 'Risk (0)', 1: 'Neutral (1)', 2: 'Opportunity (2)'}
tick_labels = [label_names.get(label, str(label)) for label in label_distribution.index]

ax = label_distribution.plot(kind='bar', color='#125740', edgecolor='black', ylim=(0, label_distribution.max() * 1.1))
plt.xlabel('Sentiment Labels')
plt.ylabel('Proportion')
plt.title('Distribution of Sentiment Labels')
# One tick per bar actually drawn, labelled from the (sorted) index, so a
# missing or extra label code cannot cause a ticks/labels length mismatch.
plt.xticks(ticks=list(range(len(tick_labels))), labels=tick_labels, rotation=0)
plt.grid(True, linestyle='--', linewidth=0.5)

# Annotate bars with percentages (text is offset 10 points above each bar top)
for p in ax.patches:
    ax.annotate(f"{p.get_height():.2%}", (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points')

plt.show()

#### Gini Coefficient
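The **Gini coefficient** measures distribution inequality, ranging from **0** (perfect equality) toward **1** (extreme inequality).  
For a distribution over $n$ categories the largest attainable value is $(n-1)/n$, i.e. about **0.67** for three sentiment labels.  
A **lower value** indicates balance, while a **higher value** shows disparity.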

In [None]:
import numpy as np

# Calculate Gini coefficient
def gini_coefficient(distribution):
    """Return the Gini coefficient of a one-dimensional set of values.

    The values are sorted ascending and combined as

        G = (2 / n) * sum_i((i - (n + 1) / 2) * x_i) / sum_i(x_i),   i = 1..n

    which is 0 when all values are equal and (n - 1) / n when a single
    entry holds the entire total.

    Parameters
    ----------
    distribution : array-like
        One-dimensional collection of numeric values (e.g. class proportions
        or counts). The measure is intended for non-negative values; this is
        not checked.

    Returns
    -------
    numpy.float64
        The Gini coefficient. If the values sum to zero the division is 0/0
        and the result is NaN.

    Raises
    ------
    ValueError
        If ``distribution`` is empty.
    """
    values = np.sort(np.asarray(distribution, dtype=float))
    n = len(values)
    if n == 0:
        # Fail with a clear message instead of a ZeroDivisionError from 2 / n.
        raise ValueError("gini_coefficient requires at least one value")

    # Rank of each sorted value, centred on the mean rank (n + 1) / 2.
    centered_ranks = np.arange(1, n + 1) - (n + 1) / 2
    return (2 / n) * np.sum(centered_ranks * values) / np.sum(values)

# Score how unevenly the label proportions are spread, then report the
# result rounded to two decimal places.
gini = gini_coefficient(distribution=label_distribution.values)
print(f"Gini Coefficient: {gini:.2f}")


In [None]:
# Gini values strictly below this cutoff are reported as "balanced".
# The cutoff is a judgement call; adjust it to suit the task.
threshold = 0.15

# Report the verdict: one print call, message chosen by the comparison.
print(
    "Balanced Distribution: The sentiment label distribution is balanced."
    if gini < threshold
    else "Unbalanced Distribution: Consider rebalancing the dataset for better performance."
)
