AI helped me with the normalization by suggesting methods and libraries.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Workbook holding the speech-level table with the non-normalized topic columns.
TOPICS_WORKBOOK = "df_topics_non_normalized.xlsx"

# Read the workbook into the working frame used by every later cell.
df = pd.read_excel(TOPICS_WORKBOOK)

In [3]:
# Show the column labels of the loaded frame (metadata fields plus the topic_N_prob columns).
df.columns

Index(['speechID', 'memberID', 'partyID', 'constID', 'title', 'date',
       'member_name', 'party_name', 'const_name', 'speech', 'ni_score',
       'five_year_period', 'ni_similarity', 'clean_speech', 'embeddings',
       'topic_0_prob', 'topic_1_prob', 'topic_2_prob', 'topic_3_prob',
       'topic_4_prob', 'topic_5_prob', 'topic_6_prob', 'topic_7_prob',
       'topic_8_prob', 'topic_9_prob', 'topic_10_prob', 'topic_11_prob',
       'topic_12_prob', 'topic_13_prob', 'topic_14_prob', 'topic_15_prob',
       'topic_16_prob', 'topic_17_prob', 'main_topic', 'nationalist_pop',
       'unionist_pop', 'violence_deaths'],
      dtype='object')


STEP 1: Cleaning and Merging Topics


In [None]:


# Clean up the topic columns: discard the unused topics, merge related topics
# into combined columns, and give every kept topic a readable name.
# The cell is written so that running it a second time leaves `df` unchanged.

# Drop irrelevant topics.
# No `inplace`, and `errors='ignore'`, so a re-run does not raise KeyError once
# these columns are already gone.
irrelevant_topics = ['topic_0_prob', 'topic_1_prob', 'topic_15_prob', 'topic_7_prob']
df = df.drop(columns=irrelevant_topics, errors='ignore')

# Similar topics to merge: merged column key -> the two topic columns that are added.
merged_topics = {
    'paramilitarism_ceasefire_prob': ('topic_6_prob', 'topic_13_prob'),
    'agreement_negotiation_implementation_prob': ('topic_4_prob', 'topic_8_prob'),
    'security_reforms_prob': ('topic_9_prob', 'topic_12_prob'),
    'human_rights_and_prisoners_prob': ('topic_16_prob', 'topic_17_prob'),
}

# Rename topics: source column key -> final display name.
# Later cells read the VALUES of this dict (in this order) as the list of final
# topic columns, so its contents and ordering are kept as they were.
rename_dict = {
    'paramilitarism_ceasefire_prob': 'Paramilitary Activities & Ceasefires',
    'human_rights_and_prisoners_prob': 'Human Rights & Prisoner Policies',
    'topic_5_prob': 'Cross-Border Legal Frameworks',
    'topic_11_prob': 'Political Dialogue & Prisoner Releases',
    'agreement_negotiation_implementation_prob': 'Agreement Negotiation & Implementation',
    'security_reforms_prob': 'Security Reforms (Decommissioning & Policing)',
    'topic_14_prob': 'Good Friday Agreement Protections',
    'topic_2_prob': 'Anglo-Irish Agreement',
    'topic_3_prob': 'Governance UK/Ireland',
    'topic_10_prob': 'North-South Coordination'
}

# Merge similar topics, writing each sum directly under its final display name.
# On a re-run this overwrites the existing column instead of recreating the
# intermediate column and ending up with duplicate labels after the rename.
for merged_name, (first_topic, second_topic) in merged_topics.items():
    df[rename_dict[merged_name]] = df[first_topic] + df[second_topic]

# Rename the remaining single-topic columns. Labels that are no longer present
# (already renamed by an earlier run) are skipped by `rename`.
single_topic_names = {
    old_name: new_name
    for old_name, new_name in rename_dict.items()
    if old_name not in merged_topics
}
df = df.rename(columns=single_topic_names)



 STEP 2: Compute Clusters (on raw topic values)


In [None]:


# Thematic clusters: cluster column name -> the renamed topic columns it groups.
cluster_map = {
    "Terrorism, Paramilitary Violence, and Security Cooperation": [
        'Cross-Border Legal Frameworks',
        'Paramilitary Activities & Ceasefires',
        'Political Dialogue & Prisoner Releases',
    ],
    "Peace Agreements and Political Settlements": [
        'Agreement Negotiation & Implementation',
        'Security Reforms (Decommissioning & Policing)',
        'Good Friday Agreement Protections',
        'Human Rights & Prisoner Policies',
    ],
    "UK/Ireland & Cross-Border Relations": [
        'Anglo-Irish Agreement',
        'Governance UK/Ireland',
        'North-South Coordination',
    ],
}

# Add one column per cluster: the row-wise total of its member topics,
# computed from the raw (not yet normalized) topic values.
for cluster_label in cluster_map:
    member_topics = cluster_map[cluster_label]
    df[cluster_label] = df.loc[:, member_topics].sum(axis="columns")


 STEP 3: Annual Aggregates


In [None]:

# Derive the calendar year from `date`, unless a `year` column is already there.
if 'year' not in df.columns:
    speech_dates = pd.to_datetime(df['date'])
    df['year'] = speech_dates.dt.year

# Columns to aggregate: every renamed topic, followed by every cluster total.
final_topics = [*rename_dict.values()]
cluster_cols = [*cluster_map]

# One row per year holding the mean of each topic and cluster column.
yearly_means = df.groupby('year')[final_topics + cluster_cols].mean()
df_yearly = yearly_means.reset_index()


In [None]:

# Topic and cluster columns that get rescaled; all other columns are left untouched.
value_cols = final_topics + cluster_cols

# A single scaler object, refit separately for each of the two tables below.
scaler = MinMaxScaler()

# STEP 4a: Normalize yearly averages for plotting
# The scaler is fit on the yearly means themselves, so each column's range
# is taken across years.
yearly_scaled = scaler.fit_transform(df_yearly[value_cols])
df_yearly_normalized = df_yearly.copy()
df_yearly_normalized[value_cols] = yearly_scaled

df_yearly_normalized.to_excel("normalized_topics_and_clusters_correct.xlsx", index=False)

# Describe statistics for yearly normalized data
yearly_summary = df_yearly_normalized[value_cols].describe()
print("Yearly Normalized Data Description:")
print(yearly_summary)


# STEP 4b: Normalize full data for regression/analysis
# The scaler is refit here on the row-level frame, replacing the yearly fit above.
full_scaled = scaler.fit_transform(df[value_cols])
df_normalized = df.copy()
df_normalized[value_cols] = full_scaled

df_normalized.to_excel("full_normalized_dataset_correct.xlsx", index=False)

# Describe statistics for full normalized data
full_summary = df_normalized[value_cols].describe()
print("\nFull Normalized Data Description:")
print(full_summary)

Yearly Normalized Data Description:
       Paramilitary Activities & Ceasefires  Human Rights & Prisoner Policies  \
count                             70.000000                         70.000000   
mean                               0.192535                          0.292770   
std                                0.173069                          0.165844   
min                                0.000000                          0.000000   
25%                                0.093990                          0.186455   
50%                                0.144838                          0.270270   
75%                                0.229410                          0.371975   
max                                1.000000                          1.000000   

       Cross-Border Legal Frameworks  Political Dialogue & Prisoner Releases  \
count                      70.000000                               70.000000   
mean                        0.141578                                0.1784