AI helped me a lot with this code, since I had never run a logistic regression before.

STEP 1: Preparing the DATA

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Load the source workbook (path is relative to the notebook's working directory).
RAW_DATA_FILE = 'normalized_controls_dataset_correct_pt2.xlsx'
df = pd.read_excel(RAW_DATA_FILE)

In [3]:
# Sanity check of the loaded frame: column names first, then (rows, columns).
print(df.columns, df.shape, sep="\n")

Index(['speechID', 'memberID', 'partyID', 'constID', 'title', 'date', 'year',
       'member_name', 'party_name', 'const_name', 'speech', 'clean_speech',
       'main_topic', 'Anglo-Irish Agreement', 'Governance UK/Ireland',
       'Cross-Border Legal Frameworks', 'North-South Coordination',
       'Political Dialogue & Prisoner Releases',
       'Good Friday Agreement Protections',
       'Paramilitary Activities & Ceasefires',
       'Agreement Negotiation & Implementation',
       'Security Reforms (Decommissioning & Policing)',
       'Human Rights & Prisoner Policies',
       'Terrorism, Paramilitary Violence, and Security Cooperation',
       'Peace Agreements and Political Settlements',
       'UK/Ireland & Cross-Border Relations', 'ni_score', 'ni_similarity',
       'violence_deaths', 'nationalist_pop', 'unionist_pop',
       'five_year_period', 'border_constituency', 'partycode', 'rile',
       'embeddings'],
      dtype='object')
(4619, 36)


In [None]:
# Cluster columns: averaged per group and later thresholded into the binary DVs.
cluster_cols = [
    'Terrorism, Paramilitary Violence, and Security Cooperation',
    'Peace Agreements and Political Settlements',
    'UK/Ireland & Cross-Border Relations',
]

# Numeric predictors that are averaged within each group.
iv_cols = [
    'violence_deaths',
    'nationalist_pop',
    'unionist_pop',
    'rile',
]



In [5]:
# Collapse the rows to one per (year, party, constituency): cluster columns and
# numeric predictors are averaged, the border flag is taken from the first row
# of each group.
agg_spec = dict.fromkeys(cluster_cols + iv_cols, 'mean')
agg_spec['border_constituency'] = 'first'

group_keys = ['year', 'partyID', 'constID']
grouped = df.groupby(group_keys).agg(agg_spec).reset_index()


In [6]:
# Cut-off used to binarise the cluster columns: 1 when the group mean is at or
# above the threshold, 0 otherwise.
threshold = 0.15

# Names of the binary dependent variables (DVs), one per cluster column.
dv_columns = [f'{col}_bin' for col in cluster_cols]

for col, dv_name in zip(cluster_cols, dv_columns):
    grouped[dv_name] = grouped[col].ge(threshold).astype(int)

In [7]:
# After adding the binary columns: show (rows, columns), then the column names.
print(grouped.shape, grouped.columns, sep="\n")

(1049, 14)
Index(['year', 'partyID', 'constID',
       'Terrorism, Paramilitary Violence, and Security Cooperation',
       'Peace Agreements and Political Settlements',
       'UK/Ireland & Cross-Border Relations', 'violence_deaths',
       'nationalist_pop', 'unionist_pop', 'rile', 'border_constituency',
       'Terrorism, Paramilitary Violence, and Security Cooperation_bin',
       'Peace Agreements and Political Settlements_bin',
       'UK/Ireland & Cross-Border Relations_bin'],
      dtype='object')


In [None]:
# Defining violence levels based on violence_deaths
def violence_level(val):
    """Map a violence-deaths value to an ordinal violence level.

    Parameters
    ----------
    val : number
        Deaths figure for one row. After grouping this is a group mean, so it
        may be fractional.

    Returns
    -------
    int or float
        0 when ``val`` equals 0, 1 for any other value up to and including 25,
        2 for values above 25, and ``nan`` when ``val`` is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing value
    # would fall through to the last branch and be coded as level 2.
    # Rows that come back as NaN must be dropped or imputed before modelling.
    if pd.isna(val):
        return np.nan
    if val == 0:
        return 0
    if val <= 25:
        return 1
    return 2

def rile_to_category(r):
    """Map a ``rile`` score to a three-way category.

    Parameters
    ----------
    r : number
        The ``rile`` value for one row (a group mean after aggregation).

    Returns
    -------
    int or float
        0 (left) when ``r`` < -10, 2 (right) when ``r`` > 10, 1 (center) for
        everything in between (both bounds included), and ``nan`` when ``r``
        is missing.
    """
    # Comparisons with NaN are always False, so without this guard a missing
    # score would silently be coded as "center".
    # Rows that come back as NaN must be dropped or imputed before modelling.
    if pd.isna(r):
        return np.nan
    if r < -10:
        return 0  # left
    if r > 10:
        return 2  # right
    return 1  # center

# Derive the categorical predictors element-wise from the averaged raw columns.
grouped['rile_cat'] = grouped['rile'].map(rile_to_category)
grouped['violence_level'] = grouped['violence_deaths'].map(violence_level)


In [None]:
def create_milestone_variable(df, important_years=None):
    """Return a copy of ``df`` with a binary ``milestone`` column added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a ``year`` column.
    important_years : iterable of int, optional
        Years to flag with 1. Defaults to
        1969, 1972, 1985, 1993, 1998, 2001, 2007, 2016 and 2019.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``milestone`` is 1 where ``year`` is one of
        ``important_years`` and 0 elsewhere. The input frame is left unchanged.
    """
    if important_years is None:
        important_years = [1969, 1972, 1985, 1993, 1998, 2001, 2007, 2016, 2019]

    # Work on a copy so the caller's frame is not modified as a side effect.
    result = df.copy()

    # Start from 0 everywhere, then flag the significant years.
    result['milestone'] = 0
    result.loc[result['year'].isin(important_years), 'milestone'] = 1

    return result

# Add the milestone flag to the grouped frame.
grouped = grouped.pipe(create_milestone_variable)

In [10]:
# (rows, columns) first, then the column names, after adding the category columns.
print(grouped.shape, grouped.columns, sep="\n")

(1049, 17)
Index(['year', 'partyID', 'constID',
       'Terrorism, Paramilitary Violence, and Security Cooperation',
       'Peace Agreements and Political Settlements',
       'UK/Ireland & Cross-Border Relations', 'violence_deaths',
       'nationalist_pop', 'unionist_pop', 'rile', 'border_constituency',
       'Terrorism, Paramilitary Violence, and Security Cooperation_bin',
       'Peace Agreements and Political Settlements_bin',
       'UK/Ireland & Cross-Border Relations_bin', 'rile_cat', 'violence_level',
       'milestone'],
      dtype='object')


In [11]:
# Independent variables (IVs); this rebinds the earlier list of raw column names.
iv_cols = ['violence_level', 'nationalist_pop', 'unionist_pop']

# Control variables: period dummies, milestone flag, border flag, rile category.
control_cols = [
    'decade_50s_70s',
    'decade_70s_90s',
    'decade_90s_10s',
    'milestone',
    'border_constituency',
    'rile_cat',
]


In [12]:
# Period dummies derived from year: 1 when start <= year < stop, else 0.
# NOTE(review): the windows overlap (e.g. 1970-1979 is flagged by both of the
# first two dummies) -- confirm this is intended.
period_bounds = {
    'decade_50s_70s': (1950, 1980),
    'decade_70s_90s': (1970, 2000),
    'decade_90s_10s': (1990, 2020),
}
year_col = grouped['year']
for period_name, (start, stop) in period_bounds.items():
    grouped[period_name] = (year_col.ge(start) & year_col.lt(stop)).astype(int)

# Store the border flag as an integer column.
grouped['border_constituency'] = grouped['border_constituency'].astype(int)

In [13]:
# (rows, columns) first, then the column names, after adding the period dummies.
print(grouped.shape, grouped.columns, sep="\n")

(1049, 20)
Index(['year', 'partyID', 'constID',
       'Terrorism, Paramilitary Violence, and Security Cooperation',
       'Peace Agreements and Political Settlements',
       'UK/Ireland & Cross-Border Relations', 'violence_deaths',
       'nationalist_pop', 'unionist_pop', 'rile', 'border_constituency',
       'Terrorism, Paramilitary Violence, and Security Cooperation_bin',
       'Peace Agreements and Political Settlements_bin',
       'UK/Ireland & Cross-Border Relations_bin', 'rile_cat', 'violence_level',
       'milestone', 'decade_50s_70s', 'decade_70s_90s', 'decade_90s_10s'],
      dtype='object')


STEP 2: Calculating VIF

In [14]:
# Multicollinearity check for the violence model.
# The Variance Inflation Factor (VIF) is computed for every predictor (plus the
# added constant) so that strongly correlated IVs can be spotted before fitting
# the regressions. Rows with missing values are dropped first.
vif_cols_violence = [
    'violence_level',
    'decade_50s_70s',
    'decade_70s_90s',
    'decade_90s_10s',
    'milestone',
    'border_constituency',
    'rile_cat',
]
df_vif_violence = grouped.loc[:, vif_cols_violence].dropna()
X_vif_violence = sm.add_constant(df_vif_violence)

design_matrix = X_vif_violence.values
vif_data_violence = pd.DataFrame({
    "variable": X_vif_violence.columns,
    "VIF": [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx in range(design_matrix.shape[1])
    ],
})

print("### VIF – Violence Model ###")
print(vif_data_violence)


### VIF – Violence Model ###
              variable        VIF
0                const  19.486111
1       violence_level   3.112652
2       decade_50s_70s   2.381656
3       decade_70s_90s   2.693947
4       decade_90s_10s   2.199331
5            milestone   1.136384
6  border_constituency   1.003997
7             rile_cat   1.177170


In [15]:
# Keep only the rows from 1998 onward (1998 itself included); work on a copy.
from_1998 = grouped['year'].ge(1998)
df_opinion = grouped.loc[from_1998].copy()

# Predictors of the opinion-based model; rows with missing values are dropped
# before the VIF of each predictor (plus the added constant) is computed.
vif_cols_opinion = [
    'nationalist_pop',
    'unionist_pop',
    'milestone',
    'border_constituency',
    'rile_cat',
]
df_vif_opinion = df_opinion.loc[:, vif_cols_opinion].dropna()
X_vif_opinion = sm.add_constant(df_vif_opinion)

opinion_matrix = X_vif_opinion.values
vif_data_opinion = pd.DataFrame({
    "variable": X_vif_opinion.columns,
    "VIF": [
        variance_inflation_factor(opinion_matrix, col_idx)
        for col_idx in range(opinion_matrix.shape[1])
    ],
})

print("\n### VIF – Opinion Model (Post-1998) ###")
print(vif_data_opinion)


### VIF – Opinion Model (Post-1998) ###
              variable         VIF
0                const  222.886541
1      nationalist_pop    1.134489
2         unionist_pop    1.267311
3            milestone    1.255923
4  border_constituency    1.029271
5             rile_cat    1.191224


Violence Regression Model

In [16]:
# Interaction terms for the violence model: each new column is the product of
# the two listed columns.
violence_interactions = {
    "violence_border": ("violence_level", "border_constituency"),
    "milestone_border": ("milestone", "border_constituency"),
    "violence_milestone": ("violence_level", "milestone"),
    "violence_rile": ("violence_level", "rile_cat"),
    "border_rile": ("border_constituency", "rile_cat"),
    "milestone_rile": ("milestone", "rile_cat"),
}
for term, (left, right) in violence_interactions.items():
    grouped[term] = grouped[left] * grouped[right]

violence_iv_controls = vif_cols_violence + list(violence_interactions)

# Logistic regression for each DV. Predictors are added one at a time so that a
# failing or non-converging specification can be pinned to the variable that
# introduced it; only the summary of the full model is stored.
results_violence = {}
for dv in dv_columns:
    if dv not in grouped.columns:
        print(f"❌ Error: Column '{dv}' not found in the grouped DataFrame.")
        continue

    print(f"\n🔍 Analysis for DV: {dv}")
    print("Value distribution:")
    print(grouped[dv].value_counts())

    # Fit every step on the same complete-case sample: a missing value would
    # otherwise abort the fit, and nested models would not be comparable.
    model_df = grouped[[dv] + violence_iv_controls].dropna()
    n_dropped = len(grouped) - len(model_df)
    if n_dropped:
        print(f"⚠️ Dropped {n_dropped} rows with missing values before fitting.")
    y = model_df[dv]

    print("Step-by-step test for IVs...\n")
    model = None
    for i in range(len(violence_iv_controls) + 1):
        iv_subset = violence_iv_controls[:i]
        try:
            X = sm.add_constant(model_df[iv_subset])
            model = sm.Logit(y, X).fit(disp=0)
        except Exception as e:
            # Best-effort diagnostic loop: report the failure and keep going.
            print(f"❌ Error with IV: {iv_subset} --> {e}")
            model = None
            continue

        # disp=0 hides the optimizer output, so check convergence explicitly
        # instead of reporting every fit that did not raise as OK.
        retvals = getattr(model, "mle_retvals", None) or {}
        if retvals.get("converged", True):
            print(f"✅ OK with IV: {iv_subset}")
        else:
            print(f"⚠️ Not converged with IV: {iv_subset}")

    # `model` now refers to the last (full) specification, or None if it failed.
    if model is not None:
        results_violence[dv] = model.summary()

for dv, result in results_violence.items():
    print(f"\nRegression result for {dv} (Violence Model):")
    print(result)


🔍 Analysis for DV: Terrorism, Paramilitary Violence, and Security Cooperation_bin
Value distribution:
Terrorism, Paramilitary Violence, and Security Cooperation_bin
0    751
1    298
Name: count, dtype: int64
Step-by-step test for IVs...

✅ OK with IV: []
✅ OK with IV: ['violence_level']
✅ OK with IV: ['violence_level', 'decade_50s_70s']
✅ OK with IV: ['violence_level', 'decade_50s_70s', 'decade_70s_90s']
✅ OK with IV: ['violence_level', 'decade_50s_70s', 'decade_70s_90s', 'decade_90s_10s']
✅ OK with IV: ['violence_level', 'decade_50s_70s', 'decade_70s_90s', 'decade_90s_10s', 'milestone']
✅ OK with IV: ['violence_level', 'decade_50s_70s', 'decade_70s_90s', 'decade_90s_10s', 'milestone', 'border_constituency']
✅ OK with IV: ['violence_level', 'decade_50s_70s', 'decade_70s_90s', 'decade_90s_10s', 'milestone', 'border_constituency', 'rile_cat']
✅ OK with IV: ['violence_level', 'decade_50s_70s', 'decade_70s_90s', 'decade_90s_10s', 'milestone', 'border_constituency', 'rile_cat', 'violence_

Public Opinion Regression Model

In [17]:
# Interaction terms for the opinion model: each new column is the product of
# the two listed columns.
opinion_interactions = {
    "milestone_nationalist": ("milestone", "nationalist_pop"),
    "milestone_unionist": ("milestone", "unionist_pop"),
    "rile_nationalist": ("rile_cat", "nationalist_pop"),
    "rile_unionist": ("rile_cat", "unionist_pop"),
    "border_nationalist": ("border_constituency", "nationalist_pop"),
    "border_unionist": ("border_constituency", "unionist_pop"),
}
for term, (left, right) in opinion_interactions.items():
    df_opinion[term] = df_opinion[left] * df_opinion[right]

# Define opinion-related IVs
opinion_iv_controls = vif_cols_opinion + list(opinion_interactions)

# Logistic regression for each DV on the 1998-onward subset. Predictors are
# added one at a time so that a failing or non-converging specification can be
# pinned to the variable that introduced it; only the summary of the full
# model is stored.
results_opinion = {}
for dv in dv_columns:
    if dv not in df_opinion.columns:
        print(f"❌ Error: Column '{dv}' not found in df_opinion.")
        continue

    print(f"\n🔍 Opinion Model – DV: {dv}")
    print("Value distribution:")
    print(df_opinion[dv].value_counts())

    # Fit every step on the same complete-case sample: a missing value would
    # otherwise abort the fit, and nested models would not be comparable.
    model_df = df_opinion[[dv] + opinion_iv_controls].dropna()
    n_dropped = len(df_opinion) - len(model_df)
    if n_dropped:
        print(f"⚠️ Dropped {n_dropped} rows with missing values before fitting.")
    y = model_df[dv]

    print("Step-by-step test for IVs...\n")
    model = None
    for i in range(len(opinion_iv_controls) + 1):
        iv_subset = opinion_iv_controls[:i]
        try:
            X = sm.add_constant(model_df[iv_subset])
            model = sm.Logit(y, X).fit(disp=0)
        except Exception as e:
            # Best-effort diagnostic loop: report the failure and keep going.
            print(f"❌ Error with IV: {iv_subset} --> {e}")
            model = None
            continue

        # disp=0 hides the optimizer output, so check convergence explicitly
        # instead of reporting every fit that did not raise as OK.
        retvals = getattr(model, "mle_retvals", None) or {}
        if retvals.get("converged", True):
            print(f"✅ OK with IV: {iv_subset}")
        else:
            print(f"⚠️ Not converged with IV: {iv_subset}")

    # `model` now refers to the last (full) specification, or None if it failed.
    if model is not None:
        results_opinion[dv] = model.summary()

# Display results
for dv, result in results_opinion.items():
    print(f"\nRegression result for {dv} (Opinion Model):")
    print(result)


🔍 Opinion Model – DV: Terrorism, Paramilitary Violence, and Security Cooperation_bin
Value distribution:
Terrorism, Paramilitary Violence, and Security Cooperation_bin
0    240
1     82
Name: count, dtype: int64
Step-by-step test for IVs...

✅ OK with IV: []
✅ OK with IV: ['nationalist_pop']
✅ OK with IV: ['nationalist_pop', 'unionist_pop']
✅ OK with IV: ['nationalist_pop', 'unionist_pop', 'milestone']
✅ OK with IV: ['nationalist_pop', 'unionist_pop', 'milestone', 'border_constituency']
✅ OK with IV: ['nationalist_pop', 'unionist_pop', 'milestone', 'border_constituency', 'rile_cat']
✅ OK with IV: ['nationalist_pop', 'unionist_pop', 'milestone', 'border_constituency', 'rile_cat', 'milestone_nationalist']
✅ OK with IV: ['nationalist_pop', 'unionist_pop', 'milestone', 'border_constituency', 'rile_cat', 'milestone_nationalist', 'milestone_unionist']
✅ OK with IV: ['nationalist_pop', 'unionist_pop', 'milestone', 'border_constituency', 'rile_cat', 'milestone_nationalist', 'milestone_unionis

In [18]:
# Write every stored model summary to one text file: the violence models first,
# then the opinion models, each section introduced by its own header.
with open('model_results.txt', 'w') as f:
    report_sections = (
        ("### Results of Violence Models ###\n", results_violence),
        ("\n\n### Results of Opinion Models ###\n", results_opinion),
    )
    for header, section_results in report_sections:
        f.write(header)
        for dv, result in section_results.items():
            f.write(f"\n--- Results for {dv} ---\n")
            f.write(result.as_text())

In [19]:
# Export the grouped frame (with all derived columns) without the index column.
GROUPED_OUTPUT_FILE = 'grouped_data.xlsx'
grouped.to_excel(GROUPED_OUTPUT_FILE, index=False)