**Dataset used:** https://ucdp.uu.se/country/771

**Goal of ML:** To predict the likelihood of cross-border conflict events involving Bangladesh using historical event-level data, in a theoretical early-warning scenario intended to inform preparedness and alertness planning.

# Initial Setup

In [None]:
# %pip install ydata-profiling gradio

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import OneClassSVM
from sklearn.metrics import f1_score, recall_score
from ydata_profiling import ProfileReport
import gradio as gr
import pickle

print("Libraries loaded successfully.")

Libraries loaded successfully.


# Task 1: Data Loading

In [None]:
# Load the UCDP event export from the working directory into a DataFrame.
# index_col=False tells pandas not to use the first CSV column as the row index.
df = pd.read_csv('gedevents-2026-01-17.csv', index_col=False)
print(f"Data loaded. Shape: {df.shape}")
# Last expression of the cell: renders the first five rows as the cell output.
df.head()

Data loaded. Shape: (693, 47)


Unnamed: 0,id,relid,year,active_year,code_status,type_of_violence,conflict_dset_id,conflict_new_id,conflict_name,dyad_dset_id,...,date_prec,date_start,date_end,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best_est,high_est,low_est
0,210432,BNG-2015-3-2087-0,2015,False,Clear,3,1218,2009,JMB - Civilians,1218,...,1,09/04/2015 00:00:00,09/04/2015 00:00:00,0,0,2,0,2,2,2
1,210433,BNG-2015-3-1076-1,2015,True,Clear,3,234,506,IS - Civilians,234,...,1,09/28/2015 00:00:00,09/28/2015 00:00:00,0,0,1,0,1,1,1
2,210436,BNG-2015-3-1076-2,2015,True,Clear,3,234,506,IS - Civilians,234,...,1,10/03/2015 00:00:00,10/03/2015 00:00:00,0,0,1,0,1,1,1
3,210459,BNG-2015-3-1076-3,2015,True,Clear,3,234,506,IS - Civilians,234,...,1,10/24/2015 00:00:00,10/24/2015 00:00:00,0,0,1,0,1,1,1
4,210470,BNG-2015-1-14718-0,2015,False,Clear,1,13674,13674,Bangladesh: Islamic State,14718,...,1,11/04/2015 00:00:00,11/04/2015 00:00:00,1,0,0,0,1,1,1


In [None]:
# Subtask: EDA
# ProfileReport(df, title="EDA", explorative=True).to_file("EDA-Report.html") # For generating a downloadable HTML report
# ProfileReport(df, title="EDA", explorative=True) # To view inside colab without requiring download



*   The dataset contains event-level armed conflict records from the Uppsala Conflict Data Program (UCDP).

*   Each row represents a single conflict event with temporal, geographic, and conflict-related attributes.

*   The objective is to predict whether an event involved a cross-border incident.

# Task 2: Data Preprocessing

In [None]:
# Step 1: Target Definition (Feature Engineering)
# Identify Border Violations (Class 1) vs Internal (Class 0)
def define_target(row: pd.Series) -> int:
    """Label one event row as a border violation (1) or an internal event (0).

    The two actor names and the source headline (when that column exists) are
    joined into one lower-cased string and scanned for border-related terms.

    Two matching modes are used:

    * Unambiguous phrases ('border', 'crossing', ...) match anywhere in the
      text, so compound forms such as 'cross-border' still count.
    * Acronyms and the words 'rifles' / 'fence' must begin a word. Plain
      substring matching would let 'fence' fire on 'defence' or 'offence'
      and 'rifles' on 'trifles', mislabelling unrelated events. Suffixed
      forms such as 'fences' are still accepted.

    Args:
        row: One event record. Must provide 'side_a' and 'side_b';
            'source_headline' is optional.

    Returns:
        1 if any border term is found, otherwise 0.

    Raises:
        KeyError: If 'side_a' or 'side_b' is missing from the row.
    """
    import re  # local import: `re` is not loaded by the notebook's import cell

    text = f"{str(row['side_a'])} {str(row['side_b'])} {str(row.get('source_headline', ''))}".lower()

    substring_keywords = ('border', 'crossing', 'pushback', "no man's land", 'zero line')
    word_start_keywords = ('bsf', 'bgb', 'bgp', 'bdr', 'rifles', 'fence')

    if any(k in text for k in substring_keywords):
        return 1

    # Leading \b only: the term must start a word but may carry a suffix.
    word_start_pattern = r"\b(?:" + "|".join(re.escape(k) for k in word_start_keywords) + ")"
    return 1 if re.search(word_start_pattern, text) else 0

# Apply the keyword labeller row by row (axis=1 passes each row as a Series).
df['is_border_violation'] = df.apply(define_target, axis=1)

# Step 2: Feature Extraction (Temporal)
# Parse the event start timestamp and keep only its calendar month number.
df['month'] = pd.to_datetime(df['date_start']).dt.month

# Step 3: Handling Missing Values
# Drop every row that lacks any one of the five model features.
df = df.dropna(subset=['latitude', 'longitude', 'year', 'type_of_violence', 'month'])

# Step 4: Feature Selection
# Inputs: location, time and violence category. Label: the keyword-derived flag.
X = df[['latitude', 'longitude', 'year', 'month', 'type_of_violence']]
y = df['is_border_violation']

# Step 5: Stratified Split (80/20)
# Uses stratify=y to maintain the 8/693 ratio in both sets
# NOTE(review): with so few positive rows, the 20% test split presumably holds
# only one or two positives, so test-set metrics for class 1 will be very coarse.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Summary of the rows kept after dropna and how many of them are labelled 1.
print(f"Total Events: {len(df)}")
print(f"Identified Border Violations: {df['is_border_violation'].sum()}")


Total Events: 693
Identified Border Violations: 8


# Task 3: Pipeline Creation

In [None]:
# Feature preparation: standardise the four numeric columns and one-hot encode
# the violence category (handle_unknown='ignore' so an unseen category does not
# raise at transform time).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['latitude', 'longitude', 'year', 'month']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['type_of_violence']),
    ]
)

# Class weights: one border event (class 1) counts as much as 100 internal ones.
custom_weights = {0: 1, 1: 100}

# Preprocessing followed by a class-weighted random forest.
pipeline = Pipeline(
    steps=[
        ('pre', preprocessor),
        (
            'clf',
            RandomForestClassifier(
                n_estimators=500,             # many trees to reduce variance on the few positives
                criterion='entropy',          # split criterion chosen for the heavy imbalance
                max_depth=None,               # unrestricted depth so specific border coordinates can be isolated
                min_samples_leaf=1,           # with so few positives, a leaf cannot be required to hold 2
                class_weight=custom_weights,
                random_state=42,
            ),
        ),
    ]
)

# Task 4: Primary Model Selection


**Selection:** Random Forest Classifier.

**Justification:** The YData profiling report from the EDA step showed that conflict events are non-linearly distributed across coordinates. Random Forest is well suited to this dataset because it handles the non-linear relationship between geography and conflict type.
It is inherently robust to the noise present in UCDP event reporting and offers 'class_weight' parameters to handle the rarity of border events.

# Task 5: Model Training

In [None]:
# Fit the preprocessing step and the class-weighted random forest on the training split.
# NOTE(review): the pipeline's 'pre' step is the same `preprocessor` object that
# later cells call fit_transform on directly, so those refits presumably change
# this fitted pipeline as well — confirm before reusing `pipeline` afterwards.
pipeline.fit(X_train, y_train)
print("Initial Model Training Complete.")

Initial Model Training Complete.


# Task 6: Cross-Validation

In [None]:
# --- 1. Robust Validation Setup ---
# 4 stratified folds repeated 25 times gives 100 train/validation splits;
# random_state fixes the fold assignment so the splits are the same on every run.
rskf = RepeatedStratifiedKFold(n_splits=4, n_repeats=25, random_state=42)

# --- 2. Manual Minority Upsampling (index-based) ---
def upsample_minority(X: pd.DataFrame, y: pd.Series, target_count: int = 500, random_state=None) -> tuple[pd.DataFrame, pd.Series]:
    """Return a shuffled copy of (X, y) with the minority class resampled.

    All rows with ``y == 0`` are kept once. Rows with ``y == 1`` are drawn
    with replacement until there are ``target_count`` of them. Rows are
    selected by index label, so ``X`` and ``y`` must share the same index
    (labels are assumed to be unique; duplicated labels would be expanded
    by ``.loc``).

    Args:
        X: Feature frame indexed like ``y``.
        y: Binary labels; 1 marks the minority class, 0 the majority.
        target_count: Number of minority rows in the result.
        random_state: Optional integer seed. When None (the default) NumPy's
            global random state is used, as before, so ``np.random.seed``
            still controls the outcome.

    Returns:
        ``(X_resampled, y_resampled)`` in the same shuffled row order.

    Raises:
        ValueError: If ``y`` has no class-1 rows while ``target_count`` > 0.
    """
    # Sample index labels first and materialise the rows once at the end.
    minority_indices = y[y == 1].index.to_numpy()
    majority_indices = y[y == 0].index.to_numpy()

    if target_count > 0 and minority_indices.size == 0:
        raise ValueError("upsample_minority: y contains no minority (class 1) rows to resample.")

    # np.random (module) and RandomState expose the same choice/shuffle API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Randomly sample minority labels with replacement
    upsampled_min_idx = rng.choice(minority_indices, size=target_count, replace=True)

    # Combine and shuffle so the classes are interleaved
    new_indices = np.concatenate([majority_indices, upsampled_min_idx])
    rng.shuffle(new_indices)

    return X.loc[new_indices], y.loc[new_indices]

# --- 3. Model Configurations ---
# Two supervised classifiers plus one anomaly detector, all compared on the same folds.
models = {
    "RandomForest (Balanced)": RandomForestClassifier(n_estimators=1000, max_depth=5, class_weight='balanced_subsample', random_state=42),
    "AdaBoost (Stumps)": AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1), n_estimators=50, random_state=42),
    "OneClassSVM (Anomaly)": OneClassSVM(kernel='rbf', nu=0.01) # Warning: Quadratic complexity, limit N < 10k
}

# --- 4. The Comparison Loop ---
results = {name: {'f1': [], 'recall': []} for name in models}
print("Starting Robust Cross-Validation (100 runs per model)...")

# upsample_minority draws from NumPy's global random state. Seed it once so the
# resampling, and therefore every score below, is reproducible across runs.
np.random.seed(42)

for train_idx, val_idx in rskf.split(X_train, y_train):
    # Positional slices of the training split for this fold
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Fit the preprocessor on the fold's training part only, then apply it to both parts
    X_tr_proc = preprocessor.fit_transform(X_tr)
    X_val_proc = preprocessor.transform(X_val)

    # Apply Upsampling only to Training Data (Prevents Leakage)
    X_tr_up, y_tr_up = upsample_minority(pd.DataFrame(X_tr_proc, index=X_tr.index), y_tr, target_count=len(y_tr))

    # Explicit NumPy mask: X_tr_proc is positional, so do not index it with a pandas Series
    majority_mask = (y_tr == 0).to_numpy()

    for name, model in models.items():
        if "OneClass" in name:
            # Anomaly detection trains only on Majority Class (Internal Conflict);
            # a prediction of -1 (outlier) is mapped to class 1.
            model.fit(X_tr_proc[majority_mask])
            preds = np.where(model.predict(X_val_proc) == -1, 1, 0)
        else:
            # Classifiers train on Upsampled Balanced Data
            model.fit(X_tr_up, y_tr_up)
            preds = model.predict(X_val_proc)

        # zero_division=0 keeps the score at 0 for a class that is never predicted,
        # without emitting an undefined-metric warning on every fold.
        results[name]['f1'].append(f1_score(y_val, preds, average='macro', zero_division=0))
        results[name]['recall'].append(recall_score(y_val, preds, pos_label=1, zero_division=0))

# --- 5. Display Results ---
print("--- BATTLE RESULTS ---")
for name, metrics in results.items():
    print(f"{name}: F1={np.mean(metrics['f1']):.3f} (+/- {np.std(metrics['f1']):.3f}) | Border Recall={np.mean(metrics['recall']):.3f}")

Starting Robust Cross-Validation (100 runs per model)...

--- BATTLE RESULTS ---
RandomForest (Balanced): F1=0.577 (+/- 0.151) | Border Recall=0.170
AdaBoost (Stumps): F1=0.595 (+/- 0.133) | Border Recall=0.320
OneClassSVM (Anomaly): F1=0.556 (+/- 0.056) | Border Recall=0.745


# Task 7: Hyperparameter Tuning


In [None]:
# We optimize the best performing model (AdaBoost) from the Cross-Validation step.
# Parameters tested: n_estimators, learning_rate

prob_model_name = "AdaBoost (Stumps)"
base_model = models[prob_model_name]

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.5, 1.0, 1.5]
}

print(f"Tuning {prob_model_name}...")
print(f"Parameters to test: {param_grid}")

# Use a simplified CV for tuning speed, utilizing the preprocessor
grid = GridSearchCV(base_model, param_grid, cv=3, scoring='f1_macro', n_jobs=-1)

# Preprocess full training set for grid search
X_train_proc = preprocessor.fit_transform(X_train)
# Upsample once for grid search to handle imbalance during tuning
# NOTE(review): the upsampling happens before GridSearchCV splits the data, so
# copies of the same minority row can land in both the training and validation
# part of a fold. best_score_ is therefore optimistic; treat it as a ranking
# signal for the grid, not as an estimate of real performance.
X_train_up, y_train_up = upsample_minority(pd.DataFrame(X_train_proc, index=X_train.index), y_train, target_count=len(y_train))

grid.fit(X_train_up, y_train_up)

# The leading "\n" must stay an escape sequence inside the literal: a raw line
# break in a single-quoted f-string is a SyntaxError.
print(f"\nBest Parameters Found: {grid.best_params_}")
print(f"Best Grid Score: {grid.best_score_:.4f}")


# Task 8: Best Model Selection


In [None]:
# Selecting the final best-performing model based on tuning results
# best_estimator_ is the grid's winning configuration as exposed by GridSearchCV
# after grid.fit on the upsampled training data.
best_prob_model = grid.best_estimator_
print(f"Final Selected Probability Model: {best_prob_model}")

# Refitting the complementary Anomaly Model (OneClassSVM) for the Hybrid System
anomaly_model_name = "OneClassSVM (Anomaly)"
anomaly_model = models[anomaly_model_name]
# Fit on class-0 (internal) rows only, so a later prediction of -1 marks an
# input that does not resemble the internal events seen in training.
anomaly_model.fit(X_train_proc[y_train == 0])
print(f"Final Selected Anomaly Model: {anomaly_model}")

# Wrap into Hybrid System for Deployment
class HybridTacticalModel:
    """Bundle a fitted preprocessor with a probability model and an anomaly model.

    The three components are used as-is: the preprocessor must offer
    ``transform``, the probability model ``predict_proba`` and the anomaly
    model ``predict``.
    """

    def __init__(self, preprocessor, prob_model, anomaly_model):
        """Keep references to the three components; nothing is fitted here."""
        self.preprocessor, self.prob_model, self.anomaly_model = (
            preprocessor,
            prob_model,
            anomaly_model,
        )

    def analyze(self, X_raw):
        """Score the first row of ``X_raw`` with both models.

        Args:
            X_raw: Raw feature rows accepted by the preprocessor. Only the
                first row's results are reported.

        Returns:
            A dict with ``"probability"`` (float, the second column of
            ``predict_proba`` for the first row) and ``"anomaly_detected"``
            (bool, True when the anomaly model predicts -1 for the first row).
        """
        features = self.preprocessor.transform(X_raw)

        # Column 1 of predict_proba for row 0
        positive_prob = self.prob_model.predict_proba(features)[0][1]
        # The anomaly model signals an outlier with the label -1
        first_label = self.anomaly_model.predict(features)[0]

        return dict(
            probability=float(positive_prob),
            anomaly_detected=bool(first_label == -1),
        )

# Bundle the fitted preprocessor and both selected models behind one analyze() call.
hybrid_system = HybridTacticalModel(preprocessor, best_prob_model, anomaly_model)
# The leading "\n" must stay an escape sequence: a raw line break inside a
# single-quoted string literal is a SyntaxError.
print("\nBest Models Selected and Unified into Hybrid System.")


# Task 9: Model Performance Evaluation


In [None]:
# Final check on the held-out test split: per-class precision/recall/F1 plus a confusion matrix.
print("--- FINAL TEST SET PERFORMANCE ---")

# Only the probability component is scored here, using its hard class
# predictions (despite the variable name, y_pred_prob holds labels, not probabilities).
X_test_proc = preprocessor.transform(X_test)
y_pred_prob = best_prob_model.predict(X_test_proc)

print(classification_report(y_true=y_test, y_pred=y_pred_prob))
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred_prob, cmap='Greens')
plt.title("Final Model Performance (Test Set)")
plt.show()


# Task 10: Web Interface (Gradio)


In [None]:
# # Ensure these match your preprocessing exactly
# SECTORS = {
#     "Teknaf Border": (20.86, 92.30), 
#     "Ukhiya Zone": (21.16, 92.14),
#     "Bandarban Hills": (22.19, 92.21), 
#     "Sylhet Border": (24.89, 91.86),
#     "Dhaka Central": (23.81, 90.41)
# }
# MONTHS = {"Jan":1,"Feb":2,"Mar":3,"Apr":4,"May":5,"Jun":6,"Jul":7,"Aug":8,"Sep":9,"Oct":10,"Nov":11,"Dec":12}
# V_TYPES = {"State-based Action": 1, "Non-state Activity": 2, "Civilian Attacks": 3}

# def hybrid_predict(year, sector, month, violence):
#     try:
#         # A. Map Inputs
#         lat, lon = SECTORS[sector]
#         m_num = MONTHS[month]
#         vt = V_TYPES[violence]

#         # B. Create input DataFrame
#         input_data = pd.DataFrame(
#             [[float(lat), float(lon), int(year), int(m_num), int(vt)]],
#             columns=['latitude', 'longitude', 'year', 'month', 'type_of_violence']
#         )

#         # C. Get Hybrid Intelligence
#         analysis = hybrid_system.analyze(input_data)
#         risk_prob = analysis["probability"]
#         anomaly = analysis["anomaly_detected"]

#         # D. Unified Status Logic
#         if risk_prob >= 0.20:
#             status = "🔴 RED ALERT: High Probability Border Threat"
#         elif anomaly:
#             status = "🟡 YELLOW ALERT: Unusual Activity (Anomaly Detected)"
#         else:
#             status = "🟢 GREEN: Routine Internal Patterns"

#         return {
#             "Tactical_Status": status,
#             "Intelligence_Signals": {
#                 "Historical_Pattern_Match_Prob": f"{risk_prob:.2%}",
#                 "Outlier_Detection_Warning": "ACTIVE" if anomaly else "None"
#             },
#             "Commander_Note": "Yellow alerts indicate events that don't match standard internal conflict patterns and should be verified by ground sensors/scouts.",
#             "Input_Context": {"Year": int(year), "Sector": sector, "Month": month, "Conflict_Type": violence}
#         }

#     except Exception as e:
#         return {"Error": str(e)}

# ui = gr.Interface(
#     fn=hybrid_predict,
#     inputs=[
#         gr.Number(value=2026, label="Forecast Year"),
#         gr.Dropdown(choices=list(SECTORS.keys()), label="Sector Selection", value="Teknaf Border"),
#         gr.Dropdown(choices=list(MONTHS.keys()), label="Forecast Month", value="Jan"),
#         gr.Dropdown(choices=list(V_TYPES.keys()), label="Conflict Category", value="State-based Action")
#     ],
#     outputs=gr.JSON(label="Tactical Intelligence Report"),
#     title="Hybrid Early Warning System (HEWS)",
#     description="Dual-signal intelligence system combining Historical Probability (AdaBoost) and Anomaly Detection (SVM). Designed for cross-border conflict prediction."
# )

# # ui.launch(share=True, pwa=True)


# Pickling the Model

In [None]:
# with open('model.pkl', 'wb') as f:
#     pickle.dump(hybrid_system, f)
# print("Unified Hybrid System saved to 'model.pkl'. Ready for deployment.")