In [3]:
import pandas as pd
import numpy as np
import requests
import json

In [4]:
# data fetching 

class APIFetcher:
    def __init__(self):
        self.api_keys = {
            'cyber': 'b7aceffb1657bfac01d8cc7563d5e91c91ee239bfe2c99e30ab1e0030ee9e4b4'
        }

        self.urls = {
            'conflict': 'https://services8.arcgis.com/xu983xJB6fIDCjpX/arcgis/rest/services/ACLED/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson',
            'cyber': 'https://otx.alienvault.com/api/v1/pulses/subscribed'
        }

    def fetch(self, name):
        headers = {}
        if name == 'cyber':
            headers['X-OTX-API-KEY'] = self.api_keys['cyber']

        try:
            response = requests.get(self.urls[name], headers=headers, timeout=10)
            response.raise_for_status()
            print(f"[INFO] Successfully fetched {name} data")
            return response.json()
        except requests.RequestException as e:
            print(f"[ERROR] Failed to fetch '{name}': {e}")
            return None

# Example Usage
fetcher = APIFetcher()
conflict_data = fetcher.fetch('conflict')
cyber_data = fetcher.fetch('cyber')


[INFO] Successfully fetched conflict data
[INFO] Successfully fetched cyber data


In [5]:
print("conflict data -> ",conflict_data.keys())
print("cyber data ->",cyber_data.keys())

conflict data ->  dict_keys(['type', 'properties', 'features'])
cyber data -> dict_keys(['results', 'count', 'prefetch_pulse_ids', 't', 't2', 't3', 'previous', 'next'])


DATA GATHERING

In [6]:
# parsing conflict data for merging

conflict_events = []

for event in conflict_data['features']:
    props = event['properties']
    conflict_events.append({
        'Country':props.get('country'),
        'Protests':props.get('protests'),
        'Riots': props.get('riots'),
        'Admin': props.get('admin1')
    })


import pandas as pd
conflict_df = pd.DataFrame(conflict_events)    

In [7]:
normalized_cyber = []

for event in cyber_data.get('results', []):
    for indicator in event.get('indicators', []):
        normalized_cyber.append({
            'source': 'cyber',
            'country': 'Global',  # Could infer from title/description later
            'region': 'Cyber Domain',
            'date': event.get('created'),
            'threat_type': indicator.get('type'),
            'value': indicator.get('indicator')
        })

cyber_df = pd.DataFrame(normalized_cyber)


In [8]:
cyber_df

Unnamed: 0,source,country,region,date,threat_type,value
0,cyber,Global,Cyber Domain,2025-05-07T18:05:20.713000,domain,cloudmediaportal.com
1,cyber,Global,Cyber Domain,2025-05-07T18:05:20.713000,domain,njala.dev
2,cyber,Global,Cyber Domain,2025-05-07T18:05:20.713000,FileHash-MD5,09b740bb082b465fcc9f8a7766984317
3,cyber,Global,Cyber Domain,2025-05-07T18:05:20.713000,FileHash-MD5,15ecd6b5a2df7ccabbab3cd3b42c443d
4,cyber,Global,Cyber Domain,2025-05-07T18:05:20.713000,FileHash-MD5,688c01c49525df877d4bc28aa534d31d
...,...,...,...,...,...,...
325,cyber,Global,Cyber Domain,2025-06-05T16:53:44.054000,domain,syarousi-search.com
326,cyber,Global,Cyber Domain,2025-06-05T16:53:44.054000,domain,webrelayapi.online
327,cyber,Global,Cyber Domain,2025-06-05T16:53:44.054000,hostname,enota.clientepj.com
328,cyber,Global,Cyber Domain,2025-06-05T16:53:44.054000,hostname,mesh.computadorpj.com


Integrating with scikit learn 

Threats Prediction

In [23]:
# predicting with cyber indicators -- predicting threats 

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# encode categorical values
encoders = {}

for col in ['country', 'region', 'value', 'threat_type']:
    le = LabelEncoder()
    cyber_df[col] = le.fit_transform(cyber_df[col].astype(str))
    encoders[col] = le
    
X = cyber_df[['country','region','value']]
y = cyber_df['threat_type']

# spliting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# classifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

# checks marks
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# 7. Predict new cyber event
new_event = pd.DataFrame([{
    'country': 'India',
    'region': 'Asia',
    'value': 'a1b2c3'
}])


def encode_with_fallback(encoder,value):
    if value not in encoder.classes_:
        encoder.classes_ = np.append(encoder.classes_, value)
        return encoder.transform([value])[0]

# 8. Encode new event using stored encoders
for col in ['country', 'region', 'value']:
    new_event[col] = new_event[col].apply(lambda val: encode_with_fallback(encoders[col], val))

# 9. Predict threat type
predicted = model.predict(new_event)
predicted_label = encoders['threat_type'].inverse_transform(predicted)

print("Predicted Threat Type:", predicted_label[0])

Accuracy: 0.4393939393939394
Predicted Threat Type: 2


In [None]:
print(cyber_df['threat_type'].value_counts())


In [10]:
conflict_df

Unnamed: 0,Country,Protests,Riots,Admin
0,Iraq,0,0,Duhok
1,Colombia,24,8,"Bogota, D.C."
2,Iraq,0,0,Duhok
3,Colombia,0,0,"Bogota, D.C."
4,Iraq,0,0,Duhok
...,...,...,...,...
995,Bolivia,33,7,Cochabamba
996,Bolivia,29,2,Cochabamba
997,Bolivia,18,3,Cochabamba
998,Bolivia,14,3,Cochabamba


Predicting riots in [conflict_df] 

In [22]:
# Riots prediction by conflict_df

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# preprocess
conflict_df.columns = conflict_df.columns.str.strip().str.lower()

if 'India' not in conflict_df['country'].values:
    conflict_df = pd.concat([
        conflict_df,
        pd.DataFrame([{'country': 'India', 'protests': 0, 'riots': 0}])
    ], ignore_index=True)


le = LabelEncoder()
conflict_df['country_ncoded'] = le.fit_transform(conflict_df['country'])

# feature, target
X = conflict_df[['country_ncoded', 'protests']]
y = conflict_df['riots']

# split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train
model = RandomForestClassifier()
model.fit(X_train, y_train)

# evaluate
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

# new data
new_data = pd.DataFrame([{
    'Country': 'India',
    'protests': 34
}])

# fix casing here 👇
new_data['country_ncoded'] = le.transform(new_data['Country'])
new_data = new_data[['country_ncoded', 'protests']]

# predict
final = model.predict(new_data)
print('Riots:', final[0])


Accuracy: 0.5920398009950248
Riots: 5


In [24]:
print(conflict_df['country'].shape)
print(type(conflict_df['country']))


(1001,)
<class 'pandas.core.series.Series'>


In [29]:
import plotly.express as px

# For threats
fig = px.histogram(cyber_df, x='threat_type', title='Distribution of Threat Types')
fig.show()


In [31]:
fig = px.bar(cyber_df, x='country', y='protests', title='Protests by Country')
fig.show()


ValueError: Value of 'y' is not the name of a column in 'data_frame'. Expected one of ['source', 'country', 'region', 'date', 'threat_type', 'value'] but received: protests