In [230]:
# Load Libraries

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.covariance import MinCovDet
from scipy.stats.mstats import mquantiles


In [231]:
# Load Data

GTD = pd.read_csv('C:/R Portfolio/Global_Terrorism_Prediction/globalterrorismdb_0522dist.csv', 
                     encoding = 'latin1',
                     low_memory = False)
GTD_1 = pd.read_csv('C:/R Portfolio/Global_Terrorism_Prediction/globalterrorismdb_2021Jan-June_1222dist.csv', 
                     encoding = 'latin1',
                     low_memory = False)
GTD_combined = pd.concat([GTD, GTD_1], ignore_index = True)
GTD_combined

Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
0,197000000001,1970,7,2,,0,,58,Dominican Republic,2,...,,,,,PGIS,0,0,0,0,
1,197000000002,1970,0,0,,0,,130,Mexico,1,...,,,,,PGIS,0,1,1,1,
2,197001000001,1970,1,0,,0,,160,Philippines,5,...,,,,,PGIS,-9,-9,1,1,
3,197001000002,1970,1,0,,0,,78,Greece,8,...,,,,,PGIS,-9,-9,1,1,
4,197001000003,1970,1,0,,0,,101,Japan,4,...,,,,,PGIS,-9,-9,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214661,202106300023,2021,6,30,,0,,4,Afghanistan,6,...,,"""Gunmen blow up power pylon in Parwan,"" Afghan...","""Provinces hit by blackout after power pylon d...",,START Primary Collection,-9,-9,0,-9,
214662,202106300029,2021,6,30,06/30/2021,0,,138,Myanmar,5,...,,"""Spring Revolution Daily News for 16-30 June 2...",,,START Primary Collection,-9,-9,0,-9,
214663,202106300030,2021,6,30,,1,08/07/2021,147,Nigeria,11,...,,"""Boko Haram Releases Abducted Catholic Priest ...","""Kidnapped Maiduguri Catholic Priest regains f...","""ISWAP-Boko Haram Abduct Catholic Priest In Bo...",START Primary Collection,0,0,0,0,
214664,202106300038,2021,6,30,,0,,45,Colombia,3,...,,"""Two dead and one wounded after clashes betwee...",,,START Primary Collection,0,0,0,0,


In [232]:
# Load Functions

execfile('C:/Python_Data_Sets/Functions 10_07_2023.py')

In [233]:
# Process Data

GTD_New = preprocess_data(GTD_combined)
GTD_New

Unnamed: 0,Year,Month,Day,Country,Region,Province,City,Attack,Target,Group,Weapon,Dead,Lethal
0,1970,7,2,OtherCountry,Central America & Caribbean,OtherProvince,OtherCity,Assassination,Private,OtherGroup,OtherWeapon,1.0,1
1,1970,0,0,OtherCountry,North America,OtherProvince,OtherCity,HostageKidnapAttack,OtherTarget,OtherGroup,OtherWeapon,0.0,0
2,1970,1,0,OtherCountry,Southeast Asia,OtherProvince,Unknown,Assassination,OtherTarget,OtherGroup,OtherWeapon,1.0,1
3,1970,1,0,OtherCountry,Western Europe,OtherProvince,OtherCity,BombAttack,OtherTarget,OtherGroup,Explosives,0.0,0
4,1970,1,0,OtherCountry,East Asia,OtherProvince,OtherCity,InfrastructureAttack,OtherTarget,OtherGroup,Incendiary,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
214661,2021,6,30,Afghanistan,South Asia,OtherProvince,OtherCity,BombAttack,OtherTarget,OtherGroup,Explosives,0.0,0
214662,2021,6,30,OtherCountry,Southeast Asia,OtherProvince,OtherCity,HostageKidnapAttack,OtherTarget,OtherGroup,OtherWeapon,1.0,1
214663,2021,6,30,OtherCountry,Sub-Saharan Africa,OtherProvince,Unknown,HostageKidnapAttack,OtherTarget,OtherGroup,Firearms,0.0,0
214664,2021,6,30,OtherCountry,South America,OtherProvince,Unknown,BombAttack,OtherTarget,OtherGroup,Explosives,0.0,0


In [234]:
# Region Data

# EE Region Data
EE_data = GTD_New[GTD_New['Region'] == 'Eastern Europe']
EE_data

Unnamed: 0,Year,Month,Day,Country,Region,Province,City,Attack,Target,Group,Weapon,Dead,Lethal
15,1970,1,10,OtherCountry,Eastern Europe,OtherProvince,OtherCity,BombAttack,GovtGen,OtherGroup,Explosives,0.0,0
34,1970,1,28,OtherCountry,Eastern Europe,OtherProvince,OtherCity,InfrastructureAttack,Police,OtherGroup,Incendiary,0.0,0
241,1970,4,28,OtherCountry,Eastern Europe,OtherProvince,OtherCity,InfrastructureAttack,GovtGen,OtherGroup,Incendiary,0.0,0
258,1970,5,5,OtherCountry,Eastern Europe,OtherProvince,OtherCity,InfrastructureAttack,OtherTarget,OtherGroup,Incendiary,0.0,0
275,1970,5,10,OtherCountry,Eastern Europe,OtherProvince,OtherCity,InfrastructureAttack,Business,OtherGroup,Incendiary,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
213322,2021,5,4,OtherCountry,Eastern Europe,OtherProvince,OtherCity,BombAttack,Business,OtherGroup,Explosives,0.0,0
213341,2021,5,5,OtherCountry,Eastern Europe,OtherProvince,OtherCity,BombAttack,Private,OtherGroup,Explosives,0.0,0
213775,2021,5,25,OtherCountry,Eastern Europe,OtherProvince,OtherCity,BombAttack,Private,OtherGroup,Explosives,0.0,0
213813,2021,5,27,OtherCountry,Eastern Europe,OtherProvince,OtherCity,ArmedAssaultAttack,Military,OtherGroup,Explosives,1.0,1


In [235]:
# Drop the specified columns
columns_to_drop = ['Month', 'Day', 'Region']
EE_data_2 = EE_data.drop(columns = columns_to_drop)
# Convert the 'Category' column to categorical type
EE_data_2['Country'] = EE_data_2['Country'].astype('category')
# Convert Data into Dummy format

EE_categorical_cols = ['Country', 'Province', 'City', 'Attack', 'Target', 'Group', 'Weapon']
EE_data_2 = pd.get_dummies(EE_data_2, columns = EE_categorical_cols)
# Remove "Attack_" and "Target_" from column names
EE_data_2.columns = EE_data_2.columns.str.replace('Attack_', '').str.replace('Target_', '').str.replace('Weapon_', '').str.replace('Country_', '').str.replace('Province_', '').str.replace('City_', '')
EE_data_2

Unnamed: 0,Year,Dead,Lethal,OtherCountry,OtherProvince,OtherCity,Unknown,ArmedAssaultAttack,Assassination,BombAttack,...,GovtGen,Military,OtherTarget,Police,Private,Group_OtherGroup,Explosives,Firearms,Incendiary,OtherWeapon
15,1970,0.0,0,1,1,1,0,0,0,1,...,1,0,0,0,0,1,1,0,0,0
34,1970,0.0,0,1,1,1,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
241,1970,0.0,0,1,1,1,0,0,0,0,...,1,0,0,0,0,1,0,0,1,0
258,1970,0.0,0,1,1,1,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
275,1970,0.0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213322,2021,0.0,0,1,1,1,0,0,0,1,...,0,0,0,0,0,1,1,0,0,0
213341,2021,0.0,0,1,1,1,0,0,0,1,...,0,0,0,0,1,1,1,0,0,0
213775,2021,0.0,0,1,1,1,0,0,0,1,...,0,0,0,0,1,1,1,0,0,0
213813,2021,1.0,1,1,1,1,0,1,0,0,...,0,1,0,0,0,1,1,0,0,0


In [236]:
EE_data_3 = EE_data_2.groupby(['Year'])[['Dead',
        'Lethal',
        'OtherCountry',
        'OtherProvince',
        'OtherCity',
        'Unknown',
        'ArmedAssaultAttack',
        'Assassination',
        'BombAttack',
        'HostageKidnapAttack',
        'InfrastructureAttack',
        'OtherAttack',
        'Business',
        'GovtGen',
        'Military',
        'OtherTarget',
        'Police',
        'Private',
        'Group_OtherGroup',
        'Explosives',
        'Firearms',
        'Incendiary',
        'OtherWeapon']].sum()
EE_data_3


Unnamed: 0_level_0,Dead,Lethal,OtherCountry,OtherProvince,OtherCity,Unknown,Unknown,ArmedAssaultAttack,Assassination,BombAttack,...,GovtGen,Military,OtherTarget,Police,Private,Group_OtherGroup,Explosives,Firearms,Incendiary,OtherWeapon
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970,0.0,0,12,12,12,0,0,0,0,8,...,3,0,3,3,1,12,8,0,4,0
1971,0.0,0,5,5,5,0,0,0,0,1,...,0,0,1,4,0,5,1,0,4,0
1972,26.0,1,1,1,0,1,0,0,0,1,...,0,0,1,0,0,1,1,0,0,0
1973,1.0,1,1,1,1,0,0,0,0,1,...,0,0,1,0,0,1,1,0,0,0
1974,1.0,1,2,2,2,0,0,0,0,1,...,1,0,1,0,0,2,1,0,0,1
1977,1.0,1,2,2,2,0,0,1,0,1,...,0,0,1,0,0,2,1,1,0,0
1978,4.0,1,2,2,2,0,0,0,1,1,...,0,0,2,0,0,2,1,1,0,0
1979,0.0,0,1,1,1,0,0,0,0,1,...,0,0,0,0,1,1,1,0,0,0
1980,0.0,0,1,1,1,0,0,0,0,1,...,0,0,1,0,0,1,1,0,0,0
1981,0.0,0,4,4,4,0,0,0,1,2,...,0,1,3,0,0,4,2,1,0,1


In [238]:
# Replace the sample data with your actual data
EE_data_3 = np.array(EE_data_3)

# Remove NaN values from the NumPy array
EE_data_3 = EE_data_3[~np.isnan(EE_data_3).any(axis=1)]
EE_data_3

array([[ 0.,  0., 12., ...,  0.,  4.,  0.],
       [ 0.,  0.,  5., ...,  0.,  4.,  0.],
       [26.,  1.,  1., ...,  0.,  0.,  0.],
       ...,
       [34., 17., 44., ...,  7.,  8., 10.],
       [15., 12., 60., ...,  4., 22.,  5.],
       [ 8.,  5., 16., ...,  0.,  5.,  3.]])

In [239]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.covariance import MinCovDet

def perform_outlier_analysis(data, remove_outliers=False):
    data = np.array(data)

    # Remove NaN values from the NumPy array
    data = data[~np.isnan(data).any(axis=1)]

    outliers_dist = []
    outliers_pca = []

    # Calculate robust Mahalanobis distance
    robust_cov = MinCovDet().fit(data)
    robust_dist = robust_cov.mahalanobis(data)
    quantile_value = np.percentile(robust_dist, 95)
    outliers_indices_dist = np.where(robust_dist > quantile_value)
    outliers_indices_dist = np.unique(outliers_indices_dist[0])
    outliers_dist = data[outliers_indices_dist]

    # Calculate robust PCA
    robust_pca = PCA()
    robust_pca.components_ = robust_cov.location_
    residuals = np.zeros_like(data)
    for i in range(data.shape[0]):
        residuals[i] = data[i] - np.dot(np.dot(data[i], robust_pca.components_.T), robust_pca.components_)
    residuals_pca = np.zeros_like(data)
    for i in range(data.shape[0]):
        residuals_pca[i] = residuals[i] - np.dot(np.dot(residuals[i], robust_pca.components_.T), robust_pca.components_)
    robust_cov_pca = MinCovDet().fit(residuals_pca)
    robust_pca.components_ = robust_cov_pca.location_
    residuals_pca = np.zeros_like(data)
    for i in range(data.shape[0]):
        residuals_pca[i] = residuals[i] - np.dot(np.dot(residuals[i], robust_pca.components_.T), robust_pca.components_)
    robust_dist_pca = robust_cov_pca.mahalanobis(residuals_pca)
    quantile_value_pca = np.percentile(robust_dist_pca, 95)
    outliers_indices_pca = np.where(robust_dist_pca > quantile_value_pca)
    outliers_indices_pca = np.unique(outliers_indices_pca[0])
    outliers_pca = data[outliers_indices_pca]

    # Combine outliers from both methods
    all_outliers = np.concatenate((outliers_dist, outliers_pca), axis=0)
    unique_outliers = np.unique(all_outliers, axis=0)

    if remove_outliers:
        data = np.array([row for row in data if row not in unique_outliers])

    return outliers_dist, outliers_pca, unique_outliers, data

In [240]:
# Replace the sample data with your actual data
EE_data_3 = np.array(EE_data_3)

# Perform outlier analysis
outliers_dist, outliers_pca, combined_outliers, data_without_outliers = perform_outlier_analysis(EE_data_3, remove_outliers=False)




In [241]:
# Print the outliers based on robust Mahalanobis distance
print("Outliers based on Mahalanobis distance:")
for outlier in outliers_dist:
    print(outlier)

Outliers based on Mahalanobis distance:
[115.  64. 248. 248. 234.  14.   3.  46.  31. 125.  33.   5.   5.  39.
  45.  12.  73.  34.  45. 248. 128.  77.   8.  35.]
[1468.  244.  963.  963.  922.   41.   49.  270.   15.  454.   94.   66.
   15.   46.   64.  495.  145.   64.  149.  963.  550.  249.   22.  142.]
[791. 159. 685. 685. 671.  14.  18. 158.  12. 456.  13.  26.   2.  39.
  22. 347. 105.  24. 148. 685. 566.  62.  25.  32.]


In [242]:
# Print the outliers based on robust PCA
print("Outliers based on PCA:")
for outlier in outliers_pca:
    print(outlier)

Outliers based on PCA:
[402. 107. 234. 234. 231.   3.   3.  61.  17. 142.   2.   6.   3.  16.
  49.  50.  35.  22.  62. 234. 147.  74.   4.   9.]
[1468.  244.  963.  963.  922.   41.   49.  270.   15.  454.   94.   66.
   15.   46.   64.  495.  145.   64.  149.  963.  550.  249.   22.  142.]
[791. 159. 685. 685. 671.  14.  18. 158.  12. 456.  13.  26.   2.  39.
  22. 347. 105.  24. 148. 685. 566.  62.  25.  32.]


In [243]:
# Print the combined list of outliers
print("Combined outliers:")
for outlier in combined_outliers:
    print(outlier)

Combined outliers:
[115.  64. 248. 248. 234.  14.   3.  46.  31. 125.  33.   5.   5.  39.
  45.  12.  73.  34.  45. 248. 128.  77.   8.  35.]
[402. 107. 234. 234. 231.   3.   3.  61.  17. 142.   2.   6.   3.  16.
  49.  50.  35.  22.  62. 234. 147.  74.   4.   9.]
[791. 159. 685. 685. 671.  14.  18. 158.  12. 456.  13.  26.   2.  39.
  22. 347. 105.  24. 148. 685. 566.  62.  25.  32.]
[1468.  244.  963.  963.  922.   41.   49.  270.   15.  454.   94.   66.
   15.   46.   64.  495.  145.   64.  149.  963.  550.  249.   22.  142.]
