<a href="https://colab.research.google.com/github/Maha3061/Global-Terrorism-Analysis/blob/main/gtdcleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive at /content/drive so the CSV files read and written
# below (under /content/drive/MyDrive) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Notebook display configuration.
# (To echo every expression of a cell instead of only the last one, IPython's
#  InteractiveShell.ast_node_interactivity can be set to "all"; left disabled.)

# Render at most 150 rows and 150 columns of a frame
pd.options.display.max_rows = 150
pd.options.display.max_columns = 150

# Default size applied to every matplotlib figure
mpl.rcParams.update({'figure.figsize': (14.6, 9.0)})

In [None]:
# Load the preprocessed CSV; its first column becomes the frame's index and
# only empty strings are added to the set of values parsed as missing.
gtd_df = pd.read_csv(
    '/content/drive/MyDrive/data/gtdpreprocess.csv',
    index_col=0,
    na_values=[''],
    low_memory=False,
)

In [None]:
# Per-column overview of the loaded frame: non-null counts and dtypes
gtd_df.info(
    verbose=True,
)

<class 'pandas.core.frame.DataFrame'>
Float64Index: 181691 entries, 197000000000.0 to 201712000000.0
Data columns (total 48 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   iyear             181691 non-null  int64  
 1   imonth            181691 non-null  int64  
 2   iday              181691 non-null  int64  
 3   country_txt       181691 non-null  object 
 4   region_txt        181691 non-null  object 
 5   provstate         181691 non-null  object 
 6   city              181691 non-null  object 
 7   latitude          177135 non-null  float64
 8   longitude         177134 non-null  float64
 9   specificity       181691 non-null  float64
 10  summary           181691 non-null  object 
 11  attacktype1_txt   181691 non-null  object 
 12  targtype1_txt     181691 non-null  object 
 13  targsubtype1_txt  181691 non-null  object 
 14  corp1             181691 non-null  object 
 15  target1           181691 non-null  object 
 1

In [None]:
# Replace two long labels with shorter equivalents (exact-match substitution;
# every other value, including missing ones, is left as it is).
gtd_df['weaptype1_txt'] = gtd_df['weaptype1_txt'].replace({
    'Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)':
        'Vehicle (non-explosives)',
})

gtd_df['attacktype1_txt'] = gtd_df['attacktype1_txt'].replace({
    'Hostage Taking (Barricade Incident)': 'Hostage Taking (Barricade)',
})

In [None]:
# Columns to be stored with the pandas 'category' dtype
cat_attrs = ['extended_txt', 'country_txt', 'region_txt', 'specificity', 'vicinity_txt',
             'crit1_txt', 'crit2_txt', 'crit3_txt', 'doubtterr_txt', 'multiple_txt',
             'success_txt', 'suicide_txt', 'attacktype1_txt', 'targtype1_txt',
             'targsubtype1_txt', 'natlty1_txt', 'guncertain1_txt', 'individual_txt',
             'claimed_txt', 'weaptype1_txt', 'weapsubtype1_txt', 'property_txt',
             'ishostkid_txt', 'INT_LOG_txt', 'INT_IDEO_txt','INT_MISC_txt', 'INT_ANY_txt']

# Convert all of them in one astype call driven by a column -> dtype mapping
gtd_df = gtd_df.astype(dict.fromkeys(cat_attrs, 'category'))

# Confirm the new dtypes
gtd_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Float64Index: 181691 entries, 197000000000.0 to 201712000000.0
Data columns (total 48 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   iyear             181691 non-null  int64   
 1   imonth            181691 non-null  int64   
 2   iday              181691 non-null  int64   
 3   country_txt       181691 non-null  category
 4   region_txt        181691 non-null  category
 5   provstate         181691 non-null  object  
 6   city              181691 non-null  object  
 7   latitude          177135 non-null  float64 
 8   longitude         177134 non-null  float64 
 9   specificity       181691 non-null  category
 10  summary           181691 non-null  object  
 11  attacktype1_txt   181691 non-null  category
 12  targtype1_txt     181691 non-null  category
 13  targsubtype1_txt  181691 non-null  category
 14  corp1             181691 non-null  object  
 15  target1           181691 non

In [None]:
# Summary statistics (deciles included) for the seven numeric attributes,
# computed only on rows where none of the seven is missing.
(
    gtd_df
    .loc[:, ['nperpcap', 'nkill', 'nkillus', 'nkillter', 'nwound', 'nwoundus', 'nwoundte']]
    .dropna()
    .describe(percentiles=[0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.0])
    .T
)

Unnamed: 0,count,mean,std,min,10%,20%,30%,40%,50%,60%,70%,80%,90%,100%,max
nperpcap,100627.0,0.121617,1.824019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,406.0,406.0
nkill,100627.0,1.956622,7.194329,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,4.0,670.0,670.0
nkillus,100627.0,0.010643,0.276509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.0,44.0
nkillter,100627.0,0.342224,2.673829,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,500.0,500.0
nwound,100627.0,3.315144,13.233305,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,4.0,7.0,1500.0,1500.0
nwoundus,100627.0,0.015195,0.650163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.0,151.0
nwoundte,100627.0,0.114572,1.561454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,200.0,200.0


In [None]:
# Function to impute either the median or mean
def fill_value(attr, threshold=3):
    """Pick the value used to impute missing entries of a numeric Series.

    The median is returned when the non-missing values contain at least one
    outlier, otherwise the mean is returned. A value counts as an outlier when
    its absolute distance from the mean exceeds ``threshold`` standard
    deviations. (Comparing the raw value against ``threshold * std`` would flag
    every observation of a column whose values are large but tightly grouped.)

    Parameters
    ----------
    attr : pandas.Series
        Numeric column, possibly containing missing values.
    threshold : float, default 3
        Number of standard deviations beyond which a value is an outlier.

    Returns
    -------
    float
        Median or mean of the non-missing values; NaN when there are none.
    """
    attr_clean = attr.dropna()

    # Nothing to base an estimate on: return NaN so that fillna() is a no-op
    if attr_clean.empty:
        return float('nan')

    attr_mean = attr_clean.mean()
    attr_std = attr_clean.std()

    # With a single observation std is NaN, the comparison is False everywhere
    # and the mean (that single value) is returned.
    has_outliers = ((attr_clean - attr_mean).abs() > threshold * attr_std).any()

    return attr_clean.median() if has_outliers else attr_mean

In [None]:
# Fill the gaps of every numeric attribute that has missing values, using the
# per-column value chosen by fill_value (computed before the column is filled).
for count_col in ['nperpcap', 'nkill', 'nkillus', 'nkillter',
                  'nwound', 'nwoundus', 'nwoundte']:
    gtd_df[count_col] = gtd_df[count_col].fillna(fill_value(gtd_df[count_col]))

In [None]:
# Same summary statistics as before imputation, now over every row
(
    gtd_df
    .loc[:, ['nperpcap', 'nkill', 'nkillus', 'nkillter', 'nwound', 'nwoundus', 'nwoundte']]
    .describe(percentiles=[0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.0])
    .T
)

Unnamed: 0,count,mean,std,min,10%,20%,30%,40%,50%,60%,70%,80%,90%,100%,max
nperpcap,181691.0,0.077505,1.621754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,406.0,406.0
nkill,181691.0,2.26686,11.227057,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,5.0,1570.0,1570.0
nkillus,181691.0,0.029671,4.564308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1360.0,1360.0
nkillter,181691.0,0.320825,3.346474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,500.0,500.0
nwound,181691.0,2.883296,34.309747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,6.0,8191.0,8191.0
nwoundus,181691.0,0.025076,2.453378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,751.0,751.0
nwoundte,181691.0,0.066382,1.172976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,200.0,200.0


In [None]:
# Re-check non-null counts and dtypes now that the numeric gaps are filled
gtd_df.info(
    verbose=True,
)

<class 'pandas.core.frame.DataFrame'>
Float64Index: 181691 entries, 197000000000.0 to 201712000000.0
Data columns (total 48 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   iyear             181691 non-null  int64   
 1   imonth            181691 non-null  int64   
 2   iday              181691 non-null  int64   
 3   country_txt       181691 non-null  category
 4   region_txt        181691 non-null  category
 5   provstate         181691 non-null  object  
 6   city              181691 non-null  object  
 7   latitude          177135 non-null  float64 
 8   longitude         177134 non-null  float64 
 9   specificity       181691 non-null  category
 10  summary           181691 non-null  object  
 11  attacktype1_txt   181691 non-null  category
 12  targtype1_txt     181691 non-null  category
 13  targsubtype1_txt  181691 non-null  category
 14  corp1             181691 non-null  object  
 15  target1           181691 non

In [None]:
# Select the observations whose latitude or longitude is missing
ll_df = gtd_df[gtd_df[['latitude', 'longitude']].isna().any(axis=1)]
print(ll_df.shape)

# Check how many of those observations have city set to Unknown
city_df = ll_df[(ll_df['city'] == "UNKNOWN")]
print(city_df['city'].value_counts())

# Remove only the observations missing latitude or longitude.
# A bare dropna() discards every row that has a NaN in *any* column; in the
# recorded run that shrank the frame from 181691 to 110698 rows although only
# about 4.5k rows lack coordinates. Restricting the check with `subset` keeps
# the rows whose coordinates are present.
gtd_clean = gtd_df.dropna(subset=['latitude', 'longitude']).copy()
gtd_clean.info(verbose=True)

(4556, 48)
UNKNOWN    2249
Name: city, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Float64Index: 110698 entries, 197001000000.0 to 201712000000.0
Data columns (total 48 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   iyear             110698 non-null  int64   
 1   imonth            110698 non-null  int64   
 2   iday              110698 non-null  int64   
 3   country_txt       110698 non-null  category
 4   region_txt        110698 non-null  category
 5   provstate         110698 non-null  object  
 6   city              110698 non-null  object  
 7   latitude          110698 non-null  float64 
 8   longitude         110698 non-null  float64 
 9   specificity       110698 non-null  category
 10  summary           110698 non-null  object  
 11  attacktype1_txt   110698 non-null  category
 12  targtype1_txt     110698 non-null  category
 13  targsubtype1_txt  110698 non-null  category
 14  corp1             110698 

In [None]:
# A day of 0 stands for "unknown" (297 rows according to the original note);
# substitute 1 so a valid calendar date can be built. Month 0 gets the same
# treatment.
gtd_clean['iday'] = gtd_clean['iday'].replace(0, 1)
gtd_clean['imonth'] = gtd_clean['imonth'].replace(0, 1)

# Join year, month and day into "Y-M-D" text, then parse it into datetimes
gtd_clean['incident_date'] = gtd_clean['iyear'].astype(str).str.cat(
    [gtd_clean['imonth'].astype(str), gtd_clean['iday'].astype(str)],
    sep='-',
)
gtd_clean['incident_date'] = pd.to_datetime(gtd_clean['incident_date'], format="%Y-%m-%d")

gtd_clean.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Float64Index: 110698 entries, 197001000000.0 to 201712000000.0
Data columns (total 49 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   iyear             110698 non-null  int64         
 1   imonth            110698 non-null  int64         
 2   iday              110698 non-null  int64         
 3   country_txt       110698 non-null  category      
 4   region_txt        110698 non-null  category      
 5   provstate         110698 non-null  object        
 6   city              110698 non-null  object        
 7   latitude          110698 non-null  float64       
 8   longitude         110698 non-null  float64       
 9   specificity       110698 non-null  category      
 10  summary           110698 non-null  object        
 11  attacktype1_txt   110698 non-null  category      
 12  targtype1_txt     110698 non-null  category      
 13  targsubtype1_txt  110698 non-null  c

In [None]:
# Persist the cleaned frame (index included) as a comma-separated file
gtd_clean.to_csv(
    "/content/drive/MyDrive/data/gtdeda.csv",
    sep=",",
)