<a href="https://colab.research.google.com/github/Maha3061/Global-Terrorism-Analysis/blob/main/Global_Terrorism_Data_cleansing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Notebook display configuration.
# Optional (disabled): switch IPython's node interactivity to "all".
#from IPython.core.interactiveshell import InteractiveShell
#InteractiveShell.ast_node_interactivity = "all"

# Let pandas render up to 150 rows and 150 columns before truncating
pd.options.display.max_rows = 150
pd.options.display.max_columns = 150

In [None]:
# Load the preprocessed GTD extract.
# - the first CSV column becomes the index
# - only empty strings are added as extra NaN markers
# - low_memory=False reads the file in one pass rather than in chunks
gtd_df = pd.read_csv(
    '/content/drive/MyDrive/data/gtd_preprocessed_95t016.csv',
    index_col=0,
    na_values=[''],
    low_memory=False,
)

In [None]:
# Print the full per-column summary (name and dtype) of the loaded frame
gtd_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 0 to 5108
Data columns (total 126 columns):
 #    Column              Dtype  
---   ------              -----  
 0    eventid             float64
 1    iyear               int64  
 2    imonth              int64  
 3    iday                int64  
 4    approxdate          object 
 5    resolution          object 
 6    country_txt         object 
 7    region_txt          object 
 8    provstate           object 
 9    city                object 
 10   latitude            float64
 11   longitude           float64
 12   specificity         int64  
 13   location            object 
 14   summary             object 
 15   alternative         float64
 16   alternative_txt     object 
 17   attacktype1_txt     object 
 18   attacktype2         float64
 19   attacktype2_txt     object 
 20   attacktype3         float64
 21   attacktype3_txt     object 
 22   targtype1_txt       object 
 23   targsubtype1_txt    object 
 24   co

In [None]:
# Replace two long category labels with shorter equivalents.
# Each entry is (column, label as found in the data, replacement label).
label_fixes = [
    ('weaptype1_txt',
     'Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)',
     'Vehicle (non-explosives)'),
    ('attacktype1_txt',
     'Hostage Taking (Barricade Incident)',
     'Hostage Taking (Barricade)'),
]

for column, long_label, short_label in label_fixes:
    # Only rows carrying the long label are rewritten; everything else is untouched
    gtd_df.loc[gtd_df[column] == long_label, column] = short_label

In [None]:
# Attributes to be stored with the pandas 'category' dtype
cat_attrs = [
    'extended_txt', 'country_txt', 'region_txt', 'specificity',
    'vicinity_txt', 'crit1_txt', 'crit2_txt', 'crit3_txt',
    'doubtterr_txt', 'multiple_txt', 'success_txt', 'suicide_txt',
    'attacktype1_txt', 'targtype1_txt', 'targsubtype1_txt', 'natlty1_txt',
    'guncertain1_txt', 'individual_txt', 'claimed_txt', 'weaptype1_txt',
    'weapsubtype1_txt', 'property_txt', 'ishostkid_txt', 'INT_LOG_txt',
    'INT_IDEO_txt', 'INT_MISC_txt', 'INT_ANY_txt',
]

# Convert one column at a time, overwriting each column in place
for column_name in cat_attrs:
    as_category = gtd_df[column_name].astype('category')
    gtd_df[column_name] = as_category

# Confirm the new dtypes in the full column listing
gtd_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 0 to 5108
Data columns (total 126 columns):
 #    Column              Dtype   
---   ------              -----   
 0    eventid             float64 
 1    iyear               int64   
 2    imonth              int64   
 3    iday                int64   
 4    approxdate          object  
 5    resolution          object  
 6    country_txt         category
 7    region_txt          category
 8    provstate           object  
 9    city                object  
 10   latitude            float64 
 11   longitude           float64 
 12   specificity         category
 13   location            object  
 14   summary             object  
 15   alternative         float64 
 16   alternative_txt     object  
 17   attacktype1_txt     category
 18   attacktype2         float64 
 19   attacktype2_txt     object  
 20   attacktype3         float64 
 21   attacktype3_txt     object  
 22   targtype1_txt       category
 23   targsubtype

In [None]:
# Summary statistics for the numeric attributes, restricted to the rows
# where all seven of them are present (dropna on the column subset).
numeric_attrs = ['nperpcap', 'nkill', 'nkillus', 'nkillter',
                 'nwound', 'nwoundus', 'nwoundte']
decile_points = [0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.0]

complete_rows = gtd_df[numeric_attrs].dropna()
# One row per attribute, one column per statistic
complete_rows.describe(percentiles=decile_points).transpose()

Unnamed: 0,count,mean,std,min,10%,20%,30%,40%,50%,60%,70%,80%,90%,100%,max
nperpcap,110.0,1.618182,1.420569,0.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,4.0,6.0,6.0
nkill,110.0,0.454545,1.037254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,8.0,8.0
nkillus,110.0,0.372727,0.946835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,8.0,8.0
nkillter,110.0,0.063636,0.280147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0
nwound,110.0,1.409091,6.739307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.1,56.0,56.0
nwoundus,110.0,0.9,4.223026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,36.0,36.0
nwoundte,110.0,0.036364,0.18805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [None]:
# Function to pick the imputation value (median or mean) for a numeric attribute
def fill_value(attr, threshold=3):
    """Return the value to use when imputing missing entries of ``attr``.

    The non-missing values are checked for outliers, defined as values lying
    more than ``threshold`` standard deviations away from the mean (in either
    direction). If any outlier exists the median is returned, otherwise the
    mean is returned.

    Parameters
    ----------
    attr : pandas.Series
        Numeric series, possibly containing NaN.
    threshold : float, default 3
        Number of standard deviations from the mean beyond which a value
        counts as an outlier.

    Returns
    -------
    float
        Median or mean of the non-missing values. NaN when ``attr`` has no
        non-missing values.
    """
    attr_clean = attr.dropna()
    attr_mean = attr_clean.mean()
    attr_std = attr_clean.std()

    # Measure distance from the mean; comparing raw values against
    # threshold * std would flag every value of a column centred far from
    # zero and would never detect low-side outliers.
    distance = (attr_clean - attr_mean).abs()

    # With fewer than two values std is NaN, the comparison is False
    # everywhere, and the mean is used.
    has_outliers = bool((distance > threshold * attr_std).any())

    if has_outliers:
        return attr_clean.median()
    return attr_mean

In [None]:
# Fill the missing entries of each numeric attribute with the value that
# fill_value() selects for that attribute.
for numeric_attr in ['nperpcap', 'nkill', 'nkillus', 'nkillter',
                     'nwound', 'nwoundus', 'nwoundte']:
    replacement = fill_value(gtd_df[numeric_attr])
    gtd_df[numeric_attr] = gtd_df[numeric_attr].fillna(replacement)

In [None]:
# Summary statistics for the same numeric attributes after imputation
# (no dropna here: every row is included).
numeric_attrs = ['nperpcap', 'nkill', 'nkillus', 'nkillter',
                 'nwound', 'nwoundus', 'nwoundte']
decile_points = [0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.0]

# One row per attribute, one column per statistic
gtd_df[numeric_attrs].describe(percentiles=decile_points).transpose()

Unnamed: 0,count,mean,std,min,10%,20%,30%,40%,50%,60%,70%,80%,90%,100%,max
nperpcap,5109.0,1.015071,0.244693,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7.0,7.0
nkill,5109.0,0.646506,3.029339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,92.0,92.0
nkillus,5109.0,0.034645,0.509285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,30.0
nkillter,5109.0,0.042083,1.231754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,85.0,85.0
nwound,5109.0,0.717753,7.155573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,285.0,285.0
nwoundus,5109.0,0.079076,1.183948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0
nwoundte,5109.0,0.003915,0.094816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0


In [None]:
# Print the full per-column summary of the frame at this stage
gtd_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 0 to 5108
Data columns (total 126 columns):
 #    Column              Dtype   
---   ------              -----   
 0    eventid             float64 
 1    iyear               int64   
 2    imonth              int64   
 3    iday                int64   
 4    approxdate          object  
 5    resolution          object  
 6    country_txt         category
 7    region_txt          category
 8    provstate           object  
 9    city                object  
 10   latitude            float64 
 11   longitude           float64 
 12   specificity         category
 13   location            object  
 14   summary             object  
 15   alternative         float64 
 16   alternative_txt     object  
 17   attacktype1_txt     category
 18   attacktype2         float64 
 19   attacktype2_txt     object  
 20   attacktype3         float64 
 21   attacktype3_txt     object  
 22   targtype1_txt       category
 23   targsubtype

In [None]:
# Select the observations whose latitude is missing
ll_df = gtd_df[gtd_df['latitude'].isna()]
print(ll_df.shape)

# Check how many of those observations have city set to UNKNOWN
city_df = ll_df[(ll_df['city'] == "UNKNOWN")]
print(city_df['city'].value_counts())

# Remove only the observations missing latitude or longitude.
# A bare dropna() drops a row when ANY of its columns is NaN, which removed
# every row of this frame; `subset` limits the check to the two coordinate
# columns.
gtd_clean = gtd_df.dropna(subset=['latitude', 'longitude']).copy()
gtd_clean.info(verbose = True)

(90, 126)
UNKNOWN    80
Name: city, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 126 columns):
 #    Column              Dtype   
---   ------              -----   
 0    eventid             float64 
 1    iyear               int64   
 2    imonth              int64   
 3    iday                int64   
 4    approxdate          object  
 5    resolution          object  
 6    country_txt         category
 7    region_txt          category
 8    provstate           object  
 9    city                object  
 10   latitude            float64 
 11   longitude           float64 
 12   specificity         category
 13   location            object  
 14   summary             object  
 15   alternative         float64 
 16   alternative_txt     object  
 17   attacktype1_txt     category
 18   attacktype2         float64 
 19   attacktype2_txt     object  
 20   attacktype3         float64 
 21   attacktype3_txt     object  
 22   targtype1_t

In [None]:
# iday uses 0 for an unknown day (297 rows per the original note);
# substitute day 1 so the date below can be parsed.
unknown_day = gtd_clean['iday'] == 0
gtd_clean.loc[unknown_day, 'iday'] = 1

# NOTE(review): imonth is not adjusted here; if it can also hold 0 the
# "%Y-%m-%d" parse below would fail — confirm against the data.
year_txt = gtd_clean['iyear'].astype(str)
month_txt = gtd_clean['imonth'].astype(str)
day_txt = gtd_clean['iday'].astype(str)

# Build "year-month-day" text first, then convert the column to datetime
gtd_clean['incident_date'] = year_txt + '-' + month_txt + '-' + day_txt
gtd_clean['incident_date'] = pd.to_datetime(
    gtd_clean['incident_date'],
    format="%Y-%m-%d",
)

# Confirm the new column appears in the full column listing
gtd_clean.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 127 columns):
 #    Column              Dtype         
---   ------              -----         
 0    eventid             float64       
 1    iyear               int64         
 2    imonth              int64         
 3    iday                int64         
 4    approxdate          object        
 5    resolution          object        
 6    country_txt         category      
 7    region_txt          category      
 8    provstate           object        
 9    city                object        
 10   latitude            float64       
 11   longitude           float64       
 12   specificity         category      
 13   location            object        
 14   summary             object        
 15   alternative         float64       
 16   alternative_txt     object        
 17   attacktype1_txt     category      
 18   attacktype2         float64       
 19   attacktype2_txt     object        
 20 

In [None]:
# Write the cleaned frame to a comma-separated file at the path below
gtd_clean.to_csv(
    path_or_buf="/content/drive/MyDrive/data/gtd_clean.csv",
    sep=",",
)