<a href="https://colab.research.google.com/github/Maha3061/Global-Terrorism-Analysis/blob/main/gtdpreprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook live under /content/drive/MyDrive. Only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Notebook output configuration
from IPython.core.interactiveshell import InteractiveShell

# "all" asks IPython to display the value of every top-level expression in a
# cell instead of only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Let pandas print up to 150 rows and 150 columns before truncating.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 150)

In [None]:
# Load the raw GTD export from the mounted Drive. The first CSV column becomes
# the index, the file is decoded as latin1, and empty strings are read as NaN.
gtd_df = pd.read_csv(
    '/content/drive/MyDrive/data/globalterrorism.csv',
    index_col=0,
    encoding='latin1',
    na_values=[''],
    low_memory=False,
)


In [None]:
# Full per-column listing of the raw frame (verbose output is not truncated).
gtd_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Float64Index: 181691 entries, 197000000000.0 to 201712000000.0
Data columns (total 134 columns):
 #    Column              Dtype  
---   ------              -----  
 0    iyear               int64  
 1    imonth              int64  
 2    iday                int64  
 3    approxdate          object 
 4    extended            int64  
 5    resolution          object 
 6    country             int64  
 7    country_txt         object 
 8    region              int64  
 9    region_txt          object 
 10   provstate           object 
 11   city                object 
 12   latitude            float64
 13   longitude           float64
 14   specificity         float64
 15   vicinity            int64  
 16   location            object 
 17   summary             object 
 18   crit1               int64  
 19   crit2               int64  
 20   crit3               int64  
 21   doubtterr           float64
 22   alternative         float64
 23   alternati

In [None]:
# Check the number of missing values in each attribute
count = gtd_df.isnull().sum()
percent = round(count / 181692 * 100, 2)
series = [count, percent]
result = pd.concat(series, axis=1, keys=['Count','Percent'])
result.sort_values(by='Count', ascending=False)

Unnamed: 0,Count,Percent
gsubname3,181671,99.99
weapsubtype4_txt,181621,99.96
weapsubtype4,181621,99.96
weaptype4_txt,181618,99.96
weaptype4,181618,99.96
claimmode3,181558,99.93
claimmode3_txt,181558,99.93
gsubname2,181531,99.91
claim3,181373,99.82
guncertain3,181371,99.82


In [None]:
# Keep only the attributes with less than 40% of their values missing.
mostly_populated = result['Percent'].lt(40.0)
target_attrs = result.loc[mostly_populated]

# Attribute names (the index of the summary table) as an array.
keep_attrs = target_attrs.index.values



In [None]:
# Drop 'nperps' from the retained attributes, then display the list as it
# stands at this point (the coded duplicates below are still present).
keep_attrs = keep_attrs[~np.isin(keep_attrs, ['nperps'])]
keep_attrs

# Remove attributes that duplicate another attribute: each numeric code listed
# here has a '<name>_txt' counterpart that is kept instead.
duplicated_codes = [
    'country',
    'region',
    'attacktype1',
    'targtype1',
    'targsubtype1',
    'natlty1',
    'weaptype1',
    'weapsubtype1',
]
keep_attrs = keep_attrs[~np.isin(keep_attrs, duplicated_codes)]

array(['iyear', 'imonth', 'iday', 'extended', 'country', 'country_txt',
       'region', 'region_txt', 'provstate', 'city', 'latitude',
       'longitude', 'specificity', 'vicinity', 'summary', 'crit1',
       'crit2', 'crit3', 'doubtterr', 'multiple', 'success', 'suicide',
       'attacktype1', 'attacktype1_txt', 'targtype1', 'targtype1_txt',
       'targsubtype1', 'targsubtype1_txt', 'corp1', 'target1', 'natlty1',
       'natlty1_txt', 'gname', 'guncertain1', 'individual', 'nperpcap',
       'claimed', 'weaptype1', 'weaptype1_txt', 'weapsubtype1',
       'weapsubtype1_txt', 'weapdetail', 'nkill', 'nkillus', 'nkillter',
       'nwound', 'nwoundus', 'nwoundte', 'property', 'ishostkid',
       'scite1', 'dbsource', 'INT_LOG', 'INT_IDEO', 'INT_MISC', 'INT_ANY'],
      dtype=object)

In [None]:
# Restrict the raw frame to the retained attributes. The explicit .copy() makes
# subset_df an independent frame, so the in-place clean-up applied to it in the
# following cells neither touches gtd_df nor triggers SettingWithCopyWarning.
subset_df = gtd_df.loc[:, keep_attrs].copy()
subset_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Float64Index: 181691 entries, 197000000000.0 to 201712000000.0
Data columns (total 48 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   iyear             181691 non-null  int64  
 1   imonth            181691 non-null  int64  
 2   iday              181691 non-null  int64  
 3   extended          181691 non-null  int64  
 4   country_txt       181691 non-null  object 
 5   region_txt        181691 non-null  object 
 6   provstate         181270 non-null  object 
 7   city              181257 non-null  object 
 8   latitude          177135 non-null  float64
 9   longitude         177134 non-null  float64
 10  specificity       181685 non-null  float64
 11  vicinity          181691 non-null  int64  
 12  summary           115562 non-null  object 
 13  crit1             181691 non-null  int64  
 14  crit2             181691 non-null  int64  
 15  crit3             181691 non-null  int64  
 1

In [None]:
# Categorical Variables
# ---------------------
# Missing values
# --------------
# All gaps are filled with one DataFrame-level call. Calling
# `subset_df[col].fillna(..., inplace=True)` operates on a temporary column
# object: pandas warns about that pattern, and with Copy-on-Write enabled it
# leaves subset_df unchanged. Filling on the frame itself avoids both problems.
fill_values = {
    # Categorical variables: -1 (or 'UNKNOWN' for labels) marks a missing value
    'specificity': -1,
    'targsubtype1_txt': 'UNKNOWN',
    'natlty1_txt': 'UNKNOWN',
    'guncertain1': -1,
    'claimed': -1,
    'weapsubtype1_txt': 'UNKNOWN',
    'ishostkid': -1,
    # Text variables
    'provstate': 'UNKNOWN',
    'city': 'UNKNOWN',
    'summary': 'UNKNOWN',
    'corp1': 'UNKNOWN',
    'target1': 'UNKNOWN',
    'scite1': 'UNKNOWN',
}
subset_df = subset_df.fillna(value=fill_values)


# Categorical Variables
# ---------------------
# Recode -9 to -1 so each of these flags uses the same single code (-1) that
# the fills above use for missing values.
# NOTE(review): -9 is presumably the dataset's "unknown" code -- confirm
# against the GTD codebook.
recode_attrs = ['vicinity', 'doubtterr', 'claimed', 'property', 'ishostkid',
                'INT_LOG', 'INT_IDEO', 'INT_MISC', 'INT_ANY']
subset_df[recode_attrs] = subset_df[recode_attrs].replace(-9, -1)


# Numeric Variables
# -----------------
# For a count, the sentinel values -9 and -99 are turned into NaN instead of
# being kept as numbers.
subset_df['nperpcap'] = subset_df['nperpcap'].replace([-9, -99], np.nan)


# Text Variables
# --------------
# A city literally recorded as 'Unknown' gets the same placeholder as a
# missing city.
subset_df['city'] = subset_df['city'].replace('Unknown', 'UNKNOWN')


In [None]:
# Map the codes to labels
ynu_map = {1: 'YES', 0: 'NO', -1: 'UKNOWN'}

# List of target attributes to map
ynu_attrs =['extended', 'vicinity', 'crit1', 'crit2', 'crit3', 'doubtterr', 'multiple',
            'success', 'suicide', 'guncertain1', 'individual', 'claimed', 'property',
            'ishostkid', 'INT_LOG', 'INT_IDEO', 'INT_MISC', 'INT_ANY']

# Iterate over each target attribute and map it
for att in ynu_attrs:
    att_txt = att + '_txt'
    subset_df[att_txt] = subset_df[att].map(ynu_map)

# Get the list of attributes, dropping the coded for labeled attributes
final_attrs = []

for attr in subset_df.columns.values:
    if attr not in ynu_attrs:
        final_attrs.append(attr)

subset_df2 = subset_df.loc[:, final_attrs]
subset_df2.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
Float64Index: 181691 entries, 197000000000.0 to 201712000000.0
Data columns (total 48 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   iyear             181691 non-null  int64  
 1   imonth            181691 non-null  int64  
 2   iday              181691 non-null  int64  
 3   country_txt       181691 non-null  object 
 4   region_txt        181691 non-null  object 
 5   provstate         181691 non-null  object 
 6   city              181691 non-null  object 
 7   latitude          177135 non-null  float64
 8   longitude         177134 non-null  float64
 9   specificity       181691 non-null  float64
 10  summary           181691 non-null  object 
 11  attacktype1_txt   181691 non-null  object 
 12  targtype1_txt     181691 non-null  object 
 13  targsubtype1_txt  181691 non-null  object 
 14  corp1             181691 non-null  object 
 15  target1           181691 non-null  object 
 1

In [None]:
# Persist the preprocessed frame (index included) as a comma-separated file on Drive.
subset_df2.to_csv(
    "/content/drive/MyDrive/data/gtdpreprocess.csv",
    sep=",",
)