# **Global Terrorism Database (GTD) - Exploratory Data Analysis**

In [1]:
%config Completer.use_jedi = False

In [2]:
### importing required libraries 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from geopy.geocoders import Nominatim
import langid
import multiprocessing
from multiprocessing import Pool

In [4]:
### Plot appearance: seaborn dark grid first, then the matplotlib overrides
### (11pt font and a fully transparent figure background).
sns.set_style('darkgrid')
plt.rcParams.update({
    'font.size': 11,
    'figure.facecolor': '#00000000',
})

In [6]:
#multiprocessing.cpu_count()

In [5]:
import warnings
warnings.filterwarnings("ignore")

##   **1. Data Preparation and Cleaning** 
a.   Loading the dataset into pandas dataframe <br>
b.   Checking the info. about data and columns <br>
c.   Feature selection <br>
d.   Fixing the missing & incorrect values.<br>


In [None]:
# ### mounting google drive
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
### reading the csv file into colab 
#GTA_data  = pd.read_csv('/content/drive/MyDrive/AlmaBetter/Team Capstone Projects/1. EDA/Global Terrorism Data.csv',encoding = "ISO-8859-1")

In [None]:
GTA_data  = pd.read_csv('/kaggle/input/gtd/globalterrorismdb_0718dist.csv',encoding = "ISO-8859-1")

In [None]:
### creating a copy of orginal data 
data = GTA_data.copy()

In [None]:
data.head()

#### **Checking the info. about data and columns**

In [None]:
data.shape

In [None]:
data.info()

In [None]:
list(data.columns)

In [None]:
""" Since the data consists of 135 feature columns I am checking the amount of missing/null values for each features which consists of 
                    more than  30% of missing values.  """
data.isna().sum()

In [None]:
### Share of missing values in each column, expressed as a percentage,
### shown from the sparsest column down.
Missing_values_percent = data.isna().sum().div(data.shape[0]).mul(100)
Missing_values_percent.sort_values(ascending=False)

In [None]:
### Count the columns whose share of missing values is strictly above 30%.
### The strict comparison matches the wording of the message and keeps this
### set disjoint from the "<= 30" selection made right after it.
high_missing_cols = Missing_values_percent[Missing_values_percent > 30]
print(f'The number of columns which have more than 30% of null values are:{len(high_missing_cols)}')

In [None]:
#### Names of the columns in which at most 30% of the values are null
Missing_values_percent.loc[Missing_values_percent.le(30)].index

In [None]:
#### Optional step (currently commented out): drop every column that consists of more than 30% null values.
# perc = 30
# thresh_count =  int(((100-perc)/100)*data.shape[0] + 1)
# data.dropna(axis=1, thresh=thresh_count,inplace = True)

In [None]:
# data.columns

#### **Feature Selection**

In [None]:
### Selecting only the features required for further analysis
keep_cols = ['eventid', 'iyear', 'imonth', 'iday', 'extended','country_txt','region_txt','city', 'latitude',
       'longitude','success','attacktype1_txt', 'targtype1_txt','corp1', 'target1','natlty1_txt','targsubtype1_txt',
       'gname','weaptype1_txt','nkill','nwound','property']

### Creating a new dataframe with the selected features.
### The explicit .copy() makes data_new an independent frame, so the in-place
### renames, fills and replacements applied to it later unambiguously modify
### data_new itself instead of a selection taken from `data`.
data_new = data[keep_cols].copy()

In [None]:
### Giving the selected columns more readable headers (old name -> new name)
column_renames = {
    'iyear': 'year',
    'imonth': 'month',
    'iday': 'day',
    'country_txt': 'country_name',
    'region_txt': 'region',
    'attacktype1_txt': 'attack_type',
    'targtype1_txt': 'victim_cata',
    'target1': 'specific_victim',
    'targsubtype1_txt': 'victim_subtype_cata',
    'corp1': 'entity_name',
    'natlty1_txt': 'victim_nationaliy',
    'gname': 'perpetrator_group',
    'weaptype1_txt': 'weapon',
    'nkill': 'num_fatalities',
    'nwound': 'num_injured',
}
data_new.rename(columns=column_renames, inplace=True)

In [None]:
data_new.head()

#### **Missing Values**

In [None]:
### Percentage of missing values per selected column, largest first
missing_values = data_new.isna().sum().div(data_new.shape[0]).mul(100).sort_values(ascending=False)
missing_values

In [None]:
### Casualty counts: carry the previous row's value forward into each gap.
### DataFrame.ffill() is the supported spelling of fillna(method='ffill'),
### which newer pandas releases deprecate. Gaps at the very top of the frame
### have no previous value and therefore stay missing.
casualty_cols = ['num_injured', 'num_fatalities']
data_new[casualty_cols] = data_new[casualty_cols].ffill()

### Free-text victim descriptors: mark missing entries as 'Unknown'.
descriptor_cols = ['entity_name', 'specific_victim', 'victim_nationaliy']
data_new[descriptor_cols] = data_new[descriptor_cols].fillna('Unknown')

### Victim sub-category: fall back to the generic civilian label.
data_new['victim_subtype_cata'] = data_new['victim_subtype_cata'].fillna('Unnamed Civilian/Unspecified')

In [None]:
### How many city entries read 'Unknown', read 'unknown', or are missing altogether
for placeholder in ('Unknown', 'unknown'):
    print(data_new.loc[data_new['city'] == placeholder, 'city'].count())
print(len(data_new.loc[data_new['city'].isna(), 'city']))

In [None]:
### Fold the lower-case spelling and the missing entries into a single 'Unknown' label
data_new['city'] = data_new['city'].replace('unknown', 'Unknown')
data_new['city'] = data_new['city'].fillna('Unknown')

### Share of rows whose city is still 'Unknown'
unknown_city_rows = data_new[data_new['city'] == 'Unknown']
percent_city_unknown = len(unknown_city_rows) / len(data_new) * 100
print(f'The percentage of total unkown values in the city columns {percent_city_unknown}')

In [None]:
data.city.value_counts()

In [None]:
### Rows whose city is 'Unknown' even though both coordinates are present
has_coordinates = data_new['longitude'].notna() & data_new['latitude'].notna()
city_is_unknown = data_new['city'] == 'Unknown'
unknown_city_data = data_new[has_coordinates & city_is_unknown][['longitude', 'latitude', 'city']]
unknown_city_data.head()

In [None]:
print(f'The length of data which have unknown value for city column whose latitude and longitude is present is {len(unknown_city_data)}')

### Multiprocessing 

In [None]:
# Geocoding client used by update_city for the reverse (coordinates -> address) lookups.
# NOTE(review): "http" is a very generic user_agent; the Nominatim service presumably
# expects an application-specific identifier — confirm before running bulk lookups.
geolocator = Nominatim(user_agent="http")

def update_city(row):
    """Replace an 'Unknown' city with the reverse-geocoded city name.

    The row's latitude/longitude are sent to the module-level ``geolocator``;
    when the returned address has a non-empty 'city' entry that ``langid``
    classifies as English, every 'Unknown' in ``row['city']`` is replaced by it.

    Parameters
    ----------
    row : mapping-like
        One record offering 'latitude', 'longitude' and 'city' entries
        (e.g. a row handed over by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The same ``row`` object, updated when a usable city name was found and
    left untouched otherwise. The row is returned on failure as well, so the
    result of ``apply`` keeps one well-formed record per input row.
    """
    try:
        coordinates = str(row['latitude']) + "," + str(row['longitude'])
        location = geolocator.reverse(coordinates, exactly_one=True)

        # Guard against a lookup that produced no location object at all.
        address = location.raw['address'] if location is not None else {}
        city = address.get('city', '')

        # Keep only non-empty city names written in English; an empty name
        # must not overwrite the 'Unknown' marker with a blank string.
        if city and langid.classify(city)[0] == 'en':
            row['city'] = row['city'].replace('Unknown', str(city))
    except Exception as e:
        # Best-effort lookup: report the problem and fall through so the
        # unchanged row is still returned.
        print(e)

    return row
        
        
def update_city2(df):
    """Apply update_city to every row of ``df`` and return what ``apply`` produces."""
    return df.apply(update_city, axis=1)


### First batch: rows 0-3999 of the unknown-city data, split into two chunks
### that are geocoded in separate worker processes.
df1 = unknown_city_data[0:2000]
df2 = unknown_city_data[2000:4000]
#df1 = unknown_city_data[4000:6000]
#df2 = unknown_city_data[6000:]

if __name__ == '__main__':
    df = unknown_city_data
    # The with-block guarantees the worker processes are cleaned up even
    # when map() raises; close()/join() still give a graceful shutdown on
    # the normal path.
    with Pool() as p:
        result = p.map(update_city2, [df1, df2])
        p.close()
        p.join()
    print(result)

In [None]:
pd.concat(result,axis =0).to_csv('Geocoder_city.csv')

In [None]:
### Second batch: rows 4000 onwards of the unknown-city data, again split
### into two chunks that are geocoded in separate worker processes.
df1 = unknown_city_data[4000:6000]
df2 = unknown_city_data[6000:]

if __name__ == '__main__':
    df = unknown_city_data
    # The with-block guarantees the worker processes are cleaned up even
    # when map() raises; close()/join() still give a graceful shutdown on
    # the normal path.
    with Pool() as p:
        result = p.map(update_city2, [df1, df2])
        p.close()
        p.join()
    print(result)

In [None]:
pd.concat(result,axis =0).to_csv('Geocoder_city1.csv')

In [None]:
### Load both geocoding result files and stack them into one frame
Geocoder_data_part1 = pd.read_csv('./Geocoder_city.csv')
Geocoder_data_part2 = pd.read_csv('./Geocoder_city1.csv')
city_data = pd.concat([Geocoder_data_part1, Geocoder_data_part2], axis=0)

### Keep only the rows where a city name is present and is not 'Unknown'
city_is_resolved = city_data['city'].ne('Unknown') & city_data['city'].notnull()
city_data = city_data[city_is_resolved]

In [None]:
city_data

In [None]:
### Rows that lack both latitude and longitude although city and country are known
coordinates_missing = data_new['longitude'].isna() & data_new['latitude'].isna()
place_is_known = (data_new['city'] != 'Unknown') & (data_new['country_name'] != 'Unknown')
lat_long_missing = data_new[coordinates_missing & place_is_known][['longitude', 'latitude', 'city', 'country_name']]
len(lat_long_missing)

In [None]:
lat_long_missing.head()

In [None]:
# from geopy.geocoders import Nominatim
# geolocator = Nominatim(user_agent="my_user_agent")
# def find_lat_long(row):
#     try:
#         city = str(row['city'])
#         country = str(row['country_name'])
#         loc = geolocator.geocode(city+','+ country)
#         #loc = geolocator.geocode(country,timeout=10)
#         row['longitude'] = loc.longitude
#         row['latitude'] = loc.latitude
#         return row
#     except:
#         pass

In [None]:
#lat_long_missing[:5].apply(find_lat_long , axis = 1)

In [None]:
# def update_lat_long(df):
#     df_2 = df.apply(find_lat_long , axis = 1)
#     return df_2

In [None]:
# df1 = lat_long_missing[:10]
# df2 = lat_long_missing[10:20]

# df1 = unknown_city_data[:1000]
# df2 = unknown_city_data[1000:2000]

In [None]:
# if __name__ == '__main__':
#     df = lat_long_missing
#     p = Pool()
#     result = p.map(update_lat_long,[df1,df2
#                                  ])
#     p.close()
#     p.join()
#     print(result)
    
    
# if __name__ == '__main__':
#     df = unknown_city_data
#     p = Pool()
#     result = p.map(update_city2,[df1,df2
#                                  ])
#     p.close()
#     p.join()
#     print(result)

In [None]:
#pd.concat(result,axis =0)

In [None]:
#pd.concat(result,axis =0).to_csv('Geocoder_data.csv')

In [None]:

# ### Function for finding out the city name from lattude and longitude
# def update_city(row):
#   Longitude = str(row['longitude'])
#   Latitude = str(row['latitude'])
#   location = geolocator.reverse(Latitude+","+Longitude, exactly_one=True)
#   address = location.raw['address']
#   city = address.get('state', '')

# ### Selecting the city name only in english language
#   if langid.classify(city)[0] =='en':
#     row['city'] = row['city'].replace('Unknown', str(city))
#   else:
#     row['city'] = row['city']
#   return row
#   #return row['city']

In [None]:
# #geolocator = Nominatim(user_agent="geoapiExercises")
# geolocator = Nominatim(user_agent="http")
# ### Function for finding out the city name from lattude and longitude
# def update_city(row):
    
#   Longitude = str(row['longitude'])
#   Latitude = str(row['latitude'])
#   location = geolocator.reverse(Latitude+","+Longitude, exactly_one=True)
#   address = location.raw['address']
#   city = address.get('state', '')

# ### Selecting the city name only in english language
#   if langid.classify(city)[0] =='en':
#     row['city'] = row['city'].replace('Unknown', str(city))
#   else:
#     row['city'] = row['city']
#   return row
#   #return row['city']

### Multiprocessing

In [None]:
#pd.concat(result,axis =0).to_csv('Geocoder_city1.csv')

In [None]:
pd.concat(result,axis =0)

In [None]:
#unknown_city_data[0:5].apply(update_city , axis = 1)

In [None]:
#multiprocessing.cpu_count()

In [None]:
# def update_city2(df):
#     df2 = df.apply(update_city , axis = 1)
#     return df2

In [None]:
# df1 = unknown_city_data[7010:7020]
# df2 = unknown_city_data[7020:7030]

In [None]:
# if __name__ == '__main__':
#     df = unknown_city_data
#     p = Pool()
#     result = p.map(update_city2,[df1,df2
#                                  ])
#     p.close()
#     p.join()
#     print(result)

In [None]:
#pd.concat(result,axis =0)

In [None]:
#x.head(50)

In [None]:
#pd.concat(result,axis =0).to_csv('Geocoder_data_part4.csv')

In [None]:
# import multiprocessing
# import time

# def update_city2(df,queue):
#     df2 = df.apply(update_city , axis = 1)
#     queue.put(df)
#     #return df2

# queue = multiprocessing.SimpleQueue()
# tasks = [df1,df2]
                                 

# for task in tasks:
#     multiprocessing.Process(target=update_city2, args=(task, queue,)).start()

# for _ in tasks:
#     #print(pd.concat(queue.get(),axis = 0))
#     print(queue.get())

In [None]:
# import multiprocessing
# import time

# def worker(x, queue):
#     time.sleep(1)
#     queue.put(x)

# queue = multiprocessing.SimpleQueue()
# tasks = range(10)

# for task in tasks:
#     multiprocessing.Process(target=worker, args=(task, queue,)).start()

# for _ in tasks:
#     print(queue.get())

In [None]:
# def update_city(row):
#   Longitude = str(row['longitude'])
#   Latitude = str(row['latitude'])
#   location = geolocator.reverse(Latitude+","+Longitude, exactly_one=True)
#   address = location.raw['address']
#   city = address.get('state', '')
#   row['city'] = row['city'].replace('Unknown', str(city))
#   return row['city']

In [None]:
# unknown_city_data.apply(update_city , axis = 1)

In [None]:
# y = pd.DataFrame(x.apply(update_city , axis = 1),columns =['city'])
# y

In [None]:
data.city.value_counts()

In [None]:
((data_new.isna().sum()/data_new.shape[0])*100).sort_values(ascending= False)

In [None]:
data_new.describe()

### Updating incorrect values

In [None]:
### checking out the columns that contains 0 values for day and month 
data_new[(data_new.month == 0) | (data_new.day ==0)].head()

In [None]:
### Replacing month 0 values with month 12.
### The result is assigned back to the column: calling replace(..., inplace=True)
### on a column pulled out of the frame is a chained assignment that pandas does
### not guarantee to write through to data_new.
data_new['month'] = data_new['month'].replace(0, 12)

### Replacing day 0 values with day 31.
### NOTE(review): day 31 does not exist in every month — confirm this is
### acceptable before building real dates from year/month/day.
data_new['day'] = data_new['day'].replace(0, 31)

## Exploratory Analysis and Visualisation

In [None]:
### Yearly number of terrorist attacks, coloured by the value of the success column
plt.rcParams['figure.figsize'] = (20, 5)
histogram_options = dict(
    x='year',
    color='success',
    color_discrete_sequence=['green', 'grey'],
    title='Number of terrorist attack per year',
)
fig = px.histogram(data_new, **histogram_options)
fig.update_layout(bargap=0.1)
fig.update_xaxes(tickangle=-90)
fig.show()

In [None]:
### Attacks per region and the top 20 countries where terrorist attacks happened

plt.figure(figsize=(17, 10))

# Left panel: share of attacks per region as a pie chart; the labels go into
# the legend instead of being drawn around the pie.
ax1 = plt.subplot(121)
colors = sns.color_palette('pastel')[0:10] + sns.color_palette('bright')[0:2]
region_counts = data_new.region.value_counts()
region_counts.plot(kind='pie', autopct="%.1f%%", pctdistance=1.09, colors=colors, labels=None)
ax1.legend(labels=region_counts.index, bbox_to_anchor=(1, 1), loc=2)
ax1.set_title('Total attacks per each region', size=14)

# Right panel: the 20 countries with the most recorded events. The limit is 20
# so that the chart agrees with its title; the counts are computed only once.
ax2 = plt.subplot(122)
top_countries = (
    data_new.groupby('country_name').eventid.count()
    .sort_values(ascending=False)
    .head(20)
)
sns.barplot(x=top_countries.values, y=top_countries.index)
ax2.set_title('Top 20 countries have higher number of attacks', size=14)

plt.tight_layout()
plt.show()

In [None]:
### Method of attack, bars ordered from the most to the least frequent type
plt.rcParams['figure.figsize'] = (15, 8)
attack_type_order = data_new.attack_type.value_counts().index
sns.countplot(data=data_new, x="attack_type", order=attack_type_order)
plt.title('Methode of attack', size=14)
plt.xticks(rotation=90)
plt.show()

In [None]:
### Victim categories, bars ordered from the most to the least frequent category
plt.rcParams['figure.figsize'] = (20, 8)
victim_type_order = data_new.victim_cata.value_counts().index
sns.countplot(data=data_new, x="victim_cata", order=victim_type_order)
plt.title('Types of victims', size=14)
plt.xticks(rotation=90)
plt.show()

In [None]:
### Yearly trends: fatalities, injuries, number of events and number of events
### flagged with property == 1, all drawn on the same axes.
plt.rcParams['figure.figsize'] = (20, 5)
plt.title('Trends in Data (Understanding spikes)', size = 14)
yearly = data_new.groupby('year')
yearly['num_fatalities'].sum().plot(label='Fatalities')
yearly['num_injured'].sum().plot(label='Injured')
yearly['eventid'].count().plot(label='Attacks')
data_new[data_new.property == 1].groupby('year')['property'].sum().plot(label='Attacks with property damage')
# Each line carries a label so the legend can tell the four series apart.
plt.legend()
plt.show()

In [None]:
### Perpetrator groups ranked 2nd to 25th by number of attacks.
### NOTE(review): the most frequent entry is skipped — presumably the
### 'Unknown' group; confirm against the data.
plt.rcParams['figure.figsize'] = (12, 10)
group_counts = data_new['perpetrator_group'].value_counts().iloc[1:25]
# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally.
sns.barplot(x=group_counts.values, y=group_counts.index)
plt.show()

In [None]:
### Split the events on the extended flag: rows where it equals 1, and all the others
is_extended = data_new['extended'] == 1
extended_attack_df = data_new[is_extended]
non_extended_attack_df = data_new[~is_extended]

In [None]:
### Attack types for extended vs. non-extended events, side by side.
plt.figure(figsize=(20, 5))

# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally. Each value_counts() is computed once and reused.
ax1 = plt.subplot(121)
extended_counts = extended_attack_df['attack_type'].value_counts()
sns.barplot(x=extended_counts.values, y=extended_counts.index, color='blue')
ax1.set_title('Attack extended more than 24 hours ', size=14)

ax2 = plt.subplot(122)
non_extended_counts = non_extended_attack_df['attack_type'].value_counts()
sns.barplot(x=non_extended_counts.values, y=non_extended_counts.index, color='green')
ax2.set_title('Attack not extended more than 24 hours ', size=14)

plt.tight_layout()
plt.show()

In [None]:
data_new.columns

In [None]:
### Events flagged with property == 1; basis for the top 10 countries by property damage
has_property_damage = data_new['property'] == 1
property_damage = data_new[has_property_damage]

In [None]:
### Countries ranked by their number of property-damage events; only the
### first 10 are shown, matching the "top 10" announced for this analysis.
property_damage.groupby('country_name')['property'].sum().sort_values(ascending=False).head(10)