# Project 2: Global Terrorism Data Analysis

In [None]:
# EDA

import pandas as pd
import statsmodels.api as sm
import numpy as np
from sklearn.impute import SimpleImputer
import plotly.express as px

# Load the Global Terrorism Database export.
# - encoding='latin-1' works around the encoding error hit when first reading this file;
#   latin-1 is a common fit for multilingual data and older Excel exports.
# - engine='python' together with on_bad_lines='warn' skips malformed rows (emitting a
#   warning) so the rest of the dataset still loads.
df = pd.read_csv(
    "/content/globalterrorismdb_0718dist.csv",
    encoding='latin-1',
    engine='python',
    on_bad_lines='warn',
)

# Remove the cap on displayed columns so the preview below shows every column.
pd.set_option('display.max_columns', None)

# Preview the first 100 rows.
print(df.head(100))

# List every column name, one per line.
for column_name in df.columns:
    print(column_name)


# EDA steps: shape, missing values, duplicates, summary statistics.

# 1) Shape of the dataset as (rows, columns).
print(df.shape)

# 2) Missing values: fill the gaps in "approxdate" and "related" with each column's most
#    frequent value, then report how many nulls remain in every column.
# NOTE(review): mode-filling writes the same value into every previously-empty row of
# these two columns; confirm that is acceptable before using them in any analysis.
imputer = SimpleImputer(strategy='most_frequent')
df[["approxdate", "related"]] = imputer.fit_transform(df[["approxdate", "related"]])
print(df.isnull().sum())

# 3) Duplicates: this notebook only does descriptive analysis and visualisation, so
#    duplicate rows are reported but not dropped (they would be removed before modelling).
#    df1 is the per-row boolean mask; printing its sum gives the duplicate count instead
#    of a long True/False listing.
df1 = df.duplicated()
print(f"Duplicate rows: {df1.sum()} of {len(df1)}")

# 4) Summary statistics.
print(df.describe())




  df = pd.read_csv("/content/globalterrorismdb_0718dist.csv", encoding='latin-1', engine='python', on_bad_lines='warn')


         eventid iyear imonth iday approxdate extended resolution  country  \
0   197000000001  1970      7    2        NaN        0        NaN       58   
1   197000000002  1970      0    0        NaN        0        NaN      130   
2   197001000001  1970      1    0        NaN        0        NaN      160   
3   197001000002  1970      1    0        NaN        0        NaN       78   
4   197001000003  1970      1    0        NaN        0        NaN      101   
..           ...   ...    ...  ...        ...      ...        ...      ...   
95  197002280001  1970      2   28        NaN        0        NaN      102   
96  197003000001  1970      3    0        NaN        0        NaN      160   
97  197003010001  1970      3    1        NaN        0        NaN       98   
98  197003010005  1970      3    1        NaN        0        NaN      217   
99  197003010006  1970      3    1        NaN        0        NaN      217   

           country_txt  region                   region_txt  \


In [None]:
# 1. Which countries experience the highest number of terrorist attacks?
# Tally attacks per country; value_counts() returns the tallies in descending order.
top_countries = df.loc[:, 'country_txt'].value_counts()

# Show the ten countries with the most recorded attacks.
print(top_countries.iloc[:10])
# Result: Iraq has the highest number of recorded terrorist attacks.

country_txt
Iraq              24974
Pakistan          14527
Afghanistan       13020
India             12068
Colombia           8313
Philippines        7012
Peru               6096
El Salvador        5320
United Kingdom     5256
Turkey             4305
Name: count, dtype: int64


In [None]:
# 2. What are the most common types of terrorist attacks (bombings, armed assaults, etc.)?
# Tally attacks per primary attack type, most common first.
types_terrorist_attacks = df.loc[:, 'attacktype1_txt'].value_counts()
print(types_terrorist_attacks.iloc[:10])

# Result: Bombing/Explosion is the most common attack type, followed by Armed Assault,
# Assassination, Hostage Taking (Kidnapping), and so on.

attacktype1_txt
Bombing/Explosion                      89190
Armed Assault                          43162
Assassination                          19437
Hostage Taking (Kidnapping)            11330
Facility/Infrastructure Attack         10427
Unknown                                 7388
Unarmed Assault                         1024
Hostage Taking (Barricade Incident)     1000
Hijacking                                664
Name: count, dtype: int64


In [None]:
# 3. Is terrorism increasing or decreasing over the years?
import plotly.express as px

# Count attacks for every (year, region) pair.
yearly_counts = (
    df.groupby(['iyear', 'region_txt'])
    .size()
    .reset_index(name='attack_count')
)

# Bar chart of attack counts per year, coloured by region.
inny = px.bar(
    data_frame=yearly_counts,
    x='iyear',
    y='attack_count',
    color='region_txt',
    labels={'iyear': 'Year', 'attack_count': 'Attack Count'},
    title='Terrorism Trend By Year',
)
inny.show()

# Reading the chart: terrorism has been increasing over the years.

In [None]:
# 4. Which terrorist organizations are the most active?
# Tally attacks per perpetrator group name, most active first.
terrorist_org = df.loc[:, 'gname'].value_counts()
print(terrorist_org.iloc[:10])

# Result: the "Unknown" label leads the raw tally; among the named groups,
# the Taliban are the most active.

gname
Unknown                                             83566
Taliban                                              7676
Islamic State of Iraq and the Levant (ISIL)          5789
Shining Path (SL)                                    4555
Farabundo Marti National Liberation Front (FMLN)     3351
Al-Shabaab                                           3337
New People's Army (NPA)                              2815
Irish Republican Army (IRA)                          2671
Revolutionary Armed Forces of Colombia (FARC)        2489
Boko Haram                                           2460
Name: count, dtype: int64


In [None]:
# 5. Are there any seasonal trends in terrorist attacks?
import plotly.express as px

# Total attacks per calendar month across the whole dataset.
# - Only months 1-12 are kept: the data preview shows imonth == 0 on some rows, which is
#   not a calendar month and would otherwise appear as its own bar.
# - Counts are aggregated per month over all countries, so every month is represented by
#   its full total rather than by a handful of the largest month/country pairs.
monthly_counts = (
    df[df['imonth'].between(1, 12)]
    .groupby('imonth')
    .size()
    .reset_index(name='attack_count')
)

inny = px.bar(
    monthly_counts,
    x='imonth',
    y='attack_count',
    title='Terrorism  Trend By Season',
    labels={'imonth': 'Month', 'attack_count': 'Attack Count'},
)
inny.show()

# NOTE(review): an earlier reading of this question pointed to months 4 and 5 (spring in
# the northern hemisphere) as the peak; re-check that conclusion against the full monthly
# totals plotted here before reporting a seasonal trend.

In [None]:
# 6. Which terrorist group is the most dominant in Nigeria?
import plotly.express as px

# Restrict the data to attacks recorded in Nigeria.
nigeria = df.loc[df['country_txt'] == 'Nigeria']

# Number of attacks attributed to each group within Nigeria.
grouped_nigeria = (
    nigeria.groupby('gname')
    .size()
    .reset_index(name='attack_count')
)

# Plot the 20 groups with the most attacks, one colour per group.
fig = px.bar(
    data_frame=grouped_nigeria.sort_values(by='attack_count', ascending=False).iloc[:20],
    x='gname',
    y='attack_count',
    color='gname',
    labels={'gname': 'Terrorist Group', 'attack_count': 'Attack Count'},
    title='Terrorism Trend by Terrorist Group in Nigeria',
)
fig.show()

# Reading the chart: Boko Haram is the most dominant group in Nigeria.

In [None]:
# 7. What is the most frequent target of the groups in (6) above?
import plotly.express as px

# Restrict the data to attacks recorded in Nigeria (recomputed here so this cell does not
# depend on the previous one having been run).
nigeria = df[df['country_txt'] == 'Nigeria']

# Number of attacks for every (group, target) pair within Nigeria.
# NOTE(review): target1 is used exactly as stored; if it holds free text, differently
# spelled entries for the same target are counted separately - confirm against the data.
grouped_nigeria = nigeria.groupby(['gname', 'target1']).size().reset_index(name='attack_count')

# Plot the 20 largest (group, target) pairs; colour identifies the target. The title and
# legend label describe targets, so the chart is not confused with the per-group chart.
fig = px.bar(
    grouped_nigeria.sort_values(by='attack_count', ascending=False).head(20),
    x='gname',
    y='attack_count',
    color='target1',
    title='Most Frequent Targets by Terrorist Group in Nigeria',
    labels={'gname': 'Terrorist Group', 'attack_count': 'Attack Count', 'target1': 'Target'},
)
fig.show()

# Reading the chart: Boko Haram's most frequent targets are village areas.