# Pyspark

In [2]:
# Must be included at the beginning of each new notebook.
# findspark makes the local Spark installation importable as `pyspark`.
import os

import findspark

# Prefer the SPARK_HOME environment variable so the notebook is not tied to a
# single machine; fall back to the original install location when it is unset
# or empty.
findspark.init(os.environ.get('SPARK_HOME') or '/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession

# Reuse an already running session if there is one, otherwise create a new
# session whose application name is 'GlobalTerrorism'.
spark = SparkSession.builder.appName('GlobalTerrorism').getOrCreate()

# Importing Dataset

In [3]:
# Both files are read the same way: the first line supplies the column names
# and Spark infers each column's type from the data.
# Note the numbering is swapped: df1 comes from Dataset_2.csv and df2 from
# Dataset_1.csv.
csv_options = dict(header=True, inferSchema=True)
df1 = spark.read.csv('Dataset_2.csv', **csv_options)
df2 = spark.read.csv('Dataset_1.csv', **csv_options)

# Preview each frame with show()'s default settings.
for frame in (df1, df2):
    frame.show()

+-----+------+-----+-------+-----------+------+--------------------+-----------------+----------+-----+-----+-----+-------+-----------+--------------------+---------+--------------------+-------+-----------------+--------------------+----------+---------+-------------+--------------------+-----+---------+------+
|iyear|imonth|idate|country|country_txt|region|          region_txt|        provstate|      city|crit1|crit2|crit3|success|attacktype1|     attacktype1_txt|targtype1|       targtype1_txt|natlty1|      natlty1_txt|               gname|individual|weaptype1|weaptype1_txt|          weapdetail|nkill|ishostkid|ransom|
+-----+------+-----+-------+-----------+------+--------------------+-----------------+----------+-----+-----+-----+-------+-----------+--------------------+---------+--------------------+-------+-----------------+--------------------+----------+---------+-------------+--------------------+-----+---------+------+
| 2000|     1|    1|    139|    Namibia|    11|  Sub-Sahar

# Size of Dataframe 1

In [4]:
# Number of columns in df1 (the Spark frame read from 'Dataset_2.csv').
len(df1.columns)

27

In [5]:
# Number of rows in df1.
df1.count()

12273

# Size of Dataframe 2

In [6]:
# Number of columns in df2 (the Spark frame read from 'Dataset_1.csv').
len(df2.columns)

27

In [7]:
# Number of rows in df2.
df2.count()

99583

# Merging two datasets

In [8]:
# DataFrame.union pairs columns by position, not by name, so make sure both
# frames share the same column layout before stacking them.
assert df1.columns == df2.columns, 'df1 and df2 must have identical columns to be unioned'

# `union` replaces `unionAll`, which is deprecated since Spark 2.0; like
# unionAll it keeps duplicate rows (SQL UNION ALL semantics).
terrorismdf = df1.union(df2)

In [9]:
# Number of columns in the merged Spark frame.
len(terrorismdf.columns)

27

In [10]:
# Number of rows in the merged Spark frame; it should equal
# df1.count() + df2.count() because the union keeps every row.
terrorismdf.count()

111856

# First row of dataframe 1

In [11]:
# head() with no argument returns the first Row; iterating it yields that
# record's values in column order. The row is fetched once only (a bare
# `df1.head()` statement in the middle of a cell is neither displayed nor used).
for item in df1.head():
    print(item)

2000
1
1
139
Namibia
11
Sub-Saharan Africa
Kavango
Muitjiku
1
1
1
1
2
Armed Assault
1
Business
139
Namibia
National Union for the Total Independence of Angola (UNITA)
0
5
Firearms
None
0
0
None


# First row of dataframe 2

In [12]:
# head() with no argument returns the first Row; iterating it yields that
# record's values in column order. The row is fetched once only (a bare
# `df2.head()` statement in the middle of a cell is neither displayed nor used).
for item in df2.head():
    print(item)

2007
1
1
155
West Bank and Gaza Strip
10
Middle East & North Africa
Gaza Strip
Unknown
1
1
1
1
6
Hostage Taking (Kidnapping)
10
Journalists & Media
159
Peru
Unknown
0
5
Firearms
None
0
1
0


# First row of Merged dataframe

In [13]:
# head() with no argument returns the first Row; iterating it yields that
# record's values in column order. The row is fetched once only (a bare
# `terrorismdf.head()` statement in the middle of a cell is neither displayed
# nor used).
for item in terrorismdf.head():
    print(item)

2000
1
1
139
Namibia
11
Sub-Saharan Africa
Kavango
Muitjiku
1
1
1
1
2
Armed Assault
1
Business
139
Namibia
National Union for the Total Independence of Angola (UNITA)
0
5
Firearms
None
0
0
None


# Schema

In [14]:
# Get an idea of what the data looks like: print every column of the merged
# frame with its inferred type and nullability.
terrorismdf.printSchema()

root
 |-- iyear: integer (nullable = true)
 |-- imonth: integer (nullable = true)
 |-- idate: integer (nullable = true)
 |-- country: integer (nullable = true)
 |-- country_txt: string (nullable = true)
 |-- region: integer (nullable = true)
 |-- region_txt: string (nullable = true)
 |-- provstate: string (nullable = true)
 |-- city: string (nullable = true)
 |-- crit1: integer (nullable = true)
 |-- crit2: integer (nullable = true)
 |-- crit3: integer (nullable = true)
 |-- success: integer (nullable = true)
 |-- attacktype1: integer (nullable = true)
 |-- attacktype1_txt: string (nullable = true)
 |-- targtype1: integer (nullable = true)
 |-- targtype1_txt: string (nullable = true)
 |-- natlty1: integer (nullable = true)
 |-- natlty1_txt: string (nullable = true)
 |-- gname: string (nullable = true)
 |-- individual: integer (nullable = true)
 |-- weaptype1: integer (nullable = true)
 |-- weaptype1_txt: string (nullable = true)
 |-- weapdetail: string (nullable = true)
 |-- nkill: str

In [15]:
# Column names of the merged Spark frame, as a plain Python list.
terrorismdf.columns

['iyear',
 'imonth',
 'idate',
 'country',
 'country_txt',
 'region',
 'region_txt',
 'provstate',
 'city',
 'crit1',
 'crit2',
 'crit3',
 'success',
 'attacktype1',
 'attacktype1_txt',
 'targtype1',
 'targtype1_txt',
 'natlty1',
 'natlty1_txt',
 'gname',
 'individual',
 'weaptype1',
 'weaptype1_txt',
 'weapdetail',
 'nkill',
 'ishostkid',
 'ransom']

# Dropping duplicate columns

In [31]:
# Remove the numeric code columns that have a matching *_txt column
# (country, region, attacktype1, targtype1, natlty1, weaptype1).
# NOTE(review): crit1, crit2, crit3, success and nkill are dropped as well,
# although they have no *_txt counterpart in the column list -- confirm that
# discarding them is intended.
duplicate_columns = ['country', 'region', 'attacktype1', 'targtype1', 'natlty1', 'weaptype1', 'crit1', 'crit2', 'crit3', 'success', 'nkill']
terrorismdf = terrorismdf.drop(*duplicate_columns)
# Show the columns that remain after the drop.
terrorismdf.columns

['iyear',
 'imonth',
 'idate',
 'country_txt',
 'region_txt',
 'provstate',
 'city',
 'attacktype1_txt',
 'targtype1_txt',
 'natlty1_txt',
 'gname',
 'individual',
 'weaptype1_txt',
 'weapdetail',
 'ishostkid',
 'ransom']

In [32]:
# Number of columns left in terrorismdf after dropping the listed columns.
len(terrorismdf.columns)

16

In [33]:
# Row count after the column drop; dropping columns should leave it unchanged.
terrorismdf.count()

111856

# Renaming Columns 

In [38]:
# (current name, readable name) pairs, applied in this order.
column_renames = [
    ('iyear', 'Year'),
    ('imonth', 'Month'),
    ('idate', 'Date'),
    ('country_txt', 'Country'),
    ('region_txt', 'Region'),
    ('provstate', 'Province'),
    ('city', 'City'),
    ('attacktype1_txt', 'Attack'),
    ('targtype1_txt', 'Target'),
    ('natlty1_txt', 'Nationality'),
    ('gname', 'Group'),
    ('individual', 'Individual'),
    ('weaptype1_txt', 'Weapon'),
    ('weapdetail', 'Weapon Detail'),
    ('ishostkid', 'Ishostkid'),
    ('ransom', 'Ransom'),
]

# withColumnRenamed returns a new frame each time, so rebind on every step.
for current_name, readable_name in column_renames:
    terrorismdf = terrorismdf.withColumnRenamed(current_name, readable_name)

terrorismdf.columns

['Year',
 'Month',
 'Date',
 'Country',
 'Region',
 'Province',
 'City',
 'Attack',
 'Target',
 'Nationality',
 'Group',
 'Individual',
 'Weapon',
 'Weapon Detail',
 'Ishostkid',
 'Ransom']

# Importing libraries for better visualisation of dataframes

pandas is imported and the two datasets are loaded a second time as pandas DataFrames.
This makes the data easier to inspect and visualise: the same data preparation, transformation and cleaning steps are applied to both the pandas and the PySpark dataframes, because PySpark does not offer convenient visualisation facilities.

In [16]:
# Import the libraries.
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np

In [17]:
# Load the same two CSV files with pandas for inspection and visualisation.
# The numbering mirrors the Spark frames: df1_pd <- Dataset_2.csv and
# df2_pd <- Dataset_1.csv.
df1_pd = pd.read_csv('Dataset_2.csv')
df2_pd = pd.read_csv('Dataset_1.csv')

# Report (rows, columns) for each file.
for frame in (df1_pd, df2_pd):
    print(frame.shape)

(12273, 27)
(99583, 27)


In [18]:
# Stack the two datasets on top of each other.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; ignore_index=True renumbers the rows 0..n-1 exactly as
# the append call did.
terrorismdf_pd = pd.concat([df1_pd, df2_pd], ignore_index=True)

# Print the number of rows and columns.
print(terrorismdf_pd.shape)

# Print the column names to decide which ones need more meaningful names.
print(terrorismdf_pd.columns)

(111856, 27)
Index(['iyear', 'imonth', 'idate', 'country', 'country_txt', 'region',
       'region_txt', 'provstate', 'city', 'crit1', 'crit2', 'crit3', 'success',
       'attacktype1', 'attacktype1_txt', 'targtype1', 'targtype1_txt',
       'natlty1', 'natlty1_txt', 'gname', 'individual', 'weaptype1',
       'weaptype1_txt', 'weapdetail', 'nkill', 'ishostkid', 'ransom'],
      dtype='object')


In [19]:
# First two rows of the pandas frame loaded from 'Dataset_2.csv'.
df1_pd.head(2)

Unnamed: 0,iyear,imonth,idate,country,country_txt,region,region_txt,provstate,city,crit1,...,natlty1,natlty1_txt,gname,individual,weaptype1,weaptype1_txt,weapdetail,nkill,ishostkid,ransom
0,2000,1,1,139,Namibia,11,Sub-Saharan Africa,Kavango,Muitjiku,1,...,139.0,Namibia,National Union for the Total Independence of A...,0,5,Firearms,,0.0,0.0,
1,2000,1,1,139,Namibia,11,Sub-Saharan Africa,Kavango,Muitjiku,1,...,139.0,Namibia,National Union for the Total Independence of A...,0,5,Firearms,,0.0,1.0,0.0


In [20]:
# Last two rows of the pandas frame loaded from 'Dataset_2.csv'.
df1_pd.tail(2)

Unnamed: 0,iyear,imonth,idate,country,country_txt,region,region_txt,provstate,city,crit1,...,natlty1,natlty1_txt,gname,individual,weaptype1,weaptype1_txt,weapdetail,nkill,ishostkid,ransom
12271,2006,12,31,141,Nepal,6,South Asia,Eastern,Khadgapur,1,...,141.0,Nepal,Janatantrik Terai Mukti Morcha- Jwala Singh (J...,0,9,Melee,The attackers first threw rocks at the vehicle...,0.0,0.0,
12272,2006,12,31,141,Nepal,6,South Asia,Central,Sundarpur,1,...,141.0,Nepal,Janatantrik Terai Mukti Morcha (JTMM),0,13,Unknown,,1.0,1.0,0.0


In [21]:
# First two rows of the pandas frame loaded from 'Dataset_1.csv'.
df2_pd.head(2)

Unnamed: 0,iyear,imonth,idate,country,country_txt,region,region_txt,provstate,city,crit1,...,natlty1,natlty1_txt,gname,individual,weaptype1,weaptype1_txt,weapdetail,nkill,ishostkid,ransom
0,2007,1,1,155,West Bank and Gaza Strip,10,Middle East & North Africa,Gaza Strip,Unknown,1,...,159.0,Peru,Unknown,0,5,Firearms,,0.0,1.0,0.0
1,2007,1,1,205,Thailand,5,Southeast Asia,Bangkok ( District ),Bangkok,1,...,205.0,Thailand,Unknown,0,6,Explosives,,3.0,0.0,


In [22]:
# Last two rows of the pandas frame loaded from 'Dataset_1.csv'.
df2_pd.tail(2)

Unnamed: 0,iyear,imonth,idate,country,country_txt,region,region_txt,provstate,city,crit1,...,natlty1,natlty1_txt,gname,individual,weaptype1,weaptype1_txt,weapdetail,nkill,ishostkid,ransom
99581,2017,12,31,92,India,6,South Asia,Manipur,Imphal,1,...,92.0,India,Unknown,0,6,Explosives,A thrown grenade was used in the attack.,0.0,0.0,
99582,2017,12,31,160,Philippines,5,Southeast Asia,Maguindanao,Cotabato City,1,...,160.0,Philippines,Unknown,0,6,Explosives,An explosive device containing a detonating co...,0.0,0.0,


In [23]:
# First two rows of the combined pandas frame.
terrorismdf_pd.head(2)

Unnamed: 0,iyear,imonth,idate,country,country_txt,region,region_txt,provstate,city,crit1,...,natlty1,natlty1_txt,gname,individual,weaptype1,weaptype1_txt,weapdetail,nkill,ishostkid,ransom
0,2000,1,1,139,Namibia,11,Sub-Saharan Africa,Kavango,Muitjiku,1,...,139.0,Namibia,National Union for the Total Independence of A...,0,5,Firearms,,0.0,0.0,
1,2000,1,1,139,Namibia,11,Sub-Saharan Africa,Kavango,Muitjiku,1,...,139.0,Namibia,National Union for the Total Independence of A...,0,5,Firearms,,0.0,1.0,0.0


In [24]:
# Last two rows of the combined pandas frame.
terrorismdf_pd.tail(2)

Unnamed: 0,iyear,imonth,idate,country,country_txt,region,region_txt,provstate,city,crit1,...,natlty1,natlty1_txt,gname,individual,weaptype1,weaptype1_txt,weapdetail,nkill,ishostkid,ransom
111854,2017,12,31,92,India,6,South Asia,Manipur,Imphal,1,...,92.0,India,Unknown,0,6,Explosives,A thrown grenade was used in the attack.,0.0,0.0,
111855,2017,12,31,160,Philippines,5,Southeast Asia,Maguindanao,Cotabato City,1,...,160.0,Philippines,Unknown,0,6,Explosives,An explosive device containing a detonating co...,0.0,0.0,


# Initial Exploration and Visualisation 

In [25]:
# Correlation of every numeric column with the `country` code column.
# The numeric columns are selected explicitly: from pandas 2.0 on,
# DataFrame.corr() no longer skips text columns silently and raises on them,
# whereas select_dtypes works the same way on old and new pandas versions.
terrorismdf_pd.select_dtypes(include='number').corr()["country"]

iyear         -0.003994
imonth        -0.000114
idate         -0.000245
country        1.000000
region         0.142450
crit1         -0.028539
crit2         -0.025207
crit3         -0.036414
success       -0.024552
attacktype1    0.012913
targtype1      0.028591
natlty1        0.651784
individual     0.041537
weaptype1      0.002886
nkill         -0.000830
ishostkid      0.003088
ransom         0.014256
Name: country, dtype: float64

In [27]:
# Correlation of every numeric column with the `attacktype1` code column.
# The numeric columns are selected explicitly: from pandas 2.0 on,
# DataFrame.corr() no longer skips text columns silently and raises on them,
# whereas select_dtypes works the same way on old and new pandas versions.
terrorismdf_pd.select_dtypes(include='number').corr()["attacktype1"]

iyear          0.068961
imonth         0.011590
idate         -0.000096
country        0.012913
region        -0.014066
crit1          0.006579
crit2         -0.015658
crit3         -0.020647
success        0.054709
attacktype1    1.000000
targtype1      0.031974
natlty1        0.014219
individual     0.016337
weaptype1      0.715719
nkill          0.016229
ishostkid      0.165597
ransom         0.142915
Name: attacktype1, dtype: float64

In [34]:
# Drop the numeric code columns that mirror a *_txt column, together with the
# crit1-3, success and nkill columns, directly on the pandas frame.
duplicate_columns = [
    'country', 'region', 'attacktype1', 'targtype1', 'natlty1', 'weaptype1',
    'crit1', 'crit2', 'crit3', 'success', 'nkill',
]
terrorismdf_pd.drop(labels=duplicate_columns, axis=1, inplace=True)

# Show the remaining column names, then the (rows, columns) shape.
for summary in (terrorismdf_pd.columns, terrorismdf_pd.shape):
    print(summary)

Index(['iyear', 'imonth', 'idate', 'country_txt', 'region_txt', 'provstate',
       'city', 'attacktype1_txt', 'targtype1_txt', 'natlty1_txt', 'gname',
       'individual', 'weaptype1_txt', 'weapdetail', 'ishostkid', 'ransom'],
      dtype='object')
(111856, 16)


In [40]:
# Give the remaining columns short, readable titles (same names as used for
# the Spark frame).
readable_names = {
    'iyear': 'Year',
    'imonth': 'Month',
    'idate': 'Date',
    'country_txt': 'Country',
    'region_txt': 'Region',
    'provstate': 'Province',
    'city': 'City',
    'attacktype1_txt': 'Attack',
    'targtype1_txt': 'Target',
    'natlty1_txt': 'Nationality',
    'gname': 'Group',
    'individual': 'Individual',
    'weaptype1_txt': 'Weapon',
    'weapdetail': 'Weapon Detail',
    'ishostkid': 'Ishostkid',
    'ransom': 'Ransom',
}
terrorismdf_pd.rename(columns=readable_names, inplace=True)
terrorismdf_pd.head(2)

Unnamed: 0,Year,Month,Date,Country,Region,Province,City,Attack,Target,Nationality,Group,Individual,Weapon,Weapon Detail,Ishostkid,Ransom
0,2000,1,1,Namibia,Sub-Saharan Africa,Kavango,Muitjiku,Armed Assault,Business,Namibia,National Union for the Total Independence of A...,0,Firearms,,0.0,
1,2000,1,1,Namibia,Sub-Saharan Africa,Kavango,Muitjiku,Hostage Taking (Kidnapping),Business,Namibia,National Union for the Total Independence of A...,0,Firearms,,1.0,0.0


In [41]:
# Column names of the Spark frame (terrorismdf, not terrorismdf_pd);
# presumably printed to check that they match the renamed pandas columns.
print(terrorismdf.columns)

['Year', 'Month', 'Date', 'Country', 'Region', 'Province', 'City', 'Attack', 'Target', 'Nationality', 'Group', 'Individual', 'Weapon', 'Weapon Detail', 'Ishostkid', 'Ransom']


In [43]:
# value_counts() orders countries from most to fewest records, so the first
# index label is the country with the most recorded attacks.
attacks_per_country = terrorismdf_pd['Country'].value_counts()
most_attacked_country = attacks_per_country.index[0]
print('Country with Highest Terrorist Attacks:', most_attacked_country)

Country with Highest Terrorist Attacks: Iraq


In [44]:
# value_counts() orders regions from most to fewest records, so the first
# index label is the region with the most recorded attacks.
attacks_per_region = terrorismdf_pd['Region'].value_counts()
most_attacked_region = attacks_per_region.index[0]
print('Regions with Highest Terrorist Attacks:', most_attacked_region)

Regions with Highest Terrorist Attacks: Middle East & North Africa
