# ***Global Terrosism Dataset - Transforming and Loading the Data***
---

### **Setting the environment**

In [1]:
import os
print(os.getcwd())
try:
    os.chdir('../../GlobalTerrorismAnalysis_ETL')
except FileNotFoundError:
    print("""
        Posiblemente ya ejecutaste este bloque dos o más veces o tal vez el directorio está incorrecto. 
        ¿Ya ejecutaste este bloque antes y funcionó? Recuerda no ejecutarlo de nuevo. 
        ¿Estás en el directorio incorrecto? Puedes cambiarlo. 
        Recuerda el directorio donde estás:
        """)
print(os.getcwd())

c:\Users\marti\OneDrive\Escritorio - PC\Ingenieria de Datos e IA - UAO\Semestre 4\ETL\GlobalTerrorismAnalysis_ETL\notebooks
c:\Users\marti\OneDrive\Escritorio - PC\Ingenieria de Datos e IA - UAO\Semestre 4\ETL\GlobalTerrorismAnalysis_ETL


### **Libraries** 

In [2]:
import pandas as pd
from function.database.database import creating_engine, create_table

from sqlalchemy import text

### **Creating the engine and reading the data from the table**

In [3]:
engine = creating_engine()
query = 'SELECT * FROM global_terrorism_db_raw'

In [4]:
df = pd.read_sql_query(query, engine)
df

Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
0,197000000001,1970,7,2,,0,,58,Dominican Republic,2,...,,,,,PGIS,0,0,0,0,
1,197000000002,1970,0,0,,0,,130,Mexico,1,...,,,,,PGIS,0,1,1,1,
2,197001000001,1970,1,0,,0,,160,Philippines,5,...,,,,,PGIS,-9,-9,1,1,
3,197001000002,1970,1,0,,0,,78,Greece,8,...,,,,,PGIS,-9,-9,1,1,
4,197001000003,1970,1,0,,0,,101,Japan,4,...,,,,,PGIS,-9,-9,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181686,201712310020,2017,12,31,,0,,4,Afghanistan,6,...,,"""4 people injured in Farayb explosion,"" Pajhwo...",,,START Primary Collection,-9,-9,0,-9,
181687,201712310022,2017,12,31,,0,,182,Somalia,11,...,,"""Somalia: Al-Shabaab Militants Attack Army Che...","""Highlights: Somalia Daily Media Highlights 2 ...","""Highlights: Somalia Daily Media Highlights 1 ...",START Primary Collection,0,0,0,0,
181688,201712310030,2017,12,31,,0,,160,Philippines,5,...,,"""Maguindanao clashes trap tribe members,"" Phil...",,,START Primary Collection,0,0,0,0,
181689,201712310031,2017,12,31,,0,,92,India,6,...,,"""Trader escapes grenade attack in Imphal,"" Bus...",,,START Primary Collection,-9,-9,0,-9,


## ***Transforming the data***
---

### **Selecting the required columns**

In [5]:
columns_choice = [
    "eventid",
    "iyear",
    "imonth",
    "iday",
    "extended",
    "country_txt",
    "country",
    "region_txt",
    "region",
    "city",
    "latitude",
    "longitude",
    "vicinity",
    "crit1",
    "crit2",
    "crit3",
    "doubtterr",
    "multiple",
    "success",
    "suicide",
    "attacktype1_txt",
    "attacktype1",
    "targtype1_txt",
    "targtype1",
    "natlty1_txt",
    "natlty1",
    "gname",
    "guncertain1",
    "individual",
    "nperps",
    "nperpcap",
    "claimed",
    "weaptype1_txt",
    "weaptype1",
    "nkill",
    "property",
    "ishostkid",
    "INT_ANY"
]

In [6]:
df = df[columns_choice]

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181691 entries, 0 to 181690
Data columns (total 38 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   eventid          181691 non-null  int64  
 1   iyear            181691 non-null  int64  
 2   imonth           181691 non-null  int64  
 3   iday             181691 non-null  int64  
 4   extended         181691 non-null  int64  
 5   country_txt      181691 non-null  object 
 6   country          181691 non-null  int64  
 7   region_txt       181691 non-null  object 
 8   region           181691 non-null  int64  
 9   city             181256 non-null  object 
 10  latitude         177135 non-null  float64
 11  longitude        177134 non-null  float64
 12  vicinity         181691 non-null  int64  
 13  crit1            181691 non-null  int64  
 14  crit2            181691 non-null  int64  
 15  crit3            181691 non-null  int64  
 16  doubtterr        181690 non-null  floa

### **Dropping null values**

In [8]:
df.isnull().sum()

eventid                0
iyear                  0
imonth                 0
iday                   0
extended               0
country_txt            0
country                0
region_txt             0
region                 0
city                 435
latitude            4556
longitude           4557
vicinity               0
crit1                  0
crit2                  0
crit3                  0
doubtterr              1
multiple               1
success                0
suicide                0
attacktype1_txt        0
attacktype1            0
targtype1_txt          0
targtype1              0
natlty1_txt         1559
natlty1             1559
gname                  0
guncertain1          380
individual             0
nperps             71115
nperpcap           69489
claimed            66120
weaptype1_txt          0
weaptype1              0
nkill              10313
property               0
ishostkid            178
INT_ANY                0
dtype: int64

In [9]:
df = df.dropna()
df.shape

(95645, 38)

### **Atypical dates in the iday column**

The iday column has to be inspected, as it has 32 data, which may mean that there is a possible range of days that is out of the normal range of days: **1-31**.

In [10]:
for i in df['iday']:
    if i>31 or i<1:
        print(i)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [11]:
df = df[df['iday'] != 0]
day_count = len(df["iday"].unique())
day_count

31

### **Duplicated rows**

In [12]:
df.duplicated().sum()

np.int64(0)

### **No doubt about the terrorists attacks: cleaning the doubtterr column**

In [13]:
df["doubtterr"].unique()

array([ 0.,  1., -9.])

In [14]:
df = df.query("doubtterr == 0").copy()
df['doubtterr'].head()

5     0.0
8     0.0
9     0.0
13    0.0
17    0.0
Name: doubtterr, dtype: float64

The above should be done, since we only want to inquire about attacks that we are 100% sure are terrorist attacks.

## ***Loading the clean data***
---

### **Reviewing our dataframe**

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80333 entries, 5 to 181690
Data columns (total 38 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   eventid          80333 non-null  int64  
 1   iyear            80333 non-null  int64  
 2   imonth           80333 non-null  int64  
 3   iday             80333 non-null  int64  
 4   extended         80333 non-null  int64  
 5   country_txt      80333 non-null  object 
 6   country          80333 non-null  int64  
 7   region_txt       80333 non-null  object 
 8   region           80333 non-null  int64  
 9   city             80333 non-null  object 
 10  latitude         80333 non-null  float64
 11  longitude        80333 non-null  float64
 12  vicinity         80333 non-null  int64  
 13  crit1            80333 non-null  int64  
 14  crit2            80333 non-null  int64  
 15  crit3            80333 non-null  int64  
 16  doubtterr        80333 non-null  float64
 17  multiple        

### **Creating the table and loading the data in our PostgreSQL database**

In [16]:
create_table(engine, df, 'global_terrorism_db_cleaned')

### **Checking the loaded data**

In [17]:
query = 'SELECT * FROM global_terrorism_db_cleaned'

In [18]:
df_cleaned = pd.read_sql_query(query, engine)
df_cleaned

Unnamed: 0,eventid,iyear,imonth,iday,extended,country_txt,country,region_txt,region,city,...,individual,nperps,nperpcap,claimed,weaptype1_txt,weaptype1,nkill,property,ishostkid,INT_ANY
0,197001010002,1970,1,1,0,United States,217,North America,1,Cairo,...,0,-99.0,-99.0,0.0,Firearms,5,0.0,1,0.0,-9
1,197001020003,1970,1,2,0,United States,217,North America,1,Madison,...,0,1.0,1.0,1.0,Incendiary,8,0.0,1,0.0,0
2,197001030001,1970,1,3,0,United States,217,North America,1,Madison,...,0,1.0,1.0,0.0,Incendiary,8,0.0,1,0.0,0
3,197001090001,1970,1,9,0,United States,217,North America,1,Detroit,...,0,-99.0,-99.0,0.0,Incendiary,8,0.0,1,0.0,-9
4,197001120001,1970,1,12,0,United States,217,North America,1,New York City,...,0,-99.0,-99.0,0.0,Explosives,6,0.0,1,0.0,-9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80328,201712310019,2017,12,31,0,India,92,South Asia,6,Hungrum,...,0,-99.0,0.0,0.0,Firearms,5,0.0,0,1.0,0
80329,201712310020,2017,12,31,0,Afghanistan,4,South Asia,6,Maymana,...,0,-99.0,0.0,0.0,Explosives,6,0.0,1,0.0,-9
80330,201712310030,2017,12,31,0,Philippines,160,Southeast Asia,5,Kubentog,...,0,-99.0,0.0,0.0,Incendiary,8,0.0,1,0.0,0
80331,201712310031,2017,12,31,0,India,92,South Asia,6,Imphal,...,0,-99.0,0.0,0.0,Explosives,6,0.0,-9,0.0,-9
