In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from dataprep.eda import *
from dataprep.eda.missing import plot_missing
from dataprep.eda import plot_correlation

In [3]:
# Location of the raw Global Terrorism Database CSV export.
# A raw string keeps the backslash literal instead of relying on "\c" being an
# unrecognised escape sequence, which newer Python versions warn about.
DATA_PATH = r"D:\codes/ITI/libariies/final project/globalterrorismdb_0718dist.csv"

try:
    # low_memory=False: parse the file in one pass rather than in internal
    # chunks, so each column gets a single inferred dtype.
    data = pd.read_csv(DATA_PATH, encoding='latin1', low_memory=False)
except UnicodeDecodeError as e:
    print(f"Encoding error: {e}")
    # Re-raise so the notebook stops here; swallowing the error would leave
    # `data` undefined and make the next cell fail with an unrelated NameError.
    raise

In [4]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
0,197000000001,1970,7,2,,0,,58,Dominican Republic,2,...,,,,,PGIS,0,0,0,0,
1,197000000002,1970,0,0,,0,,130,Mexico,1,...,,,,,PGIS,0,1,1,1,
2,197001000001,1970,1,0,,0,,160,Philippines,5,...,,,,,PGIS,-9,-9,1,1,
3,197001000002,1970,1,0,,0,,78,Greece,8,...,,,,,PGIS,-9,-9,1,1,
4,197001000003,1970,1,0,,0,,101,Japan,4,...,,,,,PGIS,-9,-9,1,1,


In [5]:
# Report the dimensions of the freshly loaded frame.
rows, columns = data.shape
print(f"no of rows : {rows}", f"no of columns : {columns}", sep="\n")

no of rows : 181691
no of columns : 135


In [6]:
# Show every column name as a plain Python list.
print(list(data.columns))

['eventid', 'iyear', 'imonth', 'iday', 'approxdate', 'extended', 'resolution', 'country', 'country_txt', 'region', 'region_txt', 'provstate', 'city', 'latitude', 'longitude', 'specificity', 'vicinity', 'location', 'summary', 'crit1', 'crit2', 'crit3', 'doubtterr', 'alternative', 'alternative_txt', 'multiple', 'success', 'suicide', 'attacktype1', 'attacktype1_txt', 'attacktype2', 'attacktype2_txt', 'attacktype3', 'attacktype3_txt', 'targtype1', 'targtype1_txt', 'targsubtype1', 'targsubtype1_txt', 'corp1', 'target1', 'natlty1', 'natlty1_txt', 'targtype2', 'targtype2_txt', 'targsubtype2', 'targsubtype2_txt', 'corp2', 'target2', 'natlty2', 'natlty2_txt', 'targtype3', 'targtype3_txt', 'targsubtype3', 'targsubtype3_txt', 'corp3', 'target3', 'natlty3', 'natlty3_txt', 'gname', 'gsubname', 'gname2', 'gsubname2', 'gname3', 'gsubname3', 'motive', 'guncertain1', 'guncertain2', 'guncertain3', 'individual', 'nperps', 'nperpcap', 'claimed', 'claimmode', 'claimmode_txt', 'claim2', 'claimmode2', 'claim



### A brief overview of some important columns for EDA

    eventid : Unique identifier of the incident.
    iyear : Year of the incident.
    imonth: Month of the incident.
    iday  : Day of the incident.
    country_txt : Name of the country where the incident occurred.
    region_txt : Name of the region where the incident occurred.
    provstate : Province or state within the country.
    city : City or location where the incident occurred.
    attacktype1_txt : Type of attack (e.g., bombing, assassination).
    targtype1_txt :  Type of target (e.g., government, civilians).
    gname : Name of the perpetrator group.
    nkill : Number of reported kills.
    nwound : Number of reported wounded.
    summary : Brief summary of the incident.
    motive : Motivation behind the incident.
    weaptype1_txt : Type of weapon used.
    target1 : Specific person, building, or object that was targeted.
    success : Whether the attack succeeded (1 = yes, 0 = no).



In [7]:
# Columns kept for the EDA, listed in the order they appear in the trimmed frame.
eda_columns = [
    'eventid', 'iyear', 'imonth', 'iday',
    'country_txt', 'region_txt', 'provstate', 'city',
    'attacktype1_txt', 'nkill', 'nwound', 'target1',
    'summary', 'gname', 'targtype1_txt', 'weaptype1_txt',
    'motive', 'success',
]
final_data = data[eda_columns]

In [8]:
# Continue with the trimmed frame under the name `data`.
# `final_data` is already a DataFrame, so re-wrapping it in pd.DataFrame() adds
# nothing; an explicit copy gives the cleaning steps their own independent
# object and leaves `final_data` untouched.
data = final_data.copy()

In [9]:
# Dimensions after keeping only the EDA columns.
rows, columns = data.shape
print(f"no of rows : {rows}", f"no of columns : {columns}", sep="\n")

no of rows : 181691
no of columns : 18


### Data cleaning and missing values investigation

In [10]:
# Count missing values per column.
data.isna().sum()

eventid                 0
iyear                   0
imonth                  0
iday                    0
country_txt             0
region_txt              0
provstate             421
city                  435
attacktype1_txt         0
nkill               10313
nwound              16311
target1               638
summary             66129
gname                   0
targtype1_txt           0
weaptype1_txt           0
motive             131130
success                 0
dtype: int64

In [11]:
# Look at the first rows that have no province/state recorded.
missing_provstate = data['provstate'].isna()
data.loc[missing_provstate].head()

Unnamed: 0,eventid,iyear,imonth,iday,country_txt,region_txt,provstate,city,attacktype1_txt,nkill,nwound,target1,summary,gname,targtype1_txt,weaptype1_txt,motive,success
0,197000000001,1970,7,2,Dominican Republic,Central America & Caribbean,,Santo Domingo,Assassination,1.0,0.0,Julio Guzman,,MANO-D,Private Citizens & Property,Unknown,,1
162,197003310002,1970,3,31,Japan,East Asia,,Fukouka,Hijacking,0.0,0.0,Boeing 727,,Japanese Red Army (JRA),Airports & Aircraft,Explosives,,1
1229,197204040002,1972,4,4,Canada,North America,,Montreal,Bombing/Explosion,1.0,7.0,Cuban Trade Office,,Young Cuba,Government (General),Explosives,,1
1568,197209280011,1972,9,28,United States,North America,,Washington,Assassination,0.0,0.0,Embassy,09/28/1972: A letter bomb addressed to the Isr...,Black September,Government (Diplomatic),Explosives,,0
1617,197211080002,1972,11,8,Mexico,North America,,Monterrey,Hijacking,0.0,0.0,B-727,,Unknown,Airports & Aircraft,Firearms,,1


In [12]:
# Discard rows without a province/state and renumber the index from 0.
data = data.dropna(subset=['provstate']).reset_index(drop=True)

In [13]:
# Discard rows without a city and renumber the index from 0.
data = data.dropna(subset=['city']).reset_index(drop=True)

In [14]:
# Discard rows without a target description and renumber the index from 0.
data = data.dropna(subset=['target1']).reset_index(drop=True)

In [15]:
# Distinct fatality counts, in order of first appearance (NaN included).
data.loc[:, "nkill"].unique()

array([0.000e+00, 1.000e+00,       nan, 7.000e+00, 4.700e+01, 2.000e+00,
       3.600e+01, 5.000e+00, 3.000e+00, 4.000e+00, 2.500e+01, 1.500e+01,
       2.600e+01, 8.000e+00, 8.100e+01, 6.000e+00, 9.000e+00, 1.600e+01,
       3.000e+01, 3.100e+01, 1.200e+01, 2.100e+01, 1.400e+01, 8.800e+01,
       1.100e+01, 1.000e+01, 2.700e+01, 1.800e+01, 2.200e+01, 1.900e+01,
       9.200e+01, 1.300e+01, 7.300e+01, 4.200e+01, 4.300e+01, 1.700e+01,
       9.800e+01, 4.220e+02, 4.800e+01, 3.400e+01, 5.400e+01, 1.000e+02,
       5.000e+01, 3.500e+01, 2.000e+01, 4.100e+01, 3.700e+01, 2.800e+01,
       4.000e+01, 3.200e+01, 8.500e+01, 2.300e+01, 3.000e+02, 6.000e+01,
       2.400e+01, 5.800e+01, 8.700e+01, 4.500e+01, 3.800e+01, 2.900e+01,
       7.400e+01, 8.300e+01, 9.000e+01, 7.000e+01, 6.600e+01, 8.000e+01,
       6.700e+01, 5.100e+01, 3.900e+01, 1.240e+02, 7.600e+01, 3.300e+01,
       7.500e+01, 6.200e+01, 4.600e+01, 5.600e+01, 6.300e+01, 1.200e+02,
       1.020e+02, 7.800e+01, 7.900e+01, 5.200e+01, 

In [18]:
# Impute missing fatality counts with the column mean.
# Assign the result back rather than calling fillna(inplace=True) on the
# data["nkill"] selection: that chained in-place form emits a FutureWarning on
# recent pandas and does not modify `data` at all under Copy-on-Write.
data["nkill"] = data["nkill"].fillna(data["nkill"].mean())

In [19]:
# Impute missing wounded counts with the column mean.
# Assign the result back rather than calling fillna(inplace=True) on the
# data["nwound"] selection: that chained in-place form emits a FutureWarning on
# recent pandas and does not modify `data` at all under Copy-on-Write.
data["nwound"] = data["nwound"].fillna(data["nwound"].mean())

In [20]:
# Most frequent motive value; mode() returns a Series, so take its first entry.
data["motive"].mode().iloc[0]

'Unknown'

In [21]:
# Replace missing motives with the most frequent motive value.
# Assign the result back rather than calling fillna(inplace=True) on the
# data["motive"] selection: that chained in-place form emits a FutureWarning on
# recent pandas and does not modify `data` at all under Copy-on-Write.
data["motive"] = data["motive"].fillna(data["motive"].mode()[0])

In [22]:
# Replace missing summaries with the most frequent summary text.
# Assign the result back rather than calling fillna(inplace=True) on the
# data["summary"] selection: that chained in-place form emits a FutureWarning on
# recent pandas and does not modify `data` at all under Copy-on-Write.
# NOTE(review): `summary` is free text, so its mode is presumably the description
# of one particular incident copied into every empty row - confirm this is
# intended, or switch to a neutral placeholder.
data["summary"] = data["summary"].fillna(data["summary"].mode()[0])

In [23]:
# Re-count missing values per column after the cleaning steps.
data.isna().sum()

eventid            0
iyear              0
imonth             0
iday               0
country_txt        0
region_txt         0
provstate          0
city               0
attacktype1_txt    0
nkill              0
nwound             0
target1            0
summary            0
gname              0
targtype1_txt      0
weaptype1_txt      0
motive             0
success            0
dtype: int64

### Check the columns datatypes

In [24]:
# Print the index range, per-column non-null counts and dtypes of the cleaned frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180204 entries, 0 to 180203
Data columns (total 18 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   eventid          180204 non-null  int64  
 1   iyear            180204 non-null  int64  
 2   imonth           180204 non-null  int64  
 3   iday             180204 non-null  int64  
 4   country_txt      180204 non-null  object 
 5   region_txt       180204 non-null  object 
 6   provstate        180204 non-null  object 
 7   city             180204 non-null  object 
 8   attacktype1_txt  180204 non-null  object 
 9   nkill            180204 non-null  float64
 10  nwound           180204 non-null  float64
 11  target1          180204 non-null  object 
 12  summary          180204 non-null  object 
 13  gname            180204 non-null  object 
 14  targtype1_txt    180204 non-null  object 
 15  weaptype1_txt    180204 non-null  object 
 16  motive           180204 non-null  obje

In [25]:
# Cast the casualty counts (nkill and nwound) from float to int in one call.
# The float -> int cast drops the fractional part, so mean-imputed values are
# truncated rather than rounded.
data = data.astype({'nkill': int, 'nwound': int})

#### Check the data duplication

    - We observe that there are no duplicate rows.

In [26]:
# Show rows that exactly repeat an earlier row; an empty frame means none.
is_repeat = data.duplicated()
data.loc[is_repeat]

Unnamed: 0,eventid,iyear,imonth,iday,country_txt,region_txt,provstate,city,attacktype1_txt,nkill,nwound,target1,summary,gname,targtype1_txt,weaptype1_txt,motive,success


#### Store the cleaned dataset

In [27]:
# Write the cleaned frame to disk. The row index is written too (the default),
# so it appears as an extra leading unnamed column in the file.
data.to_csv('data.csv', index=True)