In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
from scipy.stats import chi2_contingency

# Display configuration: never truncate the column list, and print wide
# frames as one block instead of wrapping them across several.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False

## Load the Data

In [None]:
# Read the raw CSV (path is relative to the notebook's working directory).
# The encoding is passed explicitly as ISO-8859-1 rather than relying on
# pandas' UTF-8 default.
dataset = pd.read_csv(
    'datasets/globalterrorismdb.csv',
    encoding='ISO-8859-1',
)

## Preliminary Analysis
Questions:
1. What is the size of my dataset and what are the variable data types?
2. What does my data look like?
3. Are there any missing values?

In [None]:
# Size of the loaded frame as a (rows, columns) tuple.
dataset.shape

In [None]:
# Preview the first rows (head() shows five by default).
dataset.head()

In [None]:
# Renaming the columns for readability
# Renaming the columns for readability: map the raw column codes to the
# names used throughout the rest of the notebook.
readable_names = {
    'iyear': 'Year',
    'imonth': 'Month',
    'iday': 'Day',
    'country_txt': 'Country',
    'region_txt': 'Region',
    'attacktype1_txt': 'Attack_type',
    'targtype1_txt': 'Target_type',
    'target1': 'Target',
    'weaptype1_txt': 'WeaponType',
    'nkill': 'Killed',
    'nwound': 'Wounded',
    'gname': 'Group',
}
dataset = dataset.rename(columns=readable_names)

In [None]:
# Columns too many; select desired columns and reorder them
# Too many columns: derive a combined casualty count, then keep only the
# columns the analysis uses, in the order they should be displayed.
# A NaN in either 'Killed' or 'Wounded' propagates into 'casualties'.
dataset['casualties'] = dataset['Killed'] + dataset['Wounded']

selected_columns = [
    'Year', 'Month', 'Day', 'Country', 'Region', 'Attack_type',
    'Target_type', 'WeaponType', 'motive', 'casualties', 'property',
    'Group', 'success',
]
# .copy() makes the subset an independent frame, so the in-place cleaning
# done later does not act on a slice of the wide frame (which can raise
# SettingWithCopyWarning).
dataset = dataset[selected_columns].copy()
dataset.head()

In [None]:
# Data type pandas assigned to each column.
dataset.dtypes

### Missing values

In [None]:
# Number of missing entries in each column.
dataset.isna().sum()

In [None]:
# Handling missing values.
# Both steps rebind `dataset` instead of mutating it in place, so the cell
# can be re-run safely and never modifies a slice of another frame.

# Drop the 'motive' column (too many missing values, subjective).
# errors='ignore' makes this a no-op if the column has already been removed.
dataset = dataset.drop(columns='motive', errors='ignore')

# Drop rows with missing values in the 'casualties' column.
dataset = dataset.dropna(subset=['casualties'])


## General Insights

Distribution of Attacks Over Time

In [None]:
# One bar per year: how many recorded attacks fall in each 'Year'.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=dataset, x='Year', ax=ax)
# Rotate the year labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_title('Number of Terrorist Attacks by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Attacks')
plt.show()

Distribution of Casualties Over Time

In [None]:
# Total casualties per year, drawn as a line over time.
casualties_by_year = dataset['casualties'].groupby(dataset['Year']).sum()

fig, ax = plt.subplots(figsize=(12, 6))
casualties_by_year.plot(ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Total Casualties')
ax.set_title('Trend of Total Casualties Over the Years')
plt.show()

Distribution of Attack Types

In [None]:
# Share of incidents per attack type, as percentages of all rows.
attack_type_counts = dataset['Attack_type'].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    attack_type_counts,
    labels=attack_type_counts.index,
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title('Distribution of Attack Types')
plt.show()

Distribution of Target Types

In [None]:
# Number of attacks per target type, most frequent first.
fig, ax = plt.subplots(figsize=(15, 6))
# Pass the frame via `data=` and the column via `x=`: seaborn >= 0.12 treats
# the first positional argument as `data`, so a positional Series is no
# longer interpreted as the x variable.
sns.countplot(
    data=dataset,
    x='Target_type',
    order=dataset['Target_type'].value_counts().index,
    ax=ax,
)
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_title('Attacks by Targets')
# Explicit axis labels, consistent with the other figures in this notebook.
ax.set_xlabel('Target Type')
ax.set_ylabel('Number of Attacks')
plt.show()

Most Active Terrorist Groups

In [None]:
# Ten groups with the most recorded incidents, excluding the 'Unknown'
# placeholder. errors='ignore' keeps this from raising KeyError when no
# 'Unknown' label is present (e.g. on a filtered subset).
top_terrorist_groups = (
    dataset['Group']
    .value_counts()
    .drop('Unknown', errors='ignore')
    .head(10)
)

fig, ax = plt.subplots(figsize=(12, 6))
top_terrorist_groups.plot(kind='bar', ax=ax)
ax.set_xlabel('Terrorist Group')
ax.set_ylabel('Number of Incidents')
ax.set_title('Top 10 Most Active Terrorist Groups')
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

## Geographical Analysis

Geographical Distribution of Terrorist Incidents

In [None]:
# Restrict the bars to the ten countries with the most incidents, ordered
# from most to fewest.
top_countries = dataset['Country'].value_counts().index[:10]

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=dataset, x='Country', order=top_countries, ax=ax)
ax.set_xlabel('Country')
ax.set_ylabel('Number of Incidents')
ax.set_title('Top 10 Countries with Most Terrorist Incidents')
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# value_counts() sorts by frequency, so idxmax() returns its first label:
# the most frequent country / region.
top_country = dataset['Country'].value_counts().idxmax()
top_region = dataset['Region'].value_counts().idxmax()
print('Country with Highest Terrorist Attacks:', top_country)
print('Region with Highest Terrorist Attacks:', top_region)

## Correlation Heatmap

In [None]:
# Pairwise correlation between the numeric columns only; text columns are
# excluded by the dtype filter.
numeric_columns = dataset.select_dtypes(include='number')
corr = numeric_columns.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

## Hypothesis Test

Significance test for Hypothesis 3: the success rate of terrorist attacks is influenced by the type of attack.

In [None]:
# Cross-tabulate attack type against the success flag, then test whether
# the two are independent.
contingency_table = pd.crosstab(dataset['Attack_type'], dataset['success'])

# Chi-Square test for independence; `dof` and `expected` are returned by
# SciPy but not used below.
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Significance level: reject the null hypothesis when p falls below it.
alpha = 0.05

result = (
    "The success rate of terrorist attacks is influenced by the type of attack (Reject Null Hypothesis)."
    if p < alpha
    else "There is no significant association between the success of attacks and the type of attack (Fail to Reject Null Hypothesis)."
)

print(f"Chi-Square Statistic: {chi2}")
print(f"P-value: {p}")

print(result)