# Challenge 1
Elaboró: Leon Palomares

Grupo: 2

Matrícula: 325057406

In [None]:
import os, sys

# Environment set up before any Spark object is created:
#   - both PySpark interpreter variables point at the interpreter that is
#     running this notebook,
#   - PYARROW_IGNORE_TIMEZONE is set to "1",
#   - SPARK_LOCAL_IP is set to the loopback address.
os.environ.update(
    {
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
        "PYARROW_IGNORE_TIMEZONE": "1",
        "SPARK_LOCAL_IP": "127.0.0.1",
    }
)

Iniciamos la sesión de PySpark

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook.
# The executor-side settings mirror the variables exported in os.environ,
# and the SQL session time zone is pinned to UTC.
session_builder = SparkSession.builder.appName("Challenge_1")
for conf_key, conf_value in (
    ("spark.executorEnv.PYSPARK_PYTHON", sys.executable),
    ("spark.executorEnv.PYARROW_IGNORE_TIMEZONE", "1"),
    ("spark.sql.session.timeZone", "UTC"),
):
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.getOrCreate()

# Quick sanity check that the session is up.
print("Spark Version:", spark.version)

Importamos librerías

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql import functions as f
import pyspark.pandas as ps  # pandas-on-Spark (antes Koalas)

Cargamos la base de datos

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, entry) for entry in files):
        print(file_path)

In [None]:
# Read the CSV into a plain pandas DataFrame and report its dimensions.
df1 = pd.read_csv(
    '/kaggle/input/globalterrorismdb_0718dist.csv',
    sep=',',
)
# Tag the frame with the name of the file it was read from.
df1.dataframeName = 'globalterrorismdb_0718dist.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
# Load the CSV as a pandas-on-Spark DataFrame.
# The pandas cell reads the file from /kaggle/input, so prefer that location
# when it exists; otherwise fall back to the original relative path so a run
# with the CSV in the working directory keeps working.
gtd_kaggle_csv = '/kaggle/input/globalterrorismdb_0718dist.csv'
gtd_csv_path = (
    gtd_kaggle_csv
    if os.path.exists(gtd_kaggle_csv)
    else "globalterrorismdb_0718dist.csv"
)
kdf = ps.read_csv(gtd_csv_path)

# Column names, dtypes and non-null counts of the raw frame.
kdf.info()

Renombramos columnas. Rellenamos valores NA con 0 en categorías de interés.

In [None]:
# Mapping from the original dataset column names to the readable names used
# in the rest of the notebook.
colnames = {
    'iyear': 'Year',
    'imonth': 'Month',
    'iday': 'Day',
    'country_txt': 'Country',
    'region_txt': 'Region',
    'city': 'City',
    'attacktype1_txt': 'AttackType',
    'targtype1_txt': 'TargetType',
    'gname': 'Group',
    'weaptype1_txt': 'WeaponType',
    'nkill': 'Killed',
    'nwound': 'Wounded'
}

# Apply the renaming with pandas-on-Spark.
kdf_cleaned = kdf.rename(columns=colnames)

# Keep only the renamed columns to simplify the analysis.
kdf1 = kdf_cleaned[list(colnames.values())]

# Missing casualty counts become 0 *before* the integer cast: an integer
# column cannot hold missing values, so casting first would fail (or lose
# the information) on rows where the count is absent.
kdf1['Killed'] = kdf1['Killed'].fillna(0).astype(np.int32)
kdf1['Wounded'] = kdf1['Wounded'].fillna(0).astype(np.int32)

Visualizamos la descripción de los datos

In [None]:
# Print the schema summary first; info() writes to stdout.
kdf1.info()

# Only the last expression of a cell is rendered, so the preview goes last.
kdf1.head(5)

In [None]:
# Summary statistics of the selected columns, rendered as the cell output.
kdf1.describe()

In [None]:
# Missing values per column, as an absolute count and as a percentage of rows.
total_rows = len(kdf1)
null_counts = kdf1.isna().sum()
null_percentage = (null_counts / total_rows) * 100
nulls = ps.concat(
    [
        null_counts.rename("nulos"),
        null_percentage.rename("porcentaje"),
    ],
    axis=1,
)
# Show only the columns that contain at least one missing value.
nulls[nulls['nulos'] > 0]

In [None]:

# Basic handling of missing values: absent casualty counts are replaced by 0.
for casualty_col in ('Killed', 'Wounded'):
    kdf1[casualty_col] = kdf1[casualty_col].fillna(0)

Visualización de Datos

In [None]:
# Select matplotlib as the plotting backend for pandas-on-Spark.
ps.set_option("plotting.backend", "matplotlib")

# Aggregate on the pandas-on-Spark frame, then convert the per-year totals
# to pandas for plotting.
df_yearly = kdf1.groupby('Year')[['Killed', 'Wounded']].sum().reset_index()
df_yearly_pd = df_yearly.to_pandas()

# Two bars per year, shifted to the left and right of the year position.
fig, ax = plt.subplots(figsize=(14, 6))
for shift, column, colour, legend_label in (
    (-0.2, 'Killed', 'mediumspringgreen', 'Muertos'),
    (+0.2, 'Wounded', 'tomato', 'Heridos'),
):
    ax.bar(
        df_yearly_pd['Year'] + shift,
        df_yearly_pd[column],
        width=0.4,
        color=colour,
        label=legend_label,
    )

ax.set(
    xlabel='Año',
    ylabel='Número de personas',
    title='Muertos y Heridos por Año',
)
ax.legend()

# Rotate the year labels so they stay readable.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Bucket every incident into its five-year period (lustro), labelled by the
# first year of the period.
kdf1['Lustro'] = (kdf1['Year'] // 5) * 5
df_grouped = kdf1.groupby('Lustro')[['Killed', 'Wounded']].sum().reset_index()

# Convert the per-lustro totals to pandas for plotting.
df_grouped_pd = df_grouped.to_pandas()

# Plot two side-by-side bars per lustro. With a bar width of 2 and centres at
# +/-1 the bars touch without overlapping (the previous width of 3 with a
# +/-0.5 shift drew the second series on top of most of the first), and a
# one-year gap separates neighbouring lustros.
bar_width = 2
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(df_grouped_pd['Lustro'] - bar_width / 2, df_grouped_pd['Killed'],
       width=bar_width, color='mediumspringgreen', label='Muertos')
ax.bar(df_grouped_pd['Lustro'] + bar_width / 2, df_grouped_pd['Wounded'],
       width=bar_width, color='tomato', label='Heridos')
# One tick per lustro, placed between its two bars.
ax.set_xticks(df_grouped_pd['Lustro'])
ax.set_xlabel('Lustro')
ax.set_ylabel('Total de personas')
ax.set_title('Muertos y Heridos por Lustro')
ax.legend()
plt.show()

In [None]:
# Horizontal bar chart of the ten countries with the most recorded incidents.
ax = kdf1['Country'].value_counts().head(10).plot(kind='barh', figsize=(10, 6))
ax.set_xlabel("Número de Incidentes")
ax.set_ylabel("País")
ax.set_title("Top 10 Países con Mayor Número de Incidentes")
# End with plt.show(), like the earlier plotting cells, so the cell output is
# the figure and not the repr of the title's Text object.
plt.show()

In [None]:
# Look at the end of the incident counts per country, i.e. the countries
# with the fewest recorded incidents.
kdf1['Country'].value_counts().tail(15)

In [None]:
# Total deaths per attack type, largest first (at most ten types shown).
ax = (
    kdf1.groupby('AttackType')['Killed']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .plot(kind='bar', figsize=(12, 6))
)
ax.set_xlabel("Tipo de Ataque")
ax.set_ylabel("Número de Muertes")
ax.set_title("Número de Muertes por Tipo de Ataque")
# End with plt.show(), like the earlier plotting cells, so the cell output is
# the figure and not the repr of the title's Text object.
plt.show()

In [None]:
# Total deaths for every (country, attack type) pair.
df_grouped = kdf1.groupby(['Country', 'AttackType'])['Killed'].sum().reset_index()

# Rank countries by their overall death toll and keep the ten highest.
# Taking the ten largest (country, attack type) rows instead would not match
# the chart title: it can cover fewer than ten countries and drops most
# attack types of the countries it does include.
top_countries = (
    df_grouped.groupby('Country')['Killed']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .to_pandas()
    .index
    .tolist()
)

# Keep every attack type of those ten countries.
df_top10 = df_grouped[df_grouped['Country'].isin(top_countries)]

# Convert to pandas for plotting.
df_top10_pd = df_top10.to_pandas()

# Configure the seaborn style.
sns.set(style="whitegrid")

plt.figure(figsize=(14,7))
# `order` keeps the countries sorted from most to least affected on the x axis.
sns.barplot(data=df_top10_pd, x='Country', y='Killed', hue='AttackType',
            order=top_countries)
plt.title('Distribución de Muertes por Tipo de Ataque en los 10 Países más Afectados')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Detenemos la sesión de Spark

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()