## We need tools to work with our data
#### pandas is for handling tables, numpy is for numbers and matplotlib is for graphics or plotting

In [None]:
%pip install seaborn;
%pip install pandas-profiling;
%pip install pydantic-settings;

import pandas as pd  # noqa: F401
import numpy as np  # noqa: F401
import matplotlib.pyplot as plt  # noqa: F401
import seaborn as sns  # noqa: F401

: 

### Step 1: Load the data

In [None]:
# Read the CSV file into a table (called a DataFrame)
df = pd.read_csv(
    "../../1_datasets/data/01_category_war_events_data/gaza_war_events/casualties_daily/data.csv"
)

In [51]:
df.head()  # Display the first few rows

Unnamed: 0,report_date,report_source,report_period,ext_massacres_cum,killed,killed_cum,ext_killed,ext_killed_cum,ext_killed_children_cum,ext_killed_women_cum,...,ext_injured_cum,ext_civdef_killed_cum,med_killed_cum,ext_med_killed_cum,press_killed_cum,ext_press_killed_cum,killed_recovered,killed_succumbed,killed_truce_new,killed_committee
0,2023-10-07,mohtel,24,0,232.0,232.0,232,232,0,0,...,1610,0,6.0,6,1.0,1,,,,
1,2023-10-08,mohtel,24,0,138.0,370.0,138,370,78,41,...,1788,0,,6,1.0,1,,,,
2,2023-10-09,mohtel,24,8,190.0,560.0,190,560,91,61,...,2271,0,6.0,6,3.0,3,,,,
3,2023-10-10,mohtel,24,8,340.0,900.0,340,900,260,230,...,4000,0,,6,7.0,7,,,,
4,2023-10-11,gmotel,24,23,200.0,1100.0,200,1100,398,230,...,5184,0,10.0,10,,7,,,,


In [52]:
df.info()  # Display information about the DataFrame, including data types and non-null counts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631 entries, 0 to 630
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   report_date              631 non-null    object 
 1   report_source            631 non-null    object 
 2   report_period            631 non-null    int64  
 3   ext_massacres_cum        631 non-null    int64  
 4   killed                   604 non-null    float64
 5   killed_cum               624 non-null    float64
 6   ext_killed               631 non-null    int64  
 7   ext_killed_cum           631 non-null    int64  
 8   ext_killed_children_cum  631 non-null    int64  
 9   ext_killed_women_cum     631 non-null    int64  
 10  injured_cum              616 non-null    float64
 11  ext_injured              631 non-null    int64  
 12  ext_injured_cum          631 non-null    int64  
 13  ext_civdef_killed_cum    631 non-null    int64  
 14  med_killed_cum           1

### Step 2: Handle missing numbers

In [None]:
# define the columns first

number_columns = df.columns

# Identify columns with numeric data types
for column in number_columns:
    df[column] = df[column].fillna(0)  # Replace empty cells with 0

### Step 3: Handle missing text

In [55]:
# For 'report_source' column, use 'unknown' for empty cells
df["report_source"] = df["report_source"].fillna("unknown")

### Step 4: Fix the date column

In [56]:
# Convert 'report_date' to a proper date format
df["report_date"] = pd.to_datetime(df["report_date"], errors="coerce")

### Step 5: Remove duplicate rows

In [None]:
# check duplicated first
df.duplicated

<bound method DataFrame.duplicated of     report_date report_source  report_period  ext_massacres_cum  killed  \
0    2023-10-07        mohtel             24                  0   232.0   
1    2023-10-08        mohtel             24                  0   138.0   
2    2023-10-09        mohtel             24                  8   190.0   
3    2023-10-10        mohtel             24                  8   340.0   
4    2023-10-11        gmotel             24                 23   200.0   
..          ...           ...            ...                ...     ...   
626  2025-06-24        mohtel             24              12000    79.0   
627  2025-06-25        mohtel             24              12000    79.0   
628  2025-06-26        mohtel             24              12000   103.0   
629  2025-06-27        mohtel             24              12000    72.0   
630  2025-06-28        mohtel             24              12000    81.0   

     killed_cum  ext_killed  ext_killed_cum  ext_killed_child

In [59]:
# If any rows are exactly the same, keep only one
df = df.drop_duplicates()

### Step 6: Ensure numbers are stored as numbers

In [64]:
# Convert only numeric columns to whole numbers (integers)
for column in df.select_dtypes(include=["float64", "int64"]).columns:
    df[column] = df[column].fillna(0).astype(int)

In [65]:
# Convert report_period to a number
df["report_period"] = df["report_period"].astype(int)

### Step 7: Fix negative numbers

In [67]:
# Negative counts don't make sense, so change them to 0
for column in df.select_dtypes(include=["number"]).columns:
    df[column] = df[column].clip(lower=0)

### Step 8: Check cumulative columns

In [68]:
# Cumulative columns (like total deaths) should never decrease
cumulative_columns = [
    "ext_massacres_cum",
    "killed_cum",
    "ext_killed_cum",
    "ext_killed_children_cum",
    "ext_killed_women_cum",
    "injured_cum",
    "ext_injured_cum",
    "ext_civdef_killed_cum",
    "med_killed_cum",
    "ext_med_killed_cum",
    "press_killed_cum",
    "ext_press_killed_cum",
]
for column in cumulative_columns:
    for i in range(1, len(df)):
        # If current value is less than previous, use the previous value
        if df[column].iloc[i] < df[column].iloc[i - 1]:
            df.loc[df.index[i], column] = df[column].iloc[i - 1]

### Step 9: Check daily vs. cumulative numbers

In [69]:
# Cumulative counts should match the sum of daily counts
df["temp_killed_cum"] = df["killed"].cumsum()  # Calculate sum of daily 'killed'
df["killed_cum"] = df["temp_killed_cum"]  # Update cumulative column
df = df.drop(columns=["temp_killed_cum"])  # Remove temporary column

### Step 10: Fix report_period values

In [70]:
# Report period should be 0, 24, or 48 hours; set others to 24
df.loc[~df["report_period"].isin([0, 24, 48]), "report_period"] = 24

### Step 11: Clean up report_source

In [71]:
# Make text lowercase and remove extra spaces
df["report_source"] = df["report_source"].str.lower().str.strip()
# Standardize similar source names
df["report_source"] = df["report_source"].replace(
    {"gmotel": "mohtel", "missing": "unknown"}
)

### Step 12: Save the cleaned data

In [None]:
# Save the cleaned table to a new CSV file
df.to_csv(
    "../../1_datasets/data/clean_datasets/casualties_cleaned_data.csv", index=False
)