# (-) Data

Data downloaded from [The Committee to Protect Journalists](https://cpj.org/about/).

[Link](https://cpj.org/data/killed/) to the data.

# (-) Loading Tools & Data

In [187]:
# !pip install pycountry
# !pip install gender-detector
# !pip install unidecode
import pandas as pd
import numpy as np
import os
import re
import pycountry
import scipy.stats as stats
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from unidecode import unidecode
from gender_detector import gender_detector as gd

file_path = 'Journalists and Media Workers killed since 1992.csv'
data = pd.read_csv(file_path) #raw file

In [188]:
print('Required libraries imported.')
print('Data loaded.\n')
print("Tools: pandas, unidecode, gender_detector, scipy.stats, numpy, os, re,  plotly, seaborn, pycountry")


Required libraries imported.
Data loaded.

Tools: pandas, unidecode, gender_detector, scipy.stats, numpy, os, re,  plotly, seaborn, pycountry


In [189]:
data

Unnamed: 0,year,combinedStatus,fullName,sortName,primaryNationality,secondaryNationality,tertiaryNationality,gender,photoUrl,photoCredit,...,threatened,charges,motive,lengthOfSentence,healthProblems,impCountry,entry,sentenceDate,sentence,locationImprisoned
0,2006,Confirmed,Ahmad [full name unavailable],,,,,,,,...,,,,,,,,,,
1,2007,Confirmed,Khaled Mohammad Nofan,,,,,,,,...,,,,,,,,,,
2,2007,Confirmed,Sabah Salman,,,,,,,,...,,,,,,,,,,
3,2005,Confirmed,Unidentified,,,,,,,,...,,,,,,,,,,
4,2018,Confirmed,Abadullah Hananzai,,Afghanistan,,,Male,,,...,No,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2196,1993,Unconfirmed,Zivko Filipovic,,,,,,,,...,,,,,,,,,,
2197,2022,Confirmed,Zoreslav Zamoysky,,Ukraine,,,Male,,,...,,,,,,,,,,
2198,2007,Confirmed,Zubair Ahmed Mujahid,,Pakistan,,,Male,,,...,No,,,,,,,,,
2199,2014,Confirmed,Zubair Hatami,,Afghanistan,,,Male,,,...,,,,,,,,,,


One look at the data and we can see that there is so much missing data and potentially other issues to deal with before we can start working with the data.

Here is a list potential list of To-Do procedures that might be useful in the preparation of the data.

1. Data Cleaning:

    - Remove unnecessary columns that are not relevant to your analysis.
    - Check for duplicate records and remove them if necessary.
    - Identify and handle missing values by either imputing them or removing the corresponding rows/columns, depending on the context and impact on the analysis.
    - Handle inconsistent or erroneous data entries by performing data validation checks and correcting or removing the problematic entries.
    - Standardize and clean up text data, such as removing leading/trailing spaces, converting to consistent case (e.g., lowercase), and correcting misspellings.

2. Data Validation:

    - Check data types of each column and ensure they are assigned correctly. Convert data types if necessary.
    - Validate the range and boundaries of numeric data to identify outliers or extreme values that may need further investigation or handling.
    - Verify the consistency and correctness of categorical variables, such as checking for valid values and identifying potential discrepancies or inconsistencies.

3. Data Transformation:

    - Reshape the data if needed, such as pivoting, melting, or splitting columns to align with the desired analysis or modeling requirements.
    - Perform feature engineering, which involves creating new meaningful variables based on existing data, such as deriving age from a birthdate column or calculating ratios from existing numerical columns.

4. Identify other potential issues.

5. Apply appropriate fixes and improvements.

The example list is not exhaustive, but it is a good place to start.

# (+) Defining useful functions

## (a) Dataset X-Ray


In [190]:
def xray(df):
  print('Table shape:', df.shape[0], 'rows and', df.shape[1], 'columns\n')


  #number of duplicates
  xdups = df.duplicated().sum()

  # print('Number of duplicated rows:', dups)
  if xdups > 0:
    if xdups == 1:
      print(f'There is 1 duplicated row.\n')
    else:
      print(f'There are {xdups} duplicated rows.\n')
  else:
    print('There are no duplicated rows.\n')

  print(df.isnull().sum().sum(), 'missing values in total, or', round((df.isnull().sum().sum()/df.size)*100,2), '% of the data.')

  xray = px.imshow(df.isnull())
  xray.update_layout(coloraxis_showscale=False, coloraxis=dict(colorscale=[(0, 'black'), (1, 'ivory')]))
  xray.show()

## (b) Info 2

In [191]:
def info2(df,h=1000):
  import plotly.graph_objects as go

  header = ['column name', 'dtype', 'entries', 'missing', 'missing %','unique values', 'most freaquent', 'sample value']
  dtype = [str(x) for x in list(df.dtypes)]
  missing = list(df.isnull().sum())
  per = list((df.isnull().mean() * 100).round(2))
  count = list(df.count().values)
  unique = list(df.nunique())
  sample = list(df.sample(n=1, random_state=98).copy().iloc[0])
  mf = []
  for x in df.columns:
    try:
      y = df[x].value_counts().idxmax()
      mf.append(y)
    except:
      mf.append(np.nan)



  info_frame=pd.DataFrame(columns=header)

  info_frame['column name'] = df.columns
  info_frame['dtype'] = dtype
  info_frame['entries'] = count
  info_frame['missing'] = missing
  info_frame['missing %'] = per
  info_frame['unique values'] = unique
  info_frame['most freaquent'] = mf
  info_frame['sample value'] = sample
  # print('Table shape:', df.shape[0], 'rows and', df.shape[1], 'columns\n')

  # del header, missing, per, count, sample, #info_frame

  info_table = go.Figure(data=[go.Table(
    header=dict(values=list(info_frame.columns),
                line_color='ivory',
                fill_color='darkgrey',
                align='left',
                # font=dict(color='ivory')
                ),
    cells=dict(values= info_frame.values.T.tolist(), # list of lists where each list is a column
               line_color='#eeeedd',
               fill_color='ivory',
               align='left',
              #  font=dict(color='#4d1919')
               ))])

  info_table.update_layout(title='Information about the dataset', height=h)
  info_table.show()

  return info_frame

## (c) Check strings

In [192]:
#Check strings special characters
def check_strings(df):
    special_chars = r'[^a-zA-Z,-]'
    result = {}

    for column in df.select_dtypes(include=['object', 'category']):
        invalid_values = df[column].astype(str).apply(lambda x: re.findall(special_chars, x)).explode().unique().tolist()
        if invalid_values:
            result[column] = invalid_values

    return result

## (d) Splitting string

In [193]:
#splitting string into list of words and selecting the one starting with capital letter.
def extract(text):
  # print(text)
  temp = []
  text = text.split() # = ['an', 'area', 'outside', 'Kirkuk']

  for x in text:
    if x.istitle():
      temp.append(x)

    text = ' '.join(temp)

  return text

## (e) Name to gender

In [194]:
#defining function for detecting gender from name
def detect(name):
  try:
    name = name.split(' ')[0]
    return gd.GenderDetector().guess(name)
  except:
    return 'oops!'

---
# (+) Basic Exploratory Data Analysis
---

---
---
## (1) Data Assessment
---
---

### Quick Info

In [195]:
xray(data)
info_frame = info2(data, 1150)

Table shape: 2201 rows and 43 columns

There is 1 duplicated row.

53014 missing values in total, or 56.01 % of the data.


We need less columns. The first dropped columns will be those with missing values >  60%.

In [196]:
columns = list(data.columns)
unwanted = info_frame[info_frame['missing %'] > 60]['column name'].unique().tolist()
wanted = [x for x in columns if x not in unwanted]

print('unwanted:', unwanted)
print('wanted:', wanted)

data = data[wanted]

del wanted, unwanted

unwanted: ['sortName', 'secondaryNationality', 'tertiaryNationality', 'photoUrl', 'photoCredit', 'lastStatus', 'region', 'state', 'province', 'accountabilityCrossfire', 'accountabilityAssignment', 'charges', 'motive', 'lengthOfSentence', 'healthProblems', 'impCountry', 'entry', 'sentenceDate', 'sentence', 'locationImprisoned']
wanted: ['year', 'combinedStatus', 'fullName', 'primaryNationality', 'gender', 'type', 'typeOfDeath', 'status', 'employedAs', 'organizations', 'jobs', 'coverage', 'mediums', 'country', 'location', 'locality', 'localOrForeign', 'sourcesOfFire', 'motiveConfirmed', 'impunityMurder', 'tortured', 'captive', 'threatened']


And we will be left with the following columns:

1. **year:** The year in which the incident occurred. (Example: 2010)
1. **combinedStatus:** The combined status of the incident, indicating if it has been confirmed or not. (Example: Confirmed)
1. **fullName:** The full name of the journalist or media worker. (Example: Aleh Byabenin)
1. **primaryNationality:** The primary nationality of the individual. (Example: Belarus)
1. **gender:** The gender of the individual. (Example: Male)
1. **type:** The type of the individual, specifying if they were a journalist or another type of media worker. (Example: Journalist)
1. **typeOfDeath:** The type of death the individual suffered. (Example: Murder)
1. **status:** The status of the individual, indicating if they were killed. (Example: Killed)
1. **employedAs:** The role or position held by the individual. (Example: Staff)
1. **organizations:** The organizations the individual was associated with. (Example: Charter 97)
1. **jobs:** The specific job roles or responsibilities of the individual. (Example: Editor, Publisher/Owner)
1. **coverage:** The areas or topics covered by the individual. (Example: Corruption, Politics)
1. **mediums:** The mediums or platforms used by the individual for their work. (Example: Internet)
1. **country:** The country where the incident occurred. (Example: Belarus)
1. **location:** The general location of the incident. (Example: An area outside Minsk)
1. **locality:** The specific locality or region of the incident. (Example: An area outside Minsk)
1. **localOrForeign:** Indicates if the individual was a local or foreign journalist/media worker. (Example: Local)
1. **sourcesOfFire:** The sources of fire or the entities responsible for the attack. (Example: Millitary Officials)
1. **motiveConfirmed:** Indicates if the motive behind the incident has been confirmed. (Example: Confirmed)
1. **impunityMurder:** The level of impunity associated with the murder. (Example: Complete Impunity)
1. **tortured:** Indicates if the individual was tortured. (Example: No)
1. **captive:** Indicates if the individual was held captive. (Example: No)
1. **threatened:** Indicates if the individual was threatened. (Example: Yes)

In [197]:
xray(data)
info_frame = info2(data,750)

Table shape: 2201 rows and 23 columns

There is 1 duplicated row.

10733 missing values in total, or 21.2 % of the data.


### Duplicates

1 duplicated row have to be removed.

---
### Missing data - *`part 1`*
---






In [198]:
info_frame[['column name','dtype','entries', 'missing', 'missing %']]

Unnamed: 0,column name,dtype,entries,missing,missing %
0,year,int64,2201,0,0.0
1,combinedStatus,object,2201,0,0.0
2,fullName,object,2201,0,0.0
3,primaryNationality,object,1487,714,32.44
4,gender,object,1620,581,26.4
5,type,object,2201,0,0.0
6,typeOfDeath,object,1476,725,32.94
7,status,object,2201,0,0.0
8,employedAs,object,2192,9,0.41
9,organizations,object,2195,6,0.27


The dataset has missing data in different columns, ranging from less than **1%** to around **57%**. Each row represents a unique case of a killed journalist or media worker, it's important to keep all records to preserve valuable information about these events. Each event (even with incomplete details) is a outweighting the need for complete details. Removing rows or columns with missing data would result in a significant loss of information, which is not suitable in this context. It's recommended to keep the dataset as it is and analyze the available data.

Although there are missing values, we can address this issue by using available information and logical reasoning to restore some values. For example, we can assume that individuals killed in crossfire were not captive, tortured, or threatened.

Exploratory data analysis, visualization, and descriptive statistics can help us understand the dataset, identify patterns or trends, and potentially restore more missing values.

Before proceeding with restoration, it's a good idea to fix other potential issues like misspellings or incorrect data types.

### Data types

In [199]:
print(data.dtypes.value_counts(),'\n')
info_frame[['column name','dtype','unique values']].sort_values(by='unique values', ascending=False)

object    22
int64      1
dtype: int64 



Unnamed: 0,column name,dtype,unique values
2,fullName,object,2198
9,organizations,object,1505
14,location,object,1169
15,locality,object,1159
11,coverage,object,125
13,country,object,116
3,primaryNationality,object,109
10,jobs,object,100
17,sourcesOfFire,object,41
0,year,int64,32


Data types could be left as they are. However, all columns `object` dtype with less than 5 uniuque values could be converted to `category`. Except for `captive`, `tortured`, `threatened`, and `motiveConfirmed`, which will be converted to `boolean` dtypes.

### Values

#### unique values

In [200]:
#get years
print(f"Number of unique values in 'year': {len(data['year'].unique().tolist())}")
print(sorted(data['year'].unique().tolist()))

#get combinedStatus
print(f"\nNumber of unique values in 'year': {len(data['combinedStatus'].unique().tolist())}")
print(data['combinedStatus'].unique().tolist())

#get primaryNationality
print(f"\nNumber of unique values in 'primaryNationality': {len(data['primaryNationality'].unique().tolist())}")
print(data['primaryNationality'].unique().tolist())

#get gender
print(f"\nNumber of unique values in 'gender': {len(data['gender'].unique().tolist())}")
print(data['gender'].unique().tolist())

#get type
print(f"\nNumber of unique values in 'type': {len(data['type'].unique().tolist())}")
print(data['type'].unique().tolist())

#get typeOfDeath
print(f"\nNumber of unique values in 'typeOfDeath': {len(data['typeOfDeath'].unique().tolist())}")
print(data['typeOfDeath'].unique().tolist())

#get employedAs
print(f"\nNumber of unique values in 'employedAs': {len(data['employedAs'].unique().tolist())}")
print(data['employedAs'].unique().tolist())

#get organizations
organizations = data['organizations'].str.split(',', expand=True).stack().str.strip().unique()
print(f"\nNumber of unique values in 'organizations': {len(organizations)}")
print(organizations)

#get mediums
mediums = data['mediums'].str.split(',', expand=True).stack().str.strip().unique()
print("\nNumber of unique values in 'mediums':", len(mediums))
print(mediums)

#get coverage
coverage = data['coverage'].str.split(',', expand=True).stack().str.strip().unique()
print("\nNumber of unique values in 'coverage':", len(coverage))
print(coverage)

#get jobs
jobs = data['jobs'].str.split(',', expand=True).stack().str.strip().unique()
print("\nNumber of unique values in 'jobs':", len(jobs))
print(jobs)

#get country
print(f"\nNumber of unique values in 'country': {len(data['country'].unique().tolist())}")
print(data['country'].unique().tolist())

#get location
print(f"\nNumber of unique values in 'location': {len(data['location'].unique().tolist())}")
print(data['location'].unique().tolist())

#get locality
print(f"\nNumber of unique values in 'locality': {len(data['locality'].unique().tolist())}")
print(data['locality'].unique().tolist())

#get localOrForeign
print(f"\nNumber of unique values in 'localOrForeign': {len(data['localOrForeign'].unique().tolist())}")
print(data['localOrForeign'].unique().tolist())

#get sourcesOfFire
sources = data['sourcesOfFire'].str.split(',', expand=True).stack().str.strip().unique()
print(f"\nNumber of unique values in 'sourcesOfFire': {len(sources)}")
print(sources)

#get impunityMurder
print(f"\nNumber of unique values in 'impunityMurder': {len(data['impunityMurder'].unique().tolist())}")
print(data['impunityMurder'].unique().tolist())

del organizations, mediums, jobs, sources, coverage


Number of unique values in 'year': 32
[1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

Number of unique values in 'year': 3
['Confirmed', 'Unconfirmed', 'Killed']

Number of unique values in 'primaryNationality': 110
[nan, 'Afghanistan', 'Ethiopia', 'Syria', 'Algeria', 'Iraq', 'Libya', 'Somalia', 'Pakistan', 'Bangladesh', 'South Africa', 'Sierra Leone', 'Yemen', 'Russia', 'India', 'South Sudan', 'USA', 'Azerbaijan', 'Peru', 'Democratic Republic of the Congo', 'Mexico', 'East Timor', 'Egypt', 'Israel and the Occupied Palestinian Territory', 'Lebanon', 'Ghana', 'Bahrain', 'Maldives', 'Turkey', 'Sri Lanka', 'Angola', 'Belarus', 'Philippines', 'Georgia', 'Nigeria', 'Paraguay', 'Burundi', 'Colombia', 'El Salvador', 'Indonesia', 'Kyrgyzstan', 'Rwanda', 'Madagascar', 'Nicaragua', 'Ivory Coast', 'Brazil', 'France', 'Thailand', 'Myanmar', 'Serbia', 'Ne

#### special characters

In [201]:
column_content = check_strings(data)

for key, value in column_content.items():
    # if len(value) > 1:
    if isinstance(value, list) and len(value) > 1 and not all(pd.isnull(x) for x in value):
        print(f"{key}: {value}")

fullName: [' ', '[', ']', '’', 'ó', 'á', 'í', "'", '(', ')', 'ğ', 'é', 'Á', 'ú', 'ñ', '"', nan, '.', 'ı', 'ç', 'ã', 'É', 'ü', 'ö', 'Í', 'ł', 'ę', 'è', 'ë', 'â', 'ô', 'Ł', 'č', '“', '”', 'ş', 'ï']
primaryNationality: [nan, ' ']
type: [' ', nan]
typeOfDeath: [nan, ' ']
organizations: [nan, ' ', '/', 'é', "'", '(', ')', ';', 'ó', '7', '3', '2', '5', 'ü', 'á', '9', '’', '1', '"', 'Ó', 'ã', '0', '.', 'í', 'ş', '4', 'ï', '6', 'Ú', 'ñ', 'ú', 'Í', 'ń', 'É', 'Á', 'ł', '–', 'Ö', 'ç', '8', 'è', 'Ñ', '#']
jobs: [nan, ' ', '/']
coverage: [nan, ' ']
mediums: [nan, ' ']
country: [nan, ' ']
location: [nan, ' ', "'", '’', 'á', 'ó', 'é', 'ï', '.', 'ã', 'í', 'â', 'ı', 'ç', 'ú', 'ñ', 'İ', '(', ')', 'č', 'ń', 'ô', 'ü', 'ł', '/']
locality: [nan, ' ', "'", '’', 'á', 'ó', 'ï', '.', 'é', 'ã', 'í', 'â', 'ı', 'ç', 'ú', 'ñ', 'İ', '(', ')', 'č', 'ń', 'ô', 'ü', 'ł', '/']
sourcesOfFire: [nan, ' ']
impunityMurder: [nan, ' ']


After analyzing the values in each column, we now see that most columns contain various symbols and non-alphabetic characters such as brackets, accents, quotes, and special characters, empty string, and/or extra spaces.

**Recommended action**: Clean and standardize the data in the columns to ensure consistency and reliability in further analysis. This will involve removing special characters, handling missing values, and properly formatting the data where necessary.

### Testing country names

In [202]:
# Get unique country names from the column
unique_countries = data['country'].unique()

# Initialize a list to store unrecognized country names
unrecognized_countries = []

# Validate each unique country name
for country_name in unique_countries:
    try:
        pycountry.countries.search_fuzzy(country_name)
    except LookupError:
        unrecognized_countries.append(country_name)

# Print the list of unrecognized country names

print('The following country names were not recognized:\n')
print(unrecognized_countries)


The following country names were not recognized:

['Ivory Coast', 'Democratic Republic of the Congo', 'East Timor', 'Israel and the Occupied Palestinian Territory', 'Yugoslavia', 'Republic of Congo']


Yugoslavia does not exist anymore. The rest of the counties are valid but name correction is required for 5 country names.

### Quick Insights

1.   **How big is the data? How mant rows? How many columns?**
  -  *The **dataset** has 2201 rows and 43 columns.*

1.   **What are the data types of the columns?**
  - *Most data types are object. While this is ok, it might be beneficial to convert the data type of some columns to category.*

1.   **Are there any duplicates?**
  - *There is 1 duplicated row.*

1.   **Are there any missing values?**
  - *~50% of the data is missing. Columns with 60%+ missing data were removed after which the missing values dropped down to ~20%*

1.   **Do we need all the columns?**
  - *For the purpose of this project, we do not need all columns. The list of columns is already shortened to 23 and even less columns might be needed. The project aims to uncover general insights around journalist deaths worldwide like which are the most dangerous places to be journalist, deadliest places on the planet, what is the maximum number of killed journalist for a year, etc.*

1. **What additional procedures are required?**

  - removing accents, diacritics, extra empty spaces
  - removing duplicates
  - standardization for some columns
  - removing columns
  - removing duplicates
  - transforming some columns
  - creating additional columns
  - fixing country names
  - ...

**jobs** column contains '`editor`' and '`Editor`' and '`Broadcast Reporter`' and '`Broadcast reporter`'.

Honestly, data is not that big so there is no need for writing extra code. All values seem to be following specific standard no further interventions are required.

What seems to be left of the preparation:

- '`location'` and `'locality`' have whole phrases for places so this will be either fixed or column dropped out of the game.

- the values of '`coverage`', '`medium`', and '`jobs`' will be unpacked into columns with **True** or **False** value.

- a desperate attempt to restore some missing values



## (2) Cleaning & Transformation

### Duplicates

In [203]:
# remove duplicates
data = data.drop_duplicates()
print('Duplicates removed.')

Duplicates removed.


### Data types

In [204]:
#gender, type, combinedStatus, typeOfDeath, employedAs, localOrForeign, impunityMurder to category
data['gender'] = data['gender'].astype('category')
data['type'] = data['type'].astype('category')
data['combinedStatus'] = data['combinedStatus'].astype('category')
data['typeOfDeath'] = data['typeOfDeath'].astype('category')
data['employedAs'] = data['employedAs'].astype('category')
data['localOrForeign'] = data['localOrForeign'].astype('category')
data['impunityMurder'] = data['impunityMurder'].astype('category')


#motiveConfirmed, tortured, captive, threatened to boolean
data['captive'] = data['captive'].map({'Yes': True, 'No': False}).fillna(np.nan).astype('boolean')
data['tortured'] = data['tortured'].map({'Yes': True, 'No': False}).fillna(np.nan).astype('boolean')
data['threatened'] = data['threatened'].map({'Yes': True, 'No': False}).fillna(np.nan).astype('boolean')
data['motiveConfirmed'] = data['motiveConfirmed'].map({'Confirmed': True, 'Unconfirmed': False}).fillna(np.nan).astype('boolean')

info_frame = info2(data, 750)
# info_frame[info_frame['unique values'] < 6]

### Stripping whitespace, removing accents and diacritics

In [205]:
#unidecode for fullName, country, location
data['fullName'] = data['fullName'].apply(unidecode)
data['country'] = data['country'].apply(unidecode)
# data['location'] = data['location'].apply(unidecode)

columns = list(data.select_dtypes(include=['object', 'category']).columns)

for c in list(data.select_dtypes(include=['object', 'category']).columns):
    try:
      data[c] = data[c].str.strip()
    except:
      print('Kyp with', c)

print('Extra spaces, accents, and diacritics removed.')

Extra spaces, accents, and diacritics removed.


### Fixing country names

In [206]:
#Ivory Coast to Republic of Côte d'Ivoire
data.loc[data['country'] == 'Ivory Coast', 'country'] = "Republic of Côte d'Ivoire"

#Democratic Republic of the Congo to Congo, The Democratic Republic of the
data.loc[data['country'] == 'Democratic Republic of the Congo', 'country'] = "Congo, The Democratic Republic of the"

#East Timor to Democratic Republic of Timor-Leste
data.loc[data['country'] == 'East Timor', 'country'] = "Democratic Republic of Timor-Leste"

#Israel and the Occupied Palestinian Territory to State of Israel
data.loc[data['country'] == 'Israel and the Occupied Palestinian Territory', 'country'] = "State of Israel"

#Republic of Congo to Republic of the Congo
data.loc[data['country'] == 'Republic of Congo', 'country'] = "Republic of the Congo"

#removing accents and diacritics
data['country'] = data['country'].apply(unidecode)

print('Country names fixed.')

Country names fixed.


### Removing extra words from values in the '`location`' column

In [207]:
#fixing location names with extra words
print('Number of unique locations:', data['location'].astype('string').nunique())

data['location'] = data['location'].astype(str).apply(extract)
print()

print('Number of unique locations:', data['location'].astype('string').nunique())

Number of unique locations: 1155

Number of unique locations: 1101


### Missing data - *`part 2`*

#### Logical Reasoning Imputation

In this dataset, we have missing data in columns related to whether journalists were captives, tortured, or threatened when killed in crossfire incidents. To address these missing values, we can make assumptions based on available data. For example, we assume that journalists in crossfire are less likely to experience captivity, torture, or threats.

However, it's important to acknowledge the limitations of this logic-based imputation approach. The missing values indicate a lack of specific information or reporting for these cases. It's important to note that each situation is unique, and there can be exceptions or specific circumstances where journalists in crossfire faced threats, captivity, or torture before the event of deat. We cannot draw definitive conclusions about the chances of journalists in crossfire being captives, tortured, or threatened.

To gain more accurate insights, additional data explicitly documenting these circumstances would be valuable. Unfortunately, such data is not available. Therefore, we can only attempt to restore some missing values based on our assumptions.

It's crucial to approach descisions with caution, considering the data limitations and potential biases introduced by the missing values. For this, we will only use this approach to restore missing values for the columns '`captive`' and '`tortured`' and then try another way of restoration.

In [208]:
#selecting rows where type of Death is Crossfire assigning False for columns captive, tortured, and threatened
data.loc[data['typeOfDeath'] == 'Crossfire', ['captive', 'tortured']] = False

info_frame=info2(data[['captive','tortured']], 300)

And just like that, the number of missing values dropped to xx%  from 57%. More similar type of logical overlaps could be applied for restoring even more missing data but complete data restoration is way out of the project's scope so let's try just one more thing.

#### Sampling & testing

In [209]:
#selecting rows where gender is not missing for test_pool
test_pool = data[data['gender'].isnull() == False][['fullName','gender']]
test_pool.shape#testing random samples

def sample_tester(pool, count=10, size=10):
  c=count
  scores = []

  while c > 0:
    #random sample of xx
    test = pool.sample(n=size)
    test['test'] = 0
    # test['fullName'] = test['fullName'].apply(unidecode)
    test['test'] = test['fullName'].apply(detect)

    #calculating rate
    result = test['gender'].str.lower() == test['test']
    rate = result.mean()*100
    scores.append(rate)
    c -= 1  #-1 samples left to take

  #plot scores distribution
  q = np.percentile(scores, [25, 50, 75])
  s = px.histogram(scores, nbins=10,marginal="violin", template="plotly_white", opacity=0.6)
  s.update_layout(title=f"Testing {count} samples of {size} names - {pd.Series(scores).mean()}% average success rate", height=400, width= 800,showlegend=False)
  s.add_vline(x=q[0], line_dash="dot", line_color="silver")
  s.add_vline(x=q[1], line_dash="dot", line_color="gray")
  s.add_vline(x=q[2], line_dash="dot", line_color="grey")

  s.show()
  return scores

In [210]:
tests=20
count=tests
all_scores = []
while tests > 0:
  test = sample_tester(test_pool,10 ,100)
  all_scores.extend(test)
  tests -= 1

q = np.percentile(all_scores, [25, 50, 75])

s = px.histogram(all_scores, nbins=10,marginal="violin", template="plotly_white", opacity=0.6)
s.update_layout(title=f"Average success rate from {count} tests: {np.mean(all_scores)}%", height=400, width= 800,showlegend=False)
s.add_vline(x=q[0], line_dash="dot", line_color="silver")
s.add_vline(x=q[1], line_dash="dot", line_color="gray")
s.add_vline(x=q[2], line_dash="dot", line_color="grey")
s.show()

The tests show that an attempt to restore missing values in '`gender`' column will result in ~60-90% accuracity.

In [211]:
# Define the success rates (example data)
scores = all_scores.copy()

# Define the expected success rate and its range
expected_success_rate = 0.65
confidence_level = 0.95
margin_of_error = 0.02
lower_bound = expected_success_rate - margin_of_error
upper_bound = expected_success_rate + margin_of_error

# Perform one-sample t-test
t_stat, p_value = stats.ttest_1samp(scores, expected_success_rate, alternative='less')

# Calculate the confidence interval
mean = np.mean(scores)
std_error = np.std(scores, ddof=1) / np.sqrt(len(scores))
confidence_interval = stats.t.interval(confidence_level, len(scores)-1, loc=mean, scale=std_error)

# Print the results
print("Observed Mean: {:.4f}".format(mean))
print("Expected Success Rate: ~{:.4f}".format(expected_success_rate))
print("Confidence Interval ({}%): {:.4f} - {:.4f}".format(confidence_level * 100, confidence_interval[0], confidence_interval[1]))
print("H0: The success rate is approximately {:.2%} (+-2%)".format(expected_success_rate))
print("H1: The success rate is less than {:.2%}".format(expected_success_rate))

# Check if the null hypothesis can be rejected
if p_value < 0.05:
    print("\nReject the null hypothesis. The success rate is significantly less than {:.2%}.".format(expected_success_rate))
else:
    print("Fail to reject the null hypothesis. The success rate is not significantly less than {:.2%}.".format(expected_success_rate))

# Perform Shapiro-Wilk test for normality
statistic, p_value = stats.shapiro(all_scores)

# Check the p-value
alpha = 0.05
print('\nShapiro-Wilk test for normality:')
if p_value > alpha:
    print("Data appears to be normally distributed (fail to reject H0)\n")
else:
    print("Data does not appear to be normally distributed (reject H0)\n")

# Convert the scores to a numpy array
scores_arr = np.array(all_scores)
expected_rate = 0.65

# Calculate the deviations from the expected success rate
deviations = scores_arr - expected_rate

# Perform one-sample Wilcoxon signed-rank test
statistic, p_value = stats.wilcoxon(deviations)

# Check the p-value
alpha = 0.05
print('Wilcoxon signed-rank test:')
if p_value < alpha:
    print("Reject H0: The success rate is significantly different from", expected_rate)
else:
    print("Fail to reject H0: The success rate is not significantly different from", expected_rate)


Observed Mean: 64.1350
Expected Success Rate: ~0.6500
Confidence Interval (95.0%): 63.4929 - 64.7771
H0: The success rate is approximately 65.00% (+-2%)
H1: The success rate is less than 65.00%
Fail to reject the null hypothesis. The success rate is not significantly less than 65.00%.

Shapiro-Wilk test for normality:
Data appears to be normally distributed (fail to reject H0)

Wilcoxon signed-rank test:
Reject H0: The success rate is significantly different from 0.65


Based on the results of the tests, here are the conclusions:

**T-test**:

The observed mean success rate is 64.61%, which is close to the expected success rate of approximately 65.00%.
The 95% confidence interval for the success rate is 63.9360% to 65.2840%.
H0: The success rate is approximately 65.00% (+-2%).
H1: The success rate is less than 65.00%.
Fail to reject the null hypothesis: The success rate is not significantly less than 65.00%. This means that there is no strong evidence to suggest that the success rate is lower than 65.00%.
Normality Test:

**The Shapiro-Wilk test for normality**:

The test suggests that the data does not follow a normal distribution.
This indicates that the assumption of normality for the t-test may not be met.

**Wilcoxon Signed-Rank Test**:

The Wilcoxon signed-rank test was performed to compare the observed success rates to the expected success rate of 0.65.
The test result indicates that the null hypothesis can be rejected.
Rejecting the null hypothesis suggests that the observed success rates are significantly different from the expected success rate of 0.65.

Eventually, we could conclude that approximately 65% of the missing gender values can be correctly restored, but the exact percentage is more likely to be between 63.9360% and 65.2840%.

The is no strong support for a success rate significantly lower than 65%, and the Wilcoxon signed-rank test indicates that the observed success rates are significantly different from a success rate of 0.65.

Accuracy of restoration of missing values of 65% also means that ~35% of the genders might be restored with incorrect values or not at all. At the end, we would end up with 175 incorrectly assigned gender values, or 9% of all values.

9-10% incorrect genders values vs 26% missing data

This is not planned but 500 names are not that much for manual verification and testing hypothesis and results. This so out of this project's scope but I got curious on the fly. So, let's do that!

#### Restoring

In [212]:
to_restore = data[['fullName','gender']]
to_restore = to_restore[to_restore['gender'].isnull() == True]

to_restore['gender'] = to_restore['fullName'].apply(detect)
to_restore['gender'].value_counts()
to_restore['gender'] = to_restore['gender'].apply(str.title)


Based on the test results followed by manual verification, we can infer the following breakdown:

**Female Gender**:
- Correctly identified: 27 out of 31 (**87%** accuracy)
- Incorrectly identified: 4 out of 31

**Male Gender**:
- Correctly identified: 343 out of 343 (**100%** accuracy)

**Unable to Identify Gender**:
- Total: 206 names
- Female: 11 names
- Male: 195 names

The model achieved an accuracy rate of **63.79%**, which falls within the expected confidence interval of **63.9360%** to **65.2840%**.

In [213]:
#male in female
names_m = 'Bobi Tsankov, Widad Hussein, Yensi Roberto Ordonez Galdamez, Nurul Islam Faruqi'.split(', ')
names_i = [389,2120,2150,1566]
to_restore.loc[to_restore['fullName'].isin(names_m), 'gender'] = 'Male'

#female in unknown
#	Chaitali Santra, 	Bolade Fasasi, 	Ding Sade,	Farah Hassan Sahal, 	Indra Mohan Hakasam, 	Isabel Chumpitaz Panta
# 	Issa Ngumba, 	Kishvaroy Sharifova, 	Palwasha Tokhi, 	Nerlita Ledesma, 	Shefki Popova
names_f = 'Chaitali Santra, Bolade Fasasi, Ding Sade, Farah Hassan Sahal, Indra Mohan Hakasam, Issa Ngumba, Kishvaroy Sharifova, Palwasha Tokhi, Nerlita Ledesma, Shefki Popova'.split(', ')
names_i = [433,391,512,607,863,867,881,1114,1620,1530,1918]
to_restore.loc[to_restore['fullName'].isin(names_f), 'gender'] = 'Female'

to_restore.loc[to_restore['gender']=='unknown','gender'] = 'Male'
# print(to_restore['gender'].value_counts())
# print('\nAll missing gender values have been restored.')


# Merge data and to_restore on the "fullName" column
merged = data.merge(to_restore, on='fullName', how='left')

# Update the "gender" column in data with values from to_restore
data['gender'] = merged['gender_y'].fillna(data['gender'])

del merged, to_restore, names_m, names_f, names_i


In [214]:
print('Missing values restored!\n')
print(data['gender'].value_counts(),'\n')

print(data.info())

Missing values restored!

Male       1449
Unknown     206
Female      132
Name: gender, dtype: int64 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2200 entries, 0 to 2200
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   year                2200 non-null   int64  
 1   combinedStatus      2200 non-null   object 
 2   fullName            2200 non-null   object 
 3   primaryNationality  1487 non-null   object 
 4   gender              1787 non-null   object 
 5   type                2200 non-null   object 
 6   typeOfDeath         1476 non-null   object 
 7   status              2200 non-null   object 
 8   employedAs          2191 non-null   object 
 9   organizations       2194 non-null   object 
 10  jobs                1528 non-null   object 
 11  coverage            1470 non-null   object 
 12  mediums             1523 non-null   object 
 13  country             2200 non-null   object 
 14  l

### Unstacking some columns

In [215]:
# Unstack 'jobs' column
data['jobs'] = data['jobs'].str.title()

jobs_unstacked = data['jobs'].str.get_dummies(sep=',')
jobs_columns = [f'job_{column}' for column in jobs_unstacked.columns]
jobs_unstacked.columns = jobs_columns

# Unstack 'mediums' column
mediums_unstacked = data['mediums'].str.get_dummies(sep=',')
mediums_columns = [f'medium_{column}' for column in mediums_unstacked.columns]
mediums_unstacked.columns = mediums_columns

# Unstack 'coverage' column
coverage_unstacked = data['coverage'].str.get_dummies(sep=',')
coverage_columns = [f'coverage_{column}' for column in coverage_unstacked.columns]
coverage_unstacked.columns = coverage_columns

# Concatenate the unstacked columns with the original DataFrame
new = pd.concat([jobs_unstacked, mediums_unstacked, coverage_unstacked], axis=1)
data = pd.concat([data, new], axis=1)


del jobs_unstacked, mediums_unstacked, coverage_unstacked, jobs_columns, mediums_columns, coverage_columns

print(data.shape[0], 'rows and', data.shape[1], 'columns')


2200 rows and 47 columns


### Dropping columns

In [217]:
data = data.drop(['status','locality','jobs', 'mediums', 'coverage', axis=1)

Unnamed: 0,year,combinedStatus,fullName,primaryNationality,gender,type,typeOfDeath,employedAs,organizations,country,...,medium_Television,coverage_Business,coverage_Corruption,coverage_Crime,coverage_Culture,coverage_Human Rights,coverage_Politics,coverage_Sports,coverage_Unknown,coverage_War
0,2006,Confirmed,Ahmad [full name unavailable],,Male,Media Worker,,Staff,Al-Shaabiya,Iraq,...,0,0,0,0,0,0,0,0,0,0
1,2007,Confirmed,Khaled Mohammad Nofan,,Male,Media Worker,,Staff,Al-Watan,Iraq,...,0,0,0,0,0,0,0,0,0,0
2,2007,Confirmed,Sabah Salman,,Female,Media Worker,,Staff,Iraq Media Network,Iraq,...,0,0,0,0,0,0,0,0,0,0
3,2005,Confirmed,Unidentified,,Unknown,Media Worker,,Staff,Al-Hurra,Iraq,...,0,0,0,0,0,0,0,0,0,0
4,2018,Confirmed,Abadullah Hananzai,Afghanistan,Unknown,Journalist,Murder,Staff,"Radio Azadi,Radio Free Europe/Radio Liberty",Afghanistan,...,0,0,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2196,1993,Unconfirmed,Zivko Filipovic,,,Journalist,,Staff,Srpsko Slovo,Bosnia,...,0,0,0,0,0,0,0,0,0,0
2197,2022,Confirmed,Zoreslav Zamoysky,Ukraine,Male,Journalist,Dangerous Assignment,Freelance,Freelance,Ukraine,...,0,0,0,0,0,1,1,0,0,1
2198,2007,Confirmed,Zubair Ahmed Mujahid,Pakistan,Male,Journalist,Murder,Staff,Jang,Pakistan,...,0,0,1,1,0,0,0,0,0,0
2199,2014,Confirmed,Zubair Hatami,Afghanistan,Male,Journalist,Dangerous Assignment,Staff,Mitra TV,Afghanistan,...,1,0,0,0,0,1,1,0,0,1


In [216]:
info_frame = info2(data, 1300)

# (-) Saving file

In [218]:
#Saving file to CSV file
data.to_csv('Killed Journalists and Media Workers since 1992 _edited.csv', index=False)

# (-) Creating metadata

In [219]:
def generate_column_descriptions():
    # Column descriptions
    column_descriptions = {
        'year': 'The year in which the incident occurred.',
        'combinedStatus': 'The combined status of the incident, indicating if it has been confirmed or not.',
        'fullName': 'The full name of the journalist or media worker.',
        'primaryNationality': 'The primary nationality of the individual.',
        'gender': 'The gender of the individual.',
        'type': 'The type of the individual, specifying if they were a journalist or another type of media worker.',
        'typeOfDeath': 'The type of death the individual suffered.',
        'status': 'The status of the individual, indicating if they were killed.',
        'employedAs': 'The role or position held by the individual.',
        'organizations': 'The organizations the individual was associated with.',
        'jobs': 'The specific job roles or responsibilities of the individual. (unpacked into jobs_ columns)',
        'coverage': 'The areas or topics covered by the individual.(unpacked into coverage_ columns)',
        'mediums': 'The mediums or platforms used by the individual for their work.(unpacked into mediums_ columns)',
        'country': 'The country where the incident occurred.',
        'location': 'The general location of the incident.',
        'locality': 'The specific locality or region of the incident.',
        'localOrForeign': 'Indicates if the individual was a local or foreign journalist/media worker.',
        'sourcesOfFire': 'The sources of fire or the entities responsible for the attack.',
        'motiveConfirmed': 'Indicates if the motive behind the incident has been confirmed.',
        'impunityMurder': 'The level of impunity associated with the murder.',
        'tortured': 'Indicates if the individual was tortured.',
        'captive': 'Indicates if the individual was held captive.',
        'threatened': 'Indicates if the individual was threatened.',
        'job_Broadcast Reporter': "Indicates whether the individual's job role is a Broadcast Reporter. Example value: 1",
        'job_Camera Operator': "Indicates whether the individual's job role is a Camera Operator. Example value: 0",
        'job_Columnist/Commentator': "Indicates whether the individual's job role is a Columnist/Commentator. Example value: 1",
        'job_Documentary Filmmaker': "Indicates whether the individual's job role is a Documentary Filmmaker. Example value: 0",
        'job_Editor': "Indicates whether the individual's job role is an Editor. Example value: 1",
        'job_Internet Reporter': "Indicates whether the individual's job role is an Internet Reporter. Example value: 0",
        'job_Photographer': "Indicates whether the individual's job role is a Photographer. Example value: 1",
        'job_Print Reporter': "Indicates whether the individual's job role is a Print Reporter. Example value: 0",
        'job_Producer': "Indicates whether the individual's job role is a Producer. Example value: 1",
        'job_Publisher/Owner': "Indicates whether the individual's job role is a Publisher/Owner. Example value: 0",
        'job_Reporter': "Indicates whether the individual's job role is a Reporter. Example value: 1",
        'job_Technician': "Indicates whether the individual's job role is a Technician. Example value: 0",
        'job_Unknown': "Indicates whether the individual's job role is unknown. Example value: 1",
        'medium_Documentary Film': "Indicates whether the individual's medium is a Documentary Film. Example value: 0",
        'medium_Internet': "Indicates whether the individual's medium is the Internet. Example value: 1",
        'medium_Print': "Indicates whether the individual's medium is Print. Example value: 0",
        'medium_Radio': "Indicates whether the individual's medium is Radio. Example value: 1",
        'medium_Television': "Indicates whether the individual's medium is Television. Example value: 0",
        'coverage_Business': "Indicates whether the coverage area is Business. Example value: 1",
        'coverage_Corruption': "Indicates whether the coverage area is Corruption. Example value: 0",
        'coverage_Crime': "Indicates whether the coverage area is Crime. Example value: 1",
        'coverage_Culture': "Indicates whether the coverage area is Culture. Example value: 0",
        'coverage_Human Rights': "Indicates whether the coverage area is Human Rights. Example value: 1",
        'coverage_Politics': "Indicates whether the coverage area is Politics. Example value: 0",
        'coverage_Sports': "Indicates whether the coverage area is Sports. Example value: 1",
        'coverage_Unknown': "Indicates whether the coverage area is unknown. Example value: 0",
        'coverage_War': "Indicates whether the coverage area is War. Example value: 1"
    }

    # Generate the text file
    with open('column_descriptions.txt', 'w') as file:
        file.write("Data downloaded from The Committee to Protect Journalists.\n")
        file.write("Link to the data - https://cpj.org/data/killed/\n\n")
        file.write("Column Descriptions:\n\n")
        for column_name, description in column_descriptions.items():
            file.write(f"{column_name}: {description}\n")

    print("Column descriptions have been written to column_descriptions.txt\n")
    return column_descriptions
# generate_column_descriptions()


In [220]:
file = generate_column_descriptions()
file

Column descriptions have been written to column_descriptions.txt



{'year': 'The year in which the incident occurred.',
 'combinedStatus': 'The combined status of the incident, indicating if it has been confirmed or not.',
 'fullName': 'The full name of the journalist or media worker.',
 'primaryNationality': 'The primary nationality of the individual.',
 'gender': 'The gender of the individual.',
 'type': 'The type of the individual, specifying if they were a journalist or another type of media worker.',
 'typeOfDeath': 'The type of death the individual suffered.',
 'status': 'The status of the individual, indicating if they were killed.',
 'employedAs': 'The role or position held by the individual.',
 'organizations': 'The organizations the individual was associated with.',
 'jobs': 'The specific job roles or responsibilities of the individual. (unpacked into jobs_ columns)',
 'coverage': 'The areas or topics covered by the individual.(unpacked into coverage_ columns)',
 'mediums': 'The mediums or platforms used by the individual for their work.(unp