In [2]:
from collections import Counter

import pandas as pd
import spacy
from spacy import displacy
#from spacy.pipeline import EntityRuler

In [3]:
# Load the small English pipeline and register a rule-based entity ruler
# ahead of the statistical 'ner' component. Per spaCy's documentation, when
# the ruler runs before 'ner' the recognizer keeps the spans the ruler has
# already set and predicts around them.
NER = spacy.load("en_core_web_sm")
ruler = NER.add_pipe("entity_ruler", before='ner')

# Exact-text patterns to tag as ORG. Matching is on the literal text, which is
# why several names are listed in more than one capitalisation.
# NOTE(review): some entries (e.g. 'Machar', 'rebels') are not organisations in
# the strict sense; they are presumably labelled ORG on purpose so they show up
# in the ORG tallies further down -- confirm.
entities = ['rebels', 'rebel', 'Rebel', 'Rebels', 'dinka', 'Dinka', 'army', 'Army', 'Janjaweed', 'janjaweed', 'Messeriya', 'messeriya','peacekeepers', 
            'Peacekeepers', 'Darfurian traders', 'UNMISS', 'Government of the Republic of South Sudan', 'Darfuris','SSLM', 'SPLM', 'Machar', "Sudan People's Liberation Movement", 'SPLA']
# Exact-text patterns to tag as GPE.
locations = ['Juba', 'Bentiu', 'Unity State', 'Mayom', 'Bor', 'Jonglei', 'Malakal','Wau']

# Build every pattern first and register them with a single add_patterns call
# instead of one call per pattern; the order (ORG entries, then GPE entries)
# is the same as before.
patterns = [{"label": "ORG", "pattern": name} for name in entities]
patterns += [{"label": "GPE", "pattern": place} for place in locations]
ruler.add_patterns(patterns)

# Show the resulting component order as the cell output.
NER.pipeline



[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x12daf22ce30>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x12daf1bce90>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x12db261f610>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x12db28c1190>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x12db28d07d0>),
 ('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x12db25fbed0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x12db261f680>)]

In [4]:
# Load the articles CSV into `df`, parsing the 'date' column as datetimes.
df = pd.read_csv('data/articles_topics_conflict_cleaned.csv', parse_dates=['date'])

# use for Article Assignment investigation

In [18]:
index = 50  # row position of the article to inspect


# Fetch the row once, by position, so the summary text and the location
# columns shown below are guaranteed to come from the same article.
article = df.iloc[index]
tex = article['summary']

display(article[['chat_GPT_location', 'cd_district_name']])

# Run the spaCy pipeline (entity ruler + statistical NER) on the summary.
text1 = NER(tex)

# Tally how often each ORG-labelled entity text occurs in the summary.
orgs_dict = Counter(word.text for word in text1.ents if word.label_ == 'ORG')

# Display the organizations and their respective counts
print("Organizations mentioned in the article:\n")
for org, count in orgs_dict.items():
    print(f"{org}: {count}")

# Render the summary with every recognised entity highlighted inline.
displacy.render(text1,style="ent",jupyter=True)

chat_GPT_location    Unity State
cd_district_name            Koch
Name: 50, dtype: object

Organizations mentioned in the article:

Upper Nile State: 1


# Finding articles that explicitly mention the Liberation Movement

In [82]:
# Entity texts that count as an explicit mention of the liberation movement.
looking_for = ['SPLM', 'SSLM', "Sudan People's Liberation Movement"]
targets = set(looking_for)  # set gives constant-time membership checks

# Index labels of the articles whose summary contains at least one match.
articles_found = []

# NER.pipe processes the summaries as a batched stream rather than invoking
# the full pipeline once per row. The loop variables are deliberately still
# named `index` and `text1`: later cells in this notebook read `text1`, so the
# values left behind after the loop stay the same as before (last row / last Doc).
for index, text1 in zip(df.index, NER.pipe(df['summary'])):
    # An article is recorded once as soon as any ORG entity matches.
    if any(ent.label_ == 'ORG' and ent.text in targets for ent in text1.ents):
        articles_found.append(index)

print(f'total number of articles with matches: {len(articles_found)}')
print(f'Index of articles with matches: {articles_found}')

total number of articles with matches: 40
Index of articles with matches: [16, 21, 22, 71, 86, 93, 99, 108, 128, 132, 137, 152, 158, 170, 190, 199, 203, 220, 226, 244, 261, 276, 277, 280, 292, 295, 315, 318, 341, 342, 344, 345, 346, 347, 358, 363, 370, 376, 379, 457]


# Adding hand-labeled info back into the dataset

In [39]:
# NOTE: `df` is rebound here to the hand-labeled results table; the articles
# table (with 'date' parsed as datetimes) is loaded separately as `df2`.
df = pd.read_csv('data/handlabeled_articles_results.csv')
df2 = pd.read_csv('data/articles_topics_conflict_cleaned.csv', parse_dates=['date'])

In [40]:
# New names for columns after the merge: 'keep location' becomes
# 'County level', and the four topic columns gain a '_bert' suffix.
# NOTE(review): the suffix presumably marks model-predicted flags so that
# 'conflict' is not confused with the hand-labeled 'Conflict' column -- confirm.
column_renames = {
    'keep location': 'County level',
    'hunger': 'hunger_bert',
    'refugees': 'refugees_bert',
    'humanitarian': 'humanitarian_bert',
    'conflict': 'conflict_bert',
}

# Attach each hand-labeled row to its article: the articles' row index is
# matched against the labels' 'Article index' column. The join key and the
# 'Location correct' column are dropped from the result.
merged_df = (
    df2.merge(df, left_index=True, right_on='Article index')
    .drop(['Article index', 'Location correct'], axis=1)
    .rename(column_renames, axis=1)
)
merged_df

Unnamed: 0,summary,date,chat_GPT_location,lat,lng,article_id,cd_district_name,hunger_bert,refugees_bert,humanitarian_bert,conflict_bert,Conflict,County level,Unity level,South Sudan / Liberation movement Conflict,Additional Notes
0,The article discusses the readiness of Guit co...,2011-07-07,Bentiu,9.231487,29.800503,88,Rubkona,False,False,False,True,0,no,no,no,This article is about a celebration of South S...
1,The article discusses the celebration of South...,2011-07-09,Bentiu,9.231487,29.800503,103,Rubkona,False,False,False,True,0,no,no,no,This article is about a celebration of South S...
2,The article discusses the congratulatory messa...,2011-07-11,Bentiu,9.231487,29.800503,221,Rubkona,False,False,False,True,0,no,no,no,Head of Kenyan Community congratulates South S...
3,The article discusses the killing of rebel lea...,2011-07-23,Koch county,8.684727,29.881520,299,Koch,True,True,True,True,2,yes,no,yes,"Rebel leader Gatluak killed in Koch county, be..."
4,The article discusses the killing of South Sud...,2011-07-23,Unity state,8.927721,29.788925,327,Koch,True,True,True,True,2,yes,no,yes,"Rebel leader Gatluak killed in Koch county, be..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454,The article discusses how the UK government ha...,2022-12-09,Koch County,8.684727,29.881520,18396,Koch,True,True,True,True,0,no,no,no,UK imposed sanctions on two county commissione...
455,The article discusses how the humanitarian pri...,2022-08-22,Unity State,8.927721,29.788925,18398,Koch,True,True,True,True,0,no,no,yes,Aid is frequently manipulated by political elites
456,The article discusses the intensification of a...,2022-09-26,Unity State,8.927721,29.788925,18427,Koch,False,False,False,True,2,no,yes,yes,Fighting between civilians
457,The article discusses a UN report documenting ...,2022-09-06,Unity state,8.927721,29.788925,18439,Koch,True,True,True,True,1,no,yes,yes,Widespread human rights violations


In [41]:
# Save the merged articles + hand labels; index=False leaves the DataFrame
# index out of the written file.
merged_df.to_csv('data/articles_raw_with_handlabels.csv', index=False)

In [55]:
# Reload the merged file written above; `df` is rebound once more, now to the
# articles-with-hand-labels table, with 'date' parsed as datetimes.
df = pd.read_csv('data/articles_raw_with_handlabels.csv', parse_dates=['date'])


# Preview the first rows, then report (rows, columns) as the cell output.
display(df.head())
df.shape

Unnamed: 0,summary,date,chat_GPT_location,lat,lng,article_id,cd_district_name,hunger_bert,refugees_bert,humanitarian_bert,conflict_bert,Conflict,County level,Unity level,South Sudan / Liberation movement Conflict,Additional Notes
0,The article discusses the readiness of Guit co...,2011-07-07,Bentiu,9.231487,29.800503,88,Rubkona,False,False,False,True,0,no,no,no,This article is about a celebration of South S...
1,The article discusses the celebration of South...,2011-07-09,Bentiu,9.231487,29.800503,103,Rubkona,False,False,False,True,0,no,no,no,This article is about a celebration of South S...
2,The article discusses the congratulatory messa...,2011-07-11,Bentiu,9.231487,29.800503,221,Rubkona,False,False,False,True,0,no,no,no,Head of Kenyan Community congratulates South S...
3,The article discusses the killing of rebel lea...,2011-07-23,Koch county,8.684727,29.88152,299,Koch,True,True,True,True,2,yes,no,yes,"Rebel leader Gatluak killed in Koch county, be..."
4,The article discusses the killing of South Sud...,2011-07-23,Unity state,8.927721,29.788925,327,Koch,True,True,True,True,2,yes,no,yes,"Rebel leader Gatluak killed in Koch county, be..."


(459, 16)

In [54]:
# Rows whose hand-assigned 'Conflict' value is non-zero, i.e. everything not
# hand-labeled as a non-conflict article.
is_conflict = df['Conflict'] != 0
# Rows flagged 'yes' at county level or at Unity level.
is_relevant = (df['County level'] == 'yes') | (df['Unity level'] == 'yes')

# Keep only the articles that satisfy both conditions.
conflict_df = df[is_conflict & is_relevant]


display(conflict_df.head())
conflict_df.shape

Unnamed: 0,summary,date,chat_GPT_location,lat,lng,article_id,cd_district_name,hunger_bert,refugees_bert,humanitarian_bert,conflict_bert,Conflict,County level,Unity level,South Sudan / Liberation movement Conflict,Additional Notes
3,The article discusses the killing of rebel lea...,2011-07-23,Koch county,8.684727,29.88152,299,Koch,True,True,True,True,2,yes,no,yes,"Rebel leader Gatluak killed in Koch county, be..."
4,The article discusses the killing of South Sud...,2011-07-23,Unity state,8.927721,29.788925,327,Koch,True,True,True,True,2,yes,no,yes,"Rebel leader Gatluak killed in Koch county, be..."
5,The article discusses the death of rebel leade...,2011-07-25,Bentiu,9.231487,29.800503,371,Rubkona,True,True,True,True,2,yes,no,yes,"Rebel leader Gatluak killed in Koch county, be..."
7,The article discusses the assassination of Gat...,2011-08-05,Unity state,8.927721,29.788925,493,Koch,True,True,True,True,2,yes,no,yes,"Rebel leader Gatluak killed in Koch county, be..."
14,The article discusses the challenges faced by ...,2011-10-20,Unity state,8.927721,29.788925,934,Koch,False,False,False,True,1,no,yes,no,Very broadly mentioned challenges faced by Sou...


(289, 16)

In [56]:
# Save the filtered conflict articles; index=False leaves the DataFrame index
# out of the written file.
conflict_df.to_csv('data/conflict_articles_cleaned_handlabeled.csv', index=False)

## example for NER

In [26]:
# Tally how often each ORG-labelled entity text occurs in `text1`.
# NOTE(review): `text1` is not assigned in this cell -- it is whichever Doc an
# earlier cell left behind, so the result depends on execution order.
orgs_dict = Counter(word.text for word in text1.ents if word.label_ == 'ORG')

# Display the organizations and their respective counts
print("Organizations mentioned in the article:\n")
for org, count in orgs_dict.items():
    print(f"{org}: {count}")


Organizations mentioned in the article:

the South Sudan Human Rights Society: 1
SSHURSA: 2
the South Sudan National Legislative Assembly: 1
Justice: 1


In [28]:
# Distinct texts of every ORG-labelled entity found in `text1`.
orgs = {ent.text for ent in text1.ents if ent.label_ == 'ORG'}

display(f'Orgs mentioned in article: {orgs}')

"Orgs mentioned in article: {'Justice', 'SSHURSA', 'the South Sudan Human Rights Society', 'the South Sudan National Legislative Assembly'}"

In [183]:
# source: https://www.analyticsvidhya.com/blog/2021/06/nlp-application-named-entity-recognition-ner-in-python-with-spacy/
# Show spaCy's built-in description of each entity label, in this order.
for label in ("ORG", "GPE", "LOC"):
    display(spacy.explain(label))

# Render `text1` with its recognised entities highlighted inline.
displacy.render(text1,style="ent",jupyter=True)

'Companies, agencies, institutions, etc.'

'Countries, cities, states'

'Non-GPE locations, mountain ranges, bodies of water'