In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
doc = nlp('weather is good. How are you.')
# 打印词和词性
for token in doc:
    print(f'{token}-{token.pos_}')

weather-NOUN
is-AUX
good-ADJ
.-PUNCT
How-SCONJ
are-AUX
you-PRON
.-PUNCT


In [4]:
for sent in doc.sents:
    print(sent)

weather is good.
How are you.


In [11]:
# 命名体识别
doc2 = nlp('I went to Paris where I met my old friend Jack')
for ent in doc2.ents:
    print(f'{ent}-{ent.label_}')

Paris-GPE
Jack-PERSON


In [9]:
from spacy import displacy
displacy.render(doc2, style='ent', jupyter=True)

In [10]:
# 找到书中所有人的名字

text = ''



In [2]:
from collections import Counter,defaultdict

In [7]:
def count_person(doc):
    c = Counter()
    processed_text = nlp(doc)
    for ent in processed_text.ents:
        if ent.label_ == "PERSON":
            c[ent.lemma_] += 1 
    return c.most_common(10)

def get_doc(filename):
    with open(filename, mode='r') as f:
        return f.read()

doc = get_doc('./data/pride_and_prejudice.txt')
print(count_person(doc))

[('Elizabeth', 624), ('Darcy', 411), ('Jane', 280), ('Bennet', 245), ('Collins', 179), ('Bingley', 163), ('Wickham', 114), ('Lizzy', 94), ('Gardiner', 94), ('Lady Catherine', 81)]


In [3]:
common_terrorist_groups = [
    'taliban', 
    'al - qaeda', 
    'hamas',  
    'fatah', 
    'plo', 
    'bilad al - rafidayn'
]

common_locations = [
    'iraq',
    'baghdad', 
    'kirkuk', 
    'mosul', 
    'afghanistan', 
    'kabul',
    'basra', 
    'palestine', 
    'gaza', 
    'israel', 
    'istanbul', 
    'beirut', 
    'pakistan'
]

In [4]:
# 恐怖袭击分析
def read_file_to_list(file_name):
    with open(file_name, 'r') as file:
        return file.readlines()
terr_articles = read_file_to_list('./data/rand-terrorism-dataset.txt')
terr_articles_nlp = [nlp(art) for art in terr_articles]

['taliban', 'al - qaeda', 'hamas', 'fatah', 'plo', 'bilad al - rafidayn']


In [16]:
count_dict = defaultdict(Counter)
for article_nlp in terr_articles_nlp:
    # 存储文章中的组织和犯罪地点
    groups = [ent.lemma_ for ent in article_nlp.ents if ent.label_ == "PERSON" or ent.label_ == "ORG"]
    locations = [ent.lemma_ for ent in article_nlp.ents if ent.label_ == 'GPE']
    # 留下关注的组织和地点
    care_groups = [ent for ent in groups if ent in common_terrorist_groups]
    care_location = [ent for ent in locations if ent in common_locations]
    print(groups)
    print(care_groups)
    # 计数组织和地点
    for group in care_groups:
        for location in care_location:
            count_dict[group][location] += 1
count_dict

['the Santiago Binational Center']
[]
[]
[]
['GUATEMALA']
[]
['Chase Manhattan Bank', 'Trans-World Airways', 'the Bank of America']
[]
['anti-Castro Cubans', 'the Spanish National Tourist Office']
[]
['El Poder Cubano']
[]
['Montevideo']
[]
['El Poder Cubano']
[]
['Joseph Antoine', 'the Haitian Coalition', 'anti-duvalier']
[]
['the Australian National Tourist Office', 'El Poder Cubano']
[]
['the Mexican National Tourist Office']
[]
['El Poder Cubano', 'Aeronaves de Mexico']
[]
['Air France', 'El Poder Cubano']
[]
['Israel Airlines', 'the Popular Front for the Liberation of Palestine', 'the PFLP General Command', 'PFLP', 'the International Red Cross']
[]
['Ein Yahav']
[]
['Al Fatah']
[]
['kupre', 'FBI']
[]
['El Poder Cubano', 'Polancia']
[]
['USIS']
[]
['USIS']
[]
['USIS']
[]
[]
[]
['Olympic Airways', 'Olympic Airways']
[]
['Israel Airlines']
[]
['the Amerika Haus Library']
[]
[]
[]
[]
[]
[]
[]
['the Trans-Arabian Pipeline', 'PFLP']
[]
['tameion', 'USIS']
[]
['PAKISTAN', 'the Eritrean L

defaultdict(collections.Counter, {})

In [None]:
import pandas as pd
df = pd.DataFrame.from_dict(dict(count_dict), dtype=int)
df = df.fillna(value=0).astype(int)
df