# HW07: Parsing

Remember that these homework assignments are graded for completion. **You can skip one section of this homework.**

In [55]:
import pandas as pd

# Sampling parameters. A fixed seed makes the sampled subset -- and therefore every
# count computed from it further down -- identical on each Restart & Run All.
N_SAMPLES = 10000  # only use 10K datapoints; lower this (e.g. 100) for a quick dry run
RANDOM_SEED = 42

df = pd.read_csv('train.csv')

df.columns = ["label", "title", "lead"]
label_map = {1: "world", 2: "sport", 3: "business", 4: "sci/tech"}


def replace_label(x):
    """Return the category name stored in `label_map` for the numeric label `x`.

    Raises:
        KeyError: if `x` is not one of the keys of `label_map`.
    """
    return label_map[x]


df["label"] = df["label"].apply(replace_label)
# The text to analyse is the title followed by the lead, separated by one space.
df["text"] = df["title"] + " " + df["lead"]
df = df.sample(n=N_SAMPLES, random_state=RANDOM_SEED)
df.head()

Unnamed: 0,label,title,lead,text
39815,world,Frenchman Shot Dead in Saudi Arabia,JEDDAH (Reuters) - A Frenchman was shot dead ...,Frenchman Shot Dead in Saudi Arabia JEDDAH (R...
44834,sport,Stars won't leave home for it,"Purses overflowing with cash, exotic locales, ...",Stars won't leave home for it Purses overflowi...
1945,sci/tech,Microsoft Updates Its IBM Connectivity Server,Microsoft on Tuesday unveiled the 2004 version...,Microsoft Updates Its IBM Connectivity Server ...
4037,sci/tech,An Aspirin a Day -- Good Medicine For Many,Most people have heard that taking a small dos...,An Aspirin a Day -- Good Medicine For Many Mos...
119361,sport,Former Blue Jays Manager Mattick Dies (AP),"AP - Bobby Mattick, who managed the Toronto Bl...",Former Blue Jays Manager Mattick Dies (AP) AP ...


In [56]:
import spacy

# Load the spaCy pipeline named 'en_core_web_sm'; `nlp` is what the preprocessing
# cell applies to the corpus texts.
nlp = spacy.load('en_core_web_sm')

In [57]:
#TODO preprocess the corpus using spacy or load the pre-processed corpus
# nlp.pipe processes the texts as a batched stream instead of one nlp() call per row.
# The resulting list is assigned positionally, so each parsed document stays on the
# row of the text it was built from.
# NOTE(review): the pair counts further down contain tokens such as '39;t', which look
# like leftovers of escaped apostrophes in the raw text -- consider cleaning the text
# before parsing; confirm the exact escape pattern against train.csv first.
df["preprocessed"] = list(nlp.pipe(df["text"]))

### Information Extraction

In [58]:
from collections import Counter
from typing import Iterable


def extract_pairs(sent, which: str):
    """Collect (token lemma, head lemma) pairs for one dependency relation.

    Args:
        sent: iterable of tokens exposing `dep_`, `lemma_` and `head`
            (the notebook passes parsed spaCy documents).
        which: dependency label to keep; compared against each token's `dep_`.

    Returns:
        A list of 2-tuples in token order. The first element is the lemma of the
        matching token, the second the lemma of its head; both are lower-cased.
    """
    pairs = []
    for token in sent:
        if token.dep_ != which:
            continue
        dependent = token.lemma_.lower()
        governor = token.head.lemma_.lower()
        pairs.append((dependent, governor))
    return pairs


def get_most_common(items: Iterable, n: int = 10, verbose: bool = True) -> Counter:
    """Tally the elements of every collection in `items` into a single Counter.

    Args:
        items: iterable whose members are each fed to `Counter.update`
            (the notebook passes a column holding one list of pairs per document).
        n: how many of the most frequent entries to print; only used when
            `verbose` is true.
        verbose: when true, print one "element count" line for each of the
            `n` most frequent entries.

    Returns:
        The Counter holding all tallied elements, not just the top `n`.
    """
    tally = Counter()
    for group in items:
        tally.update(group)
    if not verbose:
        return tally
    for element, frequency in tally.most_common(n):
        print(element, frequency)
    return tally

In [59]:
# Subject-verb pairs: every token whose dependency label is 'nsubj', paired with its head.
# The last line shows the pairs found in the first document as a quick check.
df["subj-verb-pairs"] = df["preprocessed"].apply(lambda doc: extract_pairs(doc, which='nsubj'))
df["subj-verb-pairs"].iloc[0]

[('frenchman', 'shoot'), ('that', 'mirror')]

In [60]:
# Count the subject-verb pairs over all documents and print the 10 most frequent.
counter_subj_verb = get_most_common(items=df["subj-verb-pairs"], n=10, verbose=True)

('-pron-', 'be') 447
('official', 'say') 162
('-pron-', 'have') 100
('that', 'be') 89
('-pron-', 'say') 73
('company', 'say') 68
('-pron-', 'take') 50
('what', 'be') 46
('this', 'be') 46
('group', 'say') 44


In [62]:
# Verb-object pairs: every token whose dependency label is 'dobj', paired with its head.
# Note that each tuple is ordered (object, verb), i.e. the verb comes second.
df["verb-obj-pairs"] = df["preprocessed"].apply(lambda doc: extract_pairs(doc, which='dobj'))
df["verb-obj-pairs"].iloc[0]

[('dead', 'shoot'), ('attack', 'mirror'), ('exporter', 'destabilize')]

In [63]:
# Count the verb-object pairs over all documents and print the 10 most frequent.
counter_verb_obj = get_most_common(items=df["verb-obj-pairs"], n=10, verbose=True)

('people', 'kill') 85
('job', 'cut') 44
('agreement', 'reach') 33
('million', 'pay') 32
('point', 'score') 31
('lead', 'take') 28
('rate', 'raise') 24
('plan', 'announce') 22
('39;t', 'win') 21
('step', 'take') 20


In [64]:
# Adjective-noun pairs: every token whose dependency label is 'amod', paired with its head.
df["adj-noun-pairs"] = df["preprocessed"].apply(lambda doc: extract_pairs(doc, which='amod'))
df["adj-noun-pairs"].iloc[0]

[('saudi', 'city'), ('recent', 'attack'), ('large', 'exporter')]

In [68]:
# Count the adjective-noun pairs over all documents and print the 10 most frequent.
counter_adj_noun = get_most_common(items=df["adj-noun-pairs"], n=10, verbose=True)

('third', 'quarter') 163
('last', 'week') 121
('next', 'year') 101
('last', 'night') 101
('first', 'time') 82
('presidential', 'election') 68
('last', 'month') 59
('last', 'year') 59
('next', 'week') 58
('such', 'as') 56


### Exploring cross-label dependencies

In [69]:
import itertools
##TODO extract all the subject-verbs and verbs-object pairs for the verb "win"
# Every pair is (dependent lemma, head lemma), so the verb is the second element in
# both counters. The whole counter is scanned in descending frequency: looking only at
# its first 10 keys finds 'win' only if it happens to be among the first pairs inserted.
for k, counter in [('subject-verbs', counter_subj_verb), ('verbs-object', counter_verb_obj)]:
    print('=' * 20, k, '=' * 20)
    # All pairs headed by 'win', most frequent first.
    win_pairs = [(pair, count) for pair, count in counter.most_common() if pair[1] == 'win']
    print(len(win_pairs), 'distinct pairs; 10 most frequent:')
    for (dependent, verb), count in win_pairs[:10]:
        print(dependent, verb, count)



In [67]:
##TODO for each label create a list ranking the most common subject-verbs pairs and one for the most common verbs-object pairs
# Build the two counters once per distinct label. Iterating over df.label itself would
# visit every row and recompute the same counters again for each of them.
counters = {}
for label in df.label.unique():
    label_rows = df.label == label
    counters[label] = {
        'counter_subj_verb': get_most_common(df.loc[label_rows, 'subj-verb-pairs'], verbose=False),
        'counter_verb_obj': get_most_common(df.loc[label_rows, 'verb-obj-pairs'], verbose=False)}

##TODO print the 10 most common pairs for each of the two lists for the labels "sport" and "business"
for label in ["sport", "business"]:
    print('=' * 20, label, '=' * 20)
    for k, counter in counters[label].items():
        print('*' * 5, k, '*' * 5)
        for pair, counts in counter.most_common(n=10):
            print(pair, counts)
        print('\n')



***** counter_subj_verb *****
('-pron-', 'be') 175
('-pron-', 'have') 36
('this', 'be') 24
('-pron-', 'take') 23
('that', 'be') 22
('-pron-', 'go') 20
('-pron-', 'say') 17
('-pron-', 'see') 16
('-pron-', 'get') 16
('-pron-', 'need') 15


***** counter_verb_obj *****
('point', 'score') 31
('lead', 'take') 21
('title', 'win') 20
('medal', 'win') 17
('game', 'miss') 16
('point', 'have') 15
('lead', 'give') 14
('goal', 'score') 13
('game', 'play') 12
('39;t', 'wasn') 12


***** counter_subj_verb *****
('-pron-', 'be') 68
('company', 'say') 35
('profit', 'rise') 28
('group', 'say') 26
('stock', 'rise') 25
('quote', 'say') 25
('corp.', 'say') 25
('-pron-', 'cut') 24
('inc.', 'say') 23
('official', 'say') 21


***** counter_verb_obj *****
('job', 'cut') 39
('million', 'pay') 24
('rate', 'raise') 23
('share', 'send') 20
('agreement', 'reach') 18
('profit', 'post') 15
('loss', 'post') 15
('profit', 'report') 12
('rise', 'report') 11
('stake', 'buy') 11


