In [2]:
import pandas as pd
import spacy
from nltk.corpus import wordnet as wn

In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
data = pd.read_csv('data/it-corpus.tsv', sep='\t')

In [5]:
data

Unnamed: 0,Class,Sentence
0,NomAnaph,"Nevertheless, in view of the world-wide ..."
1,ClauseAnaph,It's simply bare-faced fortune hunting; b...
2,NomAnaph,Mary was situated about two miles from ...
3,ClauseAnaph,She is up at five every morning to mil...
4,ClauseAnaph,Don't say it.
...,...,...
508,NomAnaph,I was fairly certain that it was Mr.s. ...
509,ClauseAnaph,It all arose from a misunderstanding.
510,ClauseAnaph,He knew it.
511,NomAnaph,"Pardon me, mon ami, but you did not un..."


In [6]:
def process_row(row):
    sentence = row.Sentence
    doc = nlp(sentence)

    # obtain f1
    f_one = -1
    for i, token in enumerate(doc):
        if token.text.lower() == 'it' and token.pos_ == "PRON":
            f_one = i + 1
            break
    
    it_index = f_one - 1
    
    # obtain f2
    f_two = len(doc)
    
    #obtain f3
    f_three = 0
    for i, token in enumerate(doc):
        if token.is_punct:
            f_three+=1
            
    #obtain f4
    f_four = 0
    for np in doc.noun_chunks:
        if np.end <= it_index:
            f_four +=1
            
    #obtain f5
    f_five = 0
    for np in doc.noun_chunks:
        if np.start > it_index:
            f_five += 1
            
    #obtain f6
    f_six = False
    for tok in doc:
        if tok.pos_ == "ADP":  
            end_i = max(t.i for t in tok.subtree)
            if end_i == it_index - 1:
                f_six = True
    
    #obtain f7
    f_seven = ['ABS','ABS','ABS','ABS','ABS','ABS','ABS','ABS']    
    preceding = doc[max(0, it_index-4):it_index]
    for i, token in enumerate(preceding):
        f_seven[4 - len(preceding) + i] = token.pos_
        
    succeding = doc[it_index + 1:it_index + 5]
    for i, token in enumerate(succeding):
        f_seven[4 + i] = token.pos_
    
    #obtain f8
    f_eight = False
    if f_one < len(doc):
        if doc[f_one].text.lower().endswith('ing') and doc[f_one].pos_ == 'VERB':
            f_eight = True
    
    #obtain f9
    f_nine = False
    if f_one < len(doc):
        if doc[f_one].pos_ == "ADP":
            f_nine = True
    
    #obtain f10
    f_ten = 0
    for i in range(it_index+1, len(doc)):
        if doc[i].pos_ == "ADJ":
            f_ten += 1
            
    #obtain f11
    f_eleven = False
    if it_index-1 >= 0:
        if doc[it_index - 1].pos_ == "VERB":
            f_eleven = True
    
    #obtain f12
    f_twelve = False
    if it_index + 1 < len(doc):
        if doc[it_index + 1].pos_ == "VERB":
            f_twelve = True

    #obtain f13
    f_thirteen = False
    if it_index + 1 < len(doc):
        if doc[it_index + 1].pos_ == "ADJ":
            f_thirteen = True
    
    #obtain f14
    f_fourteen = False
    for np in doc.noun_chunks:
        if np.start > it_index:
            for t in np:
                if t.pos_ == "ADJ":
                    f_fourteen = True
                    break
    
    #obtain f15
    f_fifteen = 0
    for i in range(len(doc) - 1):
        if doc[i].lemma_ == "to" and doc[i].pos_ in {"PART", "ADP"} and doc[i+1].pos_ == "VERB":
            f_fifteen = i + 1
            break
    
    #obtain 16
    f_sixteen = 0
    for j in range(it_index + 1, len(doc)):
        if doc[j].pos_ == "ADP":
            f_sixteen = j - it_index - 1
            break

    
    #obtain f17
    f_seventeen = False
    np_starts = []
    for np in doc.noun_chunks:
        if np.start > it_index:
            np_starts.append(np.start)
        
    j = it_index + 1
    while j < len(doc) - 1:
        if doc[j].pos_ == "ADJ":
            if (j + 1) in np_starts:
                f_seventeen = True
                break
        j += 1

    #obtain f18
    f_eighteen = doc[it_index].dep_

    #obtain f19
    f_nineteen = False
    if it_index+1 < len(doc) and doc[it_index+1].pos_ == "VERB":
        synsets = wn.synsets(doc[it_index+1].lemma_, pos=wn.VERB)
        f_nineteen = any(s.lexname() == 'verb.weather' for s in synsets)

    #obtain f20
    f_twenty = False
    if it_index+1 < len(doc) and doc[it_index+1].pos_ == "VERB":
        synsets = wn.synsets(doc[it_index+1].lemma_, pos=wn.VERB)
        f_twenty = any(s.lexname() == 'verb.cognition' for s in synsets)
        
    return {
        "F1": f_one, "F2": f_two, "F3": f_three, "F4": f_four, "F5": f_five,
        "F6": f_six, "F7": f_seven, "F8": f_eight, "F9": f_nine, "F10": f_ten,
        "F11": f_eleven, "F12": f_twelve, "F13": f_thirteen, "F14": f_fourteen, "F15": f_fifteen,
        "F16": f_sixteen, "F17": f_seventeen, "F18": f_eighteen, "F19": f_nineteen, "F20": f_twenty
    }

In [7]:
rows = []

for row in data.itertuples():
    r = process_row(row)
    rows.append(r)

In [9]:
df

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20
0,22,68,6,3,7,False,"[PRON, SPACE, VERB, SPACE, PUNCT, SPACE, PRON,...",False,False,1,False,False,False,True,0,13,False,dobj,False,False
1,1,42,5,0,6,False,"[ABS, ABS, ABS, ABS, AUX, SPACE, ADV, SPACE]",False,False,2,False,False,False,True,0,0,False,nsubj,False,False
2,42,43,2,6,0,False,"[NOUN, SPACE, ADP, SPACE, PUNCT, ABS, ABS, ABS]",False,False,0,False,False,False,False,0,0,False,pobj,False,False
3,26,33,2,2,1,False,"[VERB, SPACE, ADP, SPACE, SPACE, ADV, SPACE, ADP]",False,False,0,False,False,False,False,0,3,False,pobj,False,False
4,6,7,1,0,0,False,"[PART, SPACE, VERB, SPACE, PUNCT, ABS, ABS, ABS]",False,False,0,False,False,False,False,0,0,False,dobj,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508,11,40,3,1,4,False,"[ADJ, SPACE, SCONJ, SPACE, SPACE, AUX, SPACE, ...",False,False,1,False,False,False,False,0,0,False,nsubj,False,False
509,1,12,1,0,2,False,"[ABS, ABS, ABS, ABS, SPACE, PRON, SPACE, VERB]",False,False,0,False,False,False,False,0,5,False,nsubj,False,False
510,5,6,1,1,0,False,"[PRON, SPACE, VERB, SPACE, PUNCT, ABS, ABS, ABS]",False,False,0,False,False,False,False,0,0,False,dobj,False,False
511,21,28,3,3,0,False,"[PART, SPACE, VERB, SPACE, SPACE, ADP, SPACE, ...",False,False,1,False,False,False,False,0,1,False,dobj,False,False


In [None]:
df = pd.DataFrame(rows)

df.to_csv("features.csv", index=False)