In [1]:
import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import ufal.udpipe
from model import Model
import conllu
from collections import defaultdict
from nltk.parse import DependencyGraph

In [2]:
# Load the UDPipe model file (English EWT, UD 2.5) via the `Model` class imported from `model`.
# The path is relative, so the .udpipe file has to be in the working directory.
model = Model('english-ewt-ud-2.5-191206.udpipe')

In [3]:
def get_conllu(model, text):
    """Tokenize, tag and parse ``text`` and serialize the result as CoNLL-U.

    Args:
        model: Object exposing ``tokenize``, ``tag``, ``parse`` and ``write``
            (here, the UDPipe ``Model`` wrapper).
        text: Raw text to analyse.

    Returns:
        Whatever ``model.write(sentences, "conllu")`` returns for the
        processed sentences.
    """
    parsed = model.tokenize(text)
    for item in parsed:
        # Tagging has to happen before parsing for each sentence.
        model.tag(item)
        model.parse(item)
    return model.write(parsed, "conllu")

In [4]:
def get_dep_tree(text):
    """Split CoNLL-U text into per-sentence blocks with comment lines removed.

    Blocks are delimited by a blank line (``'\\n\\n'``). Within each block,
    empty lines and lines starting with ``#`` are dropped and the remaining
    lines are re-joined with ``'\\n'``.

    Args:
        text: CoNLL-U formatted string.

    Returns:
        A list with one string per block. Text ending in a blank line yields
        a trailing empty string, and empty input yields ``['']``.
    """
    return [
        '\n'.join(
            row for row in block.split('\n')
            if row and not row.startswith('#')
        )
        for block in text.split('\n\n')
    ]

In [5]:
# Read the movie-review dataset; the path is relative to the working directory.
# The frame displayed below has `text` and `labels` columns plus an `Unnamed: 0`
# column (presumably a saved index -- confirm before relying on it).
df = pd.read_csv('movie.csv')

In [6]:
# Show the loaded frame as the cell output.
df

Unnamed: 0,text,labels
0,plot two teen couples go to a church party dri...,0.0
1,the happy bastard s quick movie review damn th...,0.0
2,it is movies like these that make a jaded movi...,0.0
3,quest for camelot is warner bros . first featu...,0.0
4,synopsis a mentally unstable man undergoing ps...,0.0
...,...,...
1995,wow what a movie . it s everything a movie can...,1.0
1996,richard gere can be a commanding actor but he ...,1.0
1997,glory starring matthew broderick denzel washin...,1.0
1998,steven spielberg s second epic film on world w...,1.0


In [7]:
# Small sample sentence used by the following cells to try the pipeline on one input.
sentence = 'there are not many fight scenes in this movie'

In [8]:
# Run the sample sentence through get_conllu and print the resulting CoNLL-U text.
con = get_conllu(model, sentence)
print(con)

# newdoc
# newpar
# sent_id = 1
# text = there are not many fight scenes in this movie
1	there	there	PRON	EX	_	2	expl	_	_
2	are	be	VERB	VBP	Mood=Ind|Tense=Pres|VerbForm=Fin	0	root	_	_
3	not	not	ADV	RB	_	4	advmod	_	_
4	many	many	ADJ	JJ	Degree=Pos	5	nsubj	_	_
5	fight	fight	VERB	VBP	Mood=Ind|Tense=Pres|VerbForm=Fin	6	amod	_	_
6	scenes	scene	NOUN	NNS	Number=Plur	2	nsubj	_	_
7	in	in	ADP	IN	_	9	case	_	_
8	this	this	DET	DT	Number=Sing|PronType=Dem	9	det	_	_
9	movie	movie	NOUN	NN	Number=Sing	6	nmod	_	SpaceAfter=No




In [9]:
# Drop the comment lines from the CoNLL-U text; each list element is one
# blank-line-separated block (note the empty trailing element in the output).
prep = get_dep_tree(con)
prep

['1\tthere\tthere\tPRON\tEX\t_\t2\texpl\t_\t_\n2\tare\tbe\tVERB\tVBP\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\t_\n3\tnot\tnot\tADV\tRB\t_\t4\tadvmod\t_\t_\n4\tmany\tmany\tADJ\tJJ\tDegree=Pos\t5\tnsubj\t_\t_\n5\tfight\tfight\tVERB\tVBP\tMood=Ind|Tense=Pres|VerbForm=Fin\t6\tamod\t_\t_\n6\tscenes\tscene\tNOUN\tNNS\tNumber=Plur\t2\tnsubj\t_\t_\n7\tin\tin\tADP\tIN\t_\t9\tcase\t_\t_\n8\tthis\tthis\tDET\tDT\tNumber=Sing|PronType=Dem\t9\tdet\t_\t_\n9\tmovie\tmovie\tNOUN\tNN\tNumber=Sing\t6\tnmod\t_\tSpaceAfter=No',
 '']

In [10]:
# Build an NLTK dependency graph from the first parsed sentence.
# UDPipe writes the top relation as lowercase 'root', whereas NLTK looks for
# 'ROOT' by default; passing the label explicitly lets NLTK find the root and
# avoids the "graph doesn't contain a node that depends on the root element" warning.
d = DependencyGraph(prep[0], top_relation_label='root')
# Traverse from the artificial node 0 so the first triple is the TOP -> root link.
d.root = d.nodes[0]
# Each triple is ((head_word, head_tag), relation, (dependent_word, dependent_tag)).
nodes = list(d.triples())
nodes

  "The graph doesn't contain a node " "that depends on the root element."


[((None, 'TOP'), 'root', ('are', 'VERB')),
 (('are', 'VERB'), 'expl', ('there', 'PRON')),
 (('are', 'VERB'), 'nsubj', ('scenes', 'NOUN')),
 (('scenes', 'NOUN'), 'amod', ('fight', 'VERB')),
 (('fight', 'VERB'), 'nsubj', ('many', 'ADJ')),
 (('many', 'ADJ'), 'advmod', ('not', 'ADV')),
 (('scenes', 'NOUN'), 'nmod', ('movie', 'NOUN')),
 (('movie', 'NOUN'), 'case', ('in', 'ADP')),
 (('movie', 'NOUN'), 'det', ('this', 'DET'))]

In [11]:
# Print head word + dependent word (concatenated, no separator) for every
# triple except the first one, which is the artificial TOP -> root link.
for head, _relation, dependent in nodes[1:]:
    print(f"{head[0]}{dependent[0]}")

arethere
arescenes
scenesfight
fightmany
manynot
scenesmovie
moviein
moviethis


In [29]:
%%time

# Build, for every review, one space-separated string of dependency bigrams
# (head word + dependent word, concatenated without a separator).
# Loop-local names are distinct from the demo cells' variables so that
# re-running earlier cells afterwards still sees their own values.
text = []
for review in tqdm(df['text']):
    review_bigrams = []
    # ' . ' is used as a coarse sentence delimiter (reviews appear pre-tokenized).
    for segment in review.split(' . '):
        segment_conllu = get_conllu(model, segment)
        # The tokenizer may return several sentences for one segment: process
        # every tree rather than only the first, and skip empty blocks
        # (get_dep_tree yields a trailing '' for text ending in a blank line).
        for tree in get_dep_tree(segment_conllu):
            if not tree:
                continue
            # UDPipe names the top relation 'root'; telling NLTK so avoids its
            # "no node depends on the root element" warning.
            graph = DependencyGraph(tree, top_relation_label='root')
            # Traverse from the artificial node 0 so the sentence root is reached.
            graph.root = graph.nodes[0]
            for head, _relation, dependent in graph.triples():
                # Skip the artificial TOP -> root link (its head word is None).
                if head[0] is None:
                    continue
                review_bigrams.append(str(head[0]) + str(dependent[0]))
    text.append(' '.join(review_bigrams))

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


Wall time: 27min 42s


In [31]:
# Attach the per-review bigram strings as a new column (mutates df in place);
# `text` must contain exactly one entry per row of df.
df['sin_text'] = text

In [32]:
# Show the frame again, now including the sin_text column.
df

Unnamed: 0,text,labels,sin_text
0,plot two teen couples go to a church party dri...,0.0,plotcouples couplestwo couplesteen plotgo godr...
1,the happy bastard s quick movie review damn th...,0.0,damnbastard bastardthe bastardhappy bastards d...
2,it is movies like these that make a jaded movi...,0.0,isit ismovies moviesthese theselike movieswatc...
3,quest for camelot is warner bros . first featu...,0.0,questcamelot camelotfor questbros brosis brosw...
4,synopsis a mentally unstable man undergoing ps...,0.0,synopsissaves savespsychotherapy psychotherapy...
...,...,...,...
1995,wow what a movie . it s everything a movie can...,1.0,moviewow moviewhat moviea originalit originalm...
1996,richard gere can be a commanding actor but he ...,1.0,actorgere gererichard actorcan actorbe actora ...
1997,glory starring matthew broderick denzel washin...,1.0,storyglory glorywashington washingtonstarring ...
1998,steven spielberg s second epic film on world w...,1.0,masterpiecefilm filmspielberg spielbergsteven ...


In [44]:
# Write the frame to CSV in the working directory; index=False omits the row index.
df.to_csv('sin_movie.csv', index=False)