In [3]:
import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import ufal.udpipe
from model import Model
import conllu
from collections import defaultdict
from nltk.parse import DependencyGraph

In [4]:
# Load the tagger/parser model used by every parsing cell below. `Model` comes
# from the local `model` module; the path is relative, so the .udpipe file
# presumably has to sit in the notebook's working directory.
model = Model('english-ewt-ud-2.5-191206.udpipe')

In [5]:
def get_conllu(model, text):
    """Run the full pipeline on `text` and return it serialised as CoNLL-U.

    Args:
        model: object exposing `tokenize`, `tag`, `parse` and `write`
            (in this notebook, the `Model` instance loaded above).
        text: raw text to analyse.

    Returns:
        Whatever `model.write(..., "conllu")` produces for the processed
        sentences (a CoNLL-U formatted string in the notebook's outputs).
    """
    parsed = model.tokenize(text)
    # Tagging has to precede parsing for each sentence; both calls work on
    # the sentence object in place, their return values are not used.
    for sent in parsed:
        model.tag(sent)
        model.parse(sent)
    return model.write(parsed, "conllu")

In [6]:
def get_dep_tree(text):
    """Split CoNLL-U text into per-sentence blocks with comment lines removed.

    Blocks are separated by a blank line (``'\\n\\n'``). Inside each block,
    empty lines and lines starting with ``'#'`` are dropped and the remaining
    lines are re-joined with ``'\\n'``.

    Args:
        text: CoNLL-U formatted string.

    Returns:
        A list with one string per block, in input order. A block that holds
        no token lines (e.g. what follows the final blank line) yields ``''``,
        so the list may end with empty strings.
    """
    def _token_lines_only(block):
        rows = block.split('\n')
        return '\n'.join(row for row in rows if row and not row.startswith('#'))

    return [_token_lines_only(block) for block in text.split('\n\n')]

In [7]:
# Load the review dataset; the cells below read its 'text' column and the
# displayed frame also carries a 'labels' column.
df = pd.read_csv('movie.csv')

In [8]:
# Inspect the loaded frame (pandas truncates the middle rows in the display).
df

Unnamed: 0,text,labels
0,plot two teen couples go to a church party dri...,0.0
1,the happy bastard s quick movie review damn th...,0.0
2,it is movies like these that make a jaded movi...,0.0
3,quest for camelot is warner bros . first featu...,0.0
4,synopsis a mentally unstable man undergoing ps...,0.0
...,...,...
1995,wow what a movie . it s everything a movie can...,1.0
1996,richard gere can be a commanding actor but he ...,1.0
1997,glory starring matthew broderick denzel washin...,1.0
1998,steven spielberg s second epic film on world w...,1.0


In [24]:
# Example sentence used to demonstrate the pipeline in the next few cells.
sentence = 'it is never mentioned but the gravity on mars has been increased somehow'

In [25]:
# Tokenize, tag and parse the example sentence, then show the raw CoNLL-U
# text; print() is used so the tab-separated columns are rendered as tabs.
con = get_conllu(model, sentence)
print(con)

# newdoc
# newpar
# sent_id = 1
# text = it is never mentioned but the gravity on mars has been increased somehow
1	it	it	PRON	PRP	Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs	4	nsubj:pass	_	_
2	is	be	AUX	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	4	aux:pass	_	_
3	never	never	ADV	RB	_	4	advmod	_	_
4	mentioned	mention	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	0	root	_	_
5	but	but	CCONJ	CC	_	12	cc	_	_
6	the	the	DET	DT	Definite=Def|PronType=Art	7	det	_	_
7	gravity	gravity	NOUN	NN	Number=Sing	12	nsubj:pass	_	_
8	on	on	ADP	IN	_	9	case	_	_
9	mars	mar	NOUN	NNS	Number=Plur	7	nmod	_	_
10	has	have	AUX	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	12	aux	_	_
11	been	be	AUX	VBN	Tense=Past|VerbForm=Part	12	aux:pass	_	_
12	increased	increase	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	4	conj	_	_
13	somehow	somehow	ADV	RB	_	12	advmod	_	SpaceAfter=No




In [26]:
# Drop the '#' comment lines. The result is a list with one string per
# blank-line-separated block; note the trailing empty string in the output.
prep = get_dep_tree(con)
prep

['1\tit\tit\tPRON\tPRP\tCase=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs\t4\tnsubj:pass\t_\t_\n2\tis\tbe\tAUX\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t4\taux:pass\t_\t_\n3\tnever\tnever\tADV\tRB\t_\t4\tadvmod\t_\t_\n4\tmentioned\tmention\tVERB\tVBN\tTense=Past|VerbForm=Part|Voice=Pass\t0\troot\t_\t_\n5\tbut\tbut\tCCONJ\tCC\t_\t12\tcc\t_\t_\n6\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t7\tdet\t_\t_\n7\tgravity\tgravity\tNOUN\tNN\tNumber=Sing\t12\tnsubj:pass\t_\t_\n8\ton\ton\tADP\tIN\t_\t9\tcase\t_\t_\n9\tmars\tmar\tNOUN\tNNS\tNumber=Plur\t7\tnmod\t_\t_\n10\thas\thave\tAUX\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t12\taux\t_\t_\n11\tbeen\tbe\tAUX\tVBN\tTense=Past|VerbForm=Part\t12\taux:pass\t_\t_\n12\tincreased\tincrease\tVERB\tVBN\tTense=Past|VerbForm=Part|Voice=Pass\t4\tconj\t_\t_\n13\tsomehow\tsomehow\tADV\tRB\t_\t12\tadvmod\t_\tSpaceAfter=No',
 '']

In [27]:
# Build an NLTK dependency graph from the first block (the only non-empty one
# for this example).
d = DependencyGraph(prep[0])
# Start the traversal at the artificial node 0 (tag 'TOP'), which is why the
# first triple in the output is the (None, 'TOP') -> root edge.
# NOTE(review): presumably needed because DependencyGraph does not pick up the
# lowercase 'root' relation on its own — confirm against the NLTK docs.
d.root = d.nodes[0]
# Each triple is ((head_word, head_tag), relation, (dep_word, dep_tag)).
nodes = list(d.triples())
nodes

[((None, 'TOP'), 'root', ('mentioned', 'VERB')),
 (('mentioned', 'VERB'), 'nsubj:pass', ('it', 'PRON')),
 (('mentioned', 'VERB'), 'aux:pass', ('is', 'AUX')),
 (('mentioned', 'VERB'), 'advmod', ('never', 'ADV')),
 (('mentioned', 'VERB'), 'conj', ('increased', 'VERB')),
 (('increased', 'VERB'), 'cc', ('but', 'CCONJ')),
 (('increased', 'VERB'), 'nsubj:pass', ('gravity', 'NOUN')),
 (('gravity', 'NOUN'), 'det', ('the', 'DET')),
 (('gravity', 'NOUN'), 'nmod', ('mars', 'NOUN')),
 (('mars', 'NOUN'), 'case', ('on', 'ADP')),
 (('increased', 'VERB'), 'aux', ('has', 'AUX')),
 (('increased', 'VERB'), 'aux:pass', ('been', 'AUX')),
 (('increased', 'VERB'), 'advmod', ('somehow', 'ADV'))]

In [28]:
# Print head word + dependent word for every edge. nodes[0] is the artificial
# TOP -> root edge (its head word is None), so it is left out.
for head, rel, dep in nodes[1:]:
    print(str(head[0]) + str(dep[0]))

mentionedit
mentionedis
mentionednever
mentionedincreased
increasedbut
increasedgravity
gravitythe
gravitymars
marson
increasedhas
increasedbeen
increasedsomehow


In [29]:
%%time

text = []
for review in tqdm(df['text']):
    sentences = []
    for sentence in review.split(' . '):
        con = get_conllu(model, sentence)
        prep = get_dep_tree(con)
        d = DependencyGraph(prep[0])
        d.root = d.nodes[0]
        nodes = list(d.triples())
        for i in range(1, len(nodes)):
            bigram = (str(nodes[i][0][0]) + str(nodes[i][2][0]))
            sentences.append(bigram)
    text.append(' '.join(sentences))

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


Wall time: 27min 42s


In [31]:
# Attach the bigram strings as a new column; `text` holds one entry per row of
# df['text'], in the same order. Note this mutates `df` in place.
df['sin_text'] = text

In [32]:
# Check that the new 'sin_text' column looks as expected.
df

Unnamed: 0,text,labels,sin_text
0,plot two teen couples go to a church party dri...,0.0,plotcouples couplestwo couplesteen plotgo godr...
1,the happy bastard s quick movie review damn th...,0.0,damnbastard bastardthe bastardhappy bastards d...
2,it is movies like these that make a jaded movi...,0.0,isit ismovies moviesthese theselike movieswatc...
3,quest for camelot is warner bros . first featu...,0.0,questcamelot camelotfor questbros brosis brosw...
4,synopsis a mentally unstable man undergoing ps...,0.0,synopsissaves savespsychotherapy psychotherapy...
...,...,...,...
1995,wow what a movie . it s everything a movie can...,1.0,moviewow moviewhat moviea originalit originalm...
1996,richard gere can be a commanding actor but he ...,1.0,actorgere gererichard actorcan actorbe actora ...
1997,glory starring matthew broderick denzel washin...,1.0,storyglory glorywashington washingtonstarring ...
1998,steven spielberg s second epic film on world w...,1.0,masterpiecefilm filmspielberg spielbergsteven ...


In [44]:
# Persist the frame including 'sin_text' so the slow parsing cell does not
# have to be re-run; index=False keeps the row index out of the file.
df.to_csv('sin_movie.csv', index=False)

In [37]:
# Reload the frame written to 'sin_movie.csv', replacing the in-memory `df`.
# NOTE(review): the execution counts (In [44] for the save, In [37] here) show
# these cells were not run top to bottom — confirm the notebook still works
# under Restart & Run All.
df = pd.read_csv('sin_movie.csv')

In [38]:
from nltk.stem import WordNetLemmatizer
# One shared lemmatizer instance, used by the lemmatised bigram cell below.
lem = WordNetLemmatizer()

In [41]:
# Quick sanity check: the single letter 'm' comes back unchanged (see output).
lem.lemmatize('m')

'm'

In [39]:
%%time

lemm_text = []
for review in tqdm(df['text']):
    sentences = []
    for sentence in review.split(' . '):
        con = get_conllu(model, sentence)
        prep = get_dep_tree(con)
        d = DependencyGraph(prep[0])
        d.root = d.nodes[0]
        nodes = list(d.triples())
        for i in range(1, len(nodes)):
            bigram = (str(lem.lemmatize(nodes[i][0][0])) + str(lem.lemmatize(nodes[i][2][0])))
            sentences.append(bigram)
    lemm_text.append(' '.join(sentences))

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))

  "The graph doesn't contain a node " "that depends on the root element."



Wall time: 34min 29s


In [40]:
# Attach the lemmatised bigram strings as a new column; `lemm_text` holds one
# entry per row of df['text'], in the same order. Note this mutates `df`.
df['lemm_sin_text'] = lemm_text

In [41]:
# Compare 'sin_text' with the new 'lemm_sin_text' column side by side.
df

Unnamed: 0,text,labels,sin_text,lemm_sin_text
0,plot two teen couples go to a church party dri...,0.0,plotcouples couplestwo couplesteen plotgo godr...,plotcouple coupletwo coupleteen plotgo godrink...
1,the happy bastard s quick movie review damn th...,0.0,damnbastard bastardthe bastardhappy bastards d...,damnbastard bastardthe bastardhappy bastards d...
2,it is movies like these that make a jaded movi...,0.0,isit ismovies moviesthese theselike movieswatc...,isit ismovie moviethese theselike moviewatch w...
3,quest for camelot is warner bros . first featu...,0.0,questcamelot camelotfor questbros brosis brosw...,questcamelot camelotfor questbros brosis brosw...
4,synopsis a mentally unstable man undergoing ps...,0.0,synopsissaves savespsychotherapy psychotherapy...,synopsissave savepsychotherapy psychotherapya ...
...,...,...,...,...
1995,wow what a movie . it s everything a movie can...,1.0,moviewow moviewhat moviea originalit originalm...,moviewow moviewhat moviea originalit originalm...
1996,richard gere can be a commanding actor but he ...,1.0,actorgere gererichard actorcan actorbe actora ...,actorgere gererichard actorcan actorbe actora ...
1997,glory starring matthew broderick denzel washin...,1.0,storyglory glorywashington washingtonstarring ...,storyglory glorywashington washingtonstarring ...
1998,steven spielberg s second epic film on world w...,1.0,masterpiecefilm filmspielberg spielbergsteven ...,masterpiecefilm filmspielberg spielbergsteven ...


In [42]:
# Persist the frame with both bigram columns; index=False keeps the row index
# out of the file.
df.to_csv('lemm_sin_movie.csv', index=False)