Downloading and installing libraries, set the workset

In [0]:
!pip install pymorphy2

import json 
from collections import Counter, defaultdict
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm


import nltk
import pymorphy2
from nltk.tokenize import word_tokenize
from utilities import verb_extractor, features_extractor

from google.colab import drive

nltk.download('punkt')
morph = pymorphy2.MorphAnalyzer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Open datasets

In [0]:
pos = pd.read_csv("positive.csv", sep = ";", header = None)
neg = pd.read_csv("negative.csv", sep = ";", header = None)

In [0]:
def sent_tok (sent):
    toksent = []
    sent = word_tokenize(sent)
    for word in sent:
        p = morph.parse(word)[0]
        toksent.append(p.normal_form)
    return toksent
ptok = pos[3].apply(sent_tok)
d = {'tok': ptok.tolist(), 'untok': pos[3].tolist()}
posdf = pd.DataFrame(data = d)
ntok = neg[3].apply(sent_tok)
p = {'tok': ntok.tolist(), 'untok': neg[3].tolist()}
negdf = pd.DataFrame(data = p)

Rework json to pandas dataframe for visualising

In [0]:
f = open('frames.json',)
data = json.load(f)

elements = ['0','1']
variants = []
effect = []
polarity = []
state = []
value = []
rolea0 = []
rolea1 = []
rolea2 = []
title = []

for el in elements:
    for i in range(200):
        name = el + '_' + str(i)
        if name in data:
          if 'effect' in data[name]['frames']:
              effect.append(data[name]['frames']['effect'])
          else:
              effect.append(None)

          if 'polarity' in data[name]['frames']:
              polarity.append(data[name]['frames']['polarity'])
          else:
              polarity.append(None)

          if 'state' in data[name]['frames']:
              state.append(data[name]['frames']['state'])
          else:
              state.append(None)

          if 'value' in data[name]['frames']:
              value.append(data[name]['frames']['value'])
          else:
              value.append(None)

          if 'a0' in data[name]['roles']:
              rolea0.append(data[name]['roles']['a0'])
          else:
              rolea0.append(None)

          if 'a1' in data[name]['roles']:
              rolea1.append(data[name]['roles']['a1'])
          else:
              rolea1.append(None)

          if 'a2' in data[name]['roles']:
              rolea2.append(data[name]['roles']['a2'])
          else:
              rolea2.append(None)

          title.append(data[name]['title'])

          variants.append(data[name]['variants'])

extrdata = {'variants' : variants,
            'title' : title,
            'role_a0' : rolea0,
            'role_a1' : rolea1,
            'role_a2' : rolea2,
            'value' : value,
            'state' : state,
            'polarity' : polarity,
            'effect' : effect}
framework = pd.DataFrame(data = extrdata)

Extracting positive and negative verbs and their forms

In [0]:
posv = verb_extractor(pos)
negv = verb_extractor(neg)

HBox(children=(FloatProgress(value=0.0, max=114911.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=111923.0), HTML(value='')))




In [0]:
upv = list(posv.keys()) #List of unique positive verbs
unv = list(negv.keys()) #List of unique negative verbs
a = list(set(upv) | set(unv))

Make a dataset with collected udeful data

In [0]:
tweetverbsdf = pd.DataFrame(columns = ['pos_entries', 'neg_entries', 'sing', 'plur', 'none_num', 'fem', 'masc', 'neut', 'none_gen', 'past', 'pres', 'futr', 'none_tense', '1per', '2per', '3per', 'none_per'], index = a)

In [0]:
for verb in tweetverbsdf.index.values:
    if verb in upv:
        tweetverbsdf.loc[verb, 'pos_entries'] = len(posv[verb])
    else:
        tweetverbsdf.loc[verb, 'pos_entries'] = 0
    if verb in unv:
        tweetverbsdf.loc[verb, 'neg_entries'] = len(negv[verb])
    else:
        tweetverbsdf.loc[verb, 'neg_entries'] = 0

In [0]:
genddict, numbdict, tensedict, perdict = features_extractor(tweetverbsdf.index.values, posv, negv)
for verb in tweetverbsdf.index.values:
    numbers = numbdict[verb]
    length = len(numbers)
    if length == 0:
        tweetverbsdf.loc[verb,'none_num'] = 100
        tweetverbsdf.loc[verb,'sing'] = 0
        tweetverbsdf.loc[verb,'plur'] = 0
    else:
        c = Counter(numbers)
        tweetverbsdf.loc[verb, 'none_num']= c[None]/length*100
        tweetverbsdf.loc[verb, 'sing'] = c['sing']/length*100
        tweetverbsdf.loc[verb, 'plur'] = c['plur']/length*100

    genders = genddict[verb]
    length = len(genders)
    if length == 0:
        tweetverbsdf.loc[verb,'none_gen'] = 100
        tweetverbsdf.loc[verb,'masc'] = 0
        tweetverbsdf.loc[verb,'fem'] = 0
        tweetverbsdf.loc[verb,'neut'] = 0
    else:
        c = Counter(genders)
        tweetverbsdf.loc[verb,'none_gen'] = c[None]/length*100
        tweetverbsdf.loc[verb,'masc'] = c['masc']/length*100
        tweetverbsdf.loc[verb,'fem'] = c['femn']/length*100
        tweetverbsdf.loc[verb,'neut'] = c['neut']/length*100

    tense = tensedict[verb]
    length = len(tense)
    if length == 0:
        tweetverbsdf.loc[verb,'none_tense'] = 100
        tweetverbsdf.loc[verb,'futr'] = 0
        tweetverbsdf.loc[verb,'pres'] = 0
        tweetverbsdf.loc[verb,'past'] = 0
    else:
        c = Counter(tense)
        tweetverbsdf.loc[verb,'none_tense'] = c[None]/length*100
        tweetverbsdf.loc[verb,'futr'] = c['futr']/length*100
        tweetverbsdf.loc[verb,'pres'] = c['pres']/length*100
        tweetverbsdf.loc[verb,'past'] = c['past']/length*100

    person = perdict[verb]
    length = len(person)
    if length == 0:
        tweetverbsdf.loc[verb,'none_per'] = 100
        tweetverbsdf.loc[verb,'1per'] = 0
        tweetverbsdf.loc[verb,'2per'] = 0
        tweetverbsdf.loc[verb,'3per'] = 0
    else:
        c = Counter(person)
        tweetverbsdf.loc[verb,'none_per'] = c[None]/length*100
        tweetverbsdf.loc[verb,'1per'] = c['1per']/length*100
        tweetverbsdf.loc[verb,'2per'] = c['2per']/length*100
        tweetverbsdf.loc[verb,'3per'] = c['3per']/length*100

HBox(children=(FloatProgress(value=0.0, max=19791.0), HTML(value='')))




In [0]:
tweetverbsdf['summa'] = tweetverbsdf['pos_entries'] + tweetverbsdf['neg_entries']
tweetverbsdf['pos_entries'] = tweetverbsdf['pos_entries'] / tweetverbsdf['summa'] * 100
tweetverbsdf['neg_entries'] = tweetverbsdf['neg_entries'] / tweetverbsdf['summa'] * 100

Extracting verbs from frames

In [0]:
frverbs = framework['variants'].tolist()
merge = lambda ll: [el for lst in ll for el in lst]
mergedfrverbs = list(set(merge(frverbs)))

Find intersection between frame verbs and tweet verbs

In [0]:
intersection_verbs = list(set(tweetverbsdf.index.values) & set(mergedfrverbs))
temp_df = tweetverbsdf.T
train_data = pd.DataFrame(columns = ['pos_entries', 'neg_entries', 'sing', 'plur', 'none_num', 'fem', 'masc', 'neut', 'none_gen', 'past', 'pres', 'futr', 'none_tense', '1per', '2per', '3per', 'none_per', 'summa'])
for verb in intersection_verbs:
    train_data.loc[verb] = temp_df[verb].tolist()
tweetverbsdf = tweetverbsdf.drop(intersection_verbs, axis = 0)

In [0]:
counter = []
effects = defaultdict(list)
for i in tqdm(range(len(frverbs))):
    for verb in frverbs[i]:
        if verb in intersection_verbs and verb not in counter:
            effects[verb].append(framework[i:i+1]['effect'].tolist())
        counter.append(verb)

HBox(children=(FloatProgress(value=0.0, max=310.0), HTML(value='')))




In [0]:
a0_list = []
a1_list = []
for i in tqdm(range(len(intersection_verbs))):
    if intersection_verbs[i] in list(effects.keys()):
        k = effects[intersection_verbs[i]][0][0]
        a0 = 0
        a1 = 0
        if k == None:
            a0_list.append(a0)
            a1_list.append(a1)
        else:
            for ef in k:
                if ef[0] == 'a0':
                    if ef[1] == '-':
                        a0 = ef[2] * -1
                    else:
                        a0 = ef[2]
                if ef[0] == 'a1':
                    if ef[1] == '-':
                        a1 = ef[2] * -1
                    else:
                        a1 = ef[2]
            a0_list.append(a0)
            a1_list.append(a1)

HBox(children=(FloatProgress(value=0.0, max=2046.0), HTML(value='')))




In [0]:
train_data['a0_effect'] = a0_list
train_data['a1_effect'] = a1_list
train_data['expl_effect'] = train_data['a0_effect'] + train_data['a1_effect']

In [0]:
train_data = train_data.sort_values(by = 'summa', ascending = False)
tweetverbsdf = tweetverbsdf.sort_values(by = 'summa', ascending = False)

In [0]:
tweetverbsdf = tweetverbsdf.loc[tweetverbsdf['summa'] > 50]
train_data = train_data.loc[train_data['summa'] > 50]

In [0]:
lotest_verbs = tweetverbsdf.index.values
lotrain_verbs = train_data.index.values

new_test_df = tweetverbsdf
new_train_df = train_data



def exex (exdf1, exdf2, verbs):
    ex1 = exdf1['tok'].tolist()
    ex2 = exdf2['tok'].tolist()
    examplelist = []
    for i in tqdm(range(len(verbs))):
        for z in range(len(ex1)):
            if len(examplelist) == i+1:
                break
            if verbs[i] in ex1[z]:
                examplelist.append(exdf1['untok'][z])

        for p in range(len(ex2)):
            if len(examplelist) == i+1:
                break
            if verbs[i] in ex2[p]:
                examplelist.append(exdf2['untok'][p])
            
    return examplelist

exlist =  exex(posdf, negdf, lotrain_verbs)
new_train_df['examples'] = exlist

HBox(children=(FloatProgress(value=0.0, max=374.0), HTML(value='')))




In [0]:
from google.colab import drive
drive.mount('/drive')
new_train_df.to_csv('/drive/My Drive/Colab Notebooks/test_we.csv')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


Downloading recieved data

In [0]:
tweetverbsdf.to_csv('/drive/My Drive/Colab Notebooks/test_data.csv')
train_data.to_csv('/drive/My Drive/Colab Notebooks/train_data.csv')

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler


from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

In [0]:
def predictor(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=179)
    modulation = model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print('F1-macro score: ', f1_score(preds, y_test, average='macro'))
    print('Accuracy score: ', accuracy_score(preds, y_test))
    return y_test, preds

In [0]:
def class_dist (y_test, y_pred):
    answers = defaultdict(list)
    for i in range(len(y_test)):
        if y_test[i] == y_pred[i]:
           answers[y_test[i]].append(True)
        else:
           answers[y_test[i]].append(False)
    return answers         

In [0]:
model = MLPClassifier(random_state=179)
parameters = {
    "activation": ["identity","logistic","tanh","relu"],
    "solver": ["lbfgs","sgd","adam"],
    "learning_rate":["constant","invscaling","adaptive"],
    "max_iter": np.arange(100,300,100),

}
MLP = GridSearchCV(model, parameters, cv = 5, verbose=True, n_jobs=-1, scoring = 'f1_macro')
y_test, y_pred = predictor(MLP, XT, y)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    2.1s finished


ValueError: ignored

In [0]:
ans = class_dist(y_test, y_pred)
Counter(ans[2])

Counter({False: 7, True: 40})

In [0]:
pipe = Pipeline([('classifier' , LogisticRegression())])
param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2', 'elasticnet'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['liblinear']}
]
lr = GridSearchCV(pipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1, scoring = 'f1_macro')
y_test, y_pred = predictor(lr, XT, y)
ans = class_dist(y_test, y_pred)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


F1-macro score:  0.4401323308270676
Accuracy score:  0.46774193548387094


[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    3.6s finished


In [0]:
model = KNeighborsClassifier()
parameters = {
    "n_neighbors": np.arange(1,11),
    "weights": ["uniform", "distance"],
    "algorithm": ["auto","ball_tree","kd_tree","brute"],
    "metric":["euclidean", "manhattan"],
}
KNC = GridSearchCV(model, param_grid=parameters,scoring = 'f1_macro')
y_test, y_pred = predictor(KNC, XT, y)

Models for multilabel classification

In [0]:
def predictor_v2(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=179)
    modulation = model.fit(X_train, y_train)
    preds = model.predict(X_test)
    z=0
    for i in range(len(y_test)):
        if y_test[i].all() == preds[i].all():
            z += 1
    print("Accuracy score: ", z/len(y_test))
    return y_test, preds

In [0]:
test = pd.read_csv('test_data.csv', index_col = 0)
train = pd.read_csv('train_data.csv', index_col = 0)
train['expl_effect'] = train.apply(lambda x: round(x['expl_effect'], 0), axis = 1)
train['a0_effect'] = train.apply(lambda x: round(x['a0_effect'], 0), axis = 1)
train['a1_effect'] = train.apply(lambda x: round(x['a1_effect'], 0), axis = 1)
#train = train.drop(columns = ['sing','plur','none_num'])
#train = train.drop(columns = ['fem','masc','neut', 'none_gen'])
#train = train.drop(columns = ['past','pres','futr','none_tense'])
#train = train.drop(columns = ['1per','2per','3per','none_per'])
#train = train[(train.expl_effect != -2)]
y = train[['a0_effect',	"a1_effect"]].to_numpy()
XT = train.drop(columns = ['expl_effect', 'summa', 'examples','a0_effect',	"a1_effect"])

In [0]:
for i in np.arange(10,30,1):
    model = MultiOutputClassifier(KNeighborsClassifier(n_neighbors = i))
    y_test, y_pred = predictor_v2(model, XT, y)

In [0]:
model = MultiOutputClassifier(MLPClassifier(random_state=179))
y_test, y_pred = predictor_v2(model, XT, y)

Accuracy score:  0.7338709677419355


