In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from data_utils import load_scores_data,load_text_xml,check_arsenl_data,check_duplicates_per_pos

In [2]:
# Load the gold emotion scores (CSV) and the sentence texts (XML) that
# the rest of the notebook works on.
scores = load_scores_data(
    'datasets/affectivetext_test.emotions.gold.csv'
)
sent = load_text_xml(
    'datasets/affectivetext_test.xml'
)

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of `scores`.
scores.describe()

Unnamed: 0,id,anger,disgust,fear,joy,sadness,surprise
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,999.5,8.563,5.076,16.144,19.003,18.494,18.624
std,288.819436,13.503347,10.027201,20.400182,21.08553,22.646077,14.165991
min,500.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,749.75,0.0,0.0,0.0,0.0,0.0,8.0
50%,999.5,1.0,0.0,8.0,12.0,11.0,16.0
75%,1249.25,12.25,7.0,24.0,35.0,27.0,25.0
max,1499.0,71.0,87.0,92.0,84.0,96.0,87.0


In [4]:
#sns.pairplot(scores.iloc[:,1:],diag_kind='hist')

In [5]:
#corr =scores.iloc[:,1:].corr()
#plt.figure(figsize=(10,7))
#sns.heatmap(corr, 
      #      xticklabels=corr.columns.values,
      #      yticklabels=corr.columns.values,annot=True)

In [6]:
#sns.boxplot(data=scores.iloc[:,1:])

In [7]:
def view_text_by_results(df_scores,df_sentences,col,func=max):
    """
    Returns one sentence from the given emotion column
    based on a given function for the emotion values

    The value looked for is ``func(df_scores[col])``. When several rows of
    ``df_scores`` share that value, the first one (in row order) is used.

    Parameters:
    ------------
    df_scores: scores DataFrame; must contain an 'id' column and column `col`
    df_sentences : sentences DataFrame; must contain 'id' and 'text' columns
    col : emotion column name in df_scores
    func {max, min, ...}: callable applied to the emotion values (default=max)

    Returns
    ------------
    result : str

    Raises
    ------------
    IndexError : if no row of df_scores equals the selected value, or if
                 df_sentences has no row with the matching id
    """
    selected_value = func(df_scores[col])
    matching_ids = df_scores.loc[df_scores[col] == selected_value, 'id'].tolist()
    if not matching_ids:
        raise IndexError(
            "no row in column %r equals the selected value %r" % (col, selected_value)
        )

    # Ids are compared as strings on both sides so the lookup works whether
    # the sentences frame stores its ids as strings or as numbers.
    sentence_id = str(matching_ids[0])
    id_matches = df_sentences['id'].astype(str) == sentence_id
    texts = df_sentences.loc[id_matches, 'text'].tolist()
    if not texts:
        raise IndexError("no sentence found with id %r" % sentence_id)
    return texts[0]

In [8]:
# For every column of `scores` after the first, print the sentence picked by
# max, then a separator line, then the sentence picked by min.
emotion_cols = scores.columns[1:]
for prefix, pick in (('max ', max), ('min ', min)):
    if pick is min:
        print('--------------------------------')
    for col in emotion_cols:
        print(prefix + col + ': ' + view_text_by_results(scores, sent, col, pick))

max anger: Israeli woman's tirade spurs PM outrage
max disgust: Teacher charged with sex assault
max fear: UK workers on alert for letter bombs
max joy: Families celebrate return of sons
max sadness: Iraqi death toll exceeded 34,000 in 2006, United Nations says
max surprise: Man rides stationary bike for 85 hours
--------------------------------
min anger: Test to predict breast cancer relapse is approved
min disgust: Test to predict breast cancer relapse is approved
min fear: Sights and sounds from CES
min joy: Trucks swallowed in subway collapse
min sadness: Sights and sounds from CES
min surprise: Golden Globes on their way


In [9]:
# Print the single-quoted path of each arsenl file (suffixes a, v, r, n).
# The two data checks below are currently disabled.
for i in 'avrn':
    quoted_path = '\'arsel/arsenl_' + i + '.txt\''
    print(quoted_path)
    # check_arsenl_data('arsel/arsenl_'+i+'.txt')
    # check_duplicates_per_pos('arsel/arsenl_'+i+'.txt')

'arsel/arsenl_a.txt'
'arsel/arsenl_v.txt'
'arsel/arsenl_r.txt'
'arsel/arsenl_n.txt'


In [10]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before code runs and edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [45]:
from arsel import Arsel
from evaluator import Evaluator
# Lexicon files, one per part of speech; the _a/_v/_r/_n suffixes line up with
# the adj/verb/adv/nouns variable names.
adj ='arsel/arsenl_a.txt'
verb ='arsel/arsenl_v.txt'
adv = 'arsel/arsenl_r.txt'
nouns = 'arsel/arsenl_n.txt'

# Two Arsel instances over the same four files, differing only in the flag
# passed (binary=True vs norm=True).
# NOTE(review): the get_emotionScores(...) results are neither assigned nor the
# cell's last expression, so they are discarded - presumably leftover smoke
# checks; confirm they have no needed side effects before removing them.
ar_binary = Arsel(adj,nouns,verb,adv,binary=True)
ar_binary.get_emotionScores('hAjis','noun')
ar_norm = Arsel(adj,nouns,verb,adv,norm=True)
ar_norm.get_emotionScores('hAjis','noun')

# Only ar_norm is handed to the Evaluator; ar_binary is not used again in this cell.
ev = Evaluator(ar_norm)
# NOTE(review): the meaning of the third positional argument (True) is not
# visible here - check Evaluator.load.
ev.load('datasets/Buckwalter_all_sentences_google_translated.txt','datasets/affectivetext_test.emotions.gold.csv',True)
# Keep the dataset statistics in `d` so they can be inspected afterwards.
d = ev.dataset_info()

  a = (a-min(a)) / (max(a)-min(a))


In [46]:
# Display the statistics dict returned by ev.dataset_info().
d

{'uniq_verbs_count': 324,
 'uniq_nouns_count': 1571,
 'uniq_advs_count': 7,
 'uniq_adjs_count': 339,
 'words_count': 7229,
 'words_uniq': 2688,
 'unsupported_count': 442,
 'unsupported_uniq_count': 302,
 'unsupported_pos_count': 114}

2241

In [35]:
# Count the distinct items stored under 'verb' in ev.buck_obj.buck_dict.
# NOTE(review): the recorded output (321) differs from 'uniq_verbs_count' (324)
# in the displayed `d`; the execution counts (35 here vs 46 for `d`) suggest
# this output is stale - re-run in order to confirm.
len(set(ev.buck_obj.buck_dict['verb']))

321

In [36]:
# Scratch tally of hand-typed counts. 321 matches the verb count displayed by
# the previous cell; the origin of the other terms is not visible here.
# TODO(review): derive these from `d` / `ev` instead of retyping numbers.
321+1314+5+337+447

2424

In [13]:
# Collect the first element of every record in ev.unsupported_madamira_list.
l = [
    record[0]
    for record in ev.unsupported_madamira_list
]

In [67]:
# Scratch arithmetic: 2424 is the sum displayed by the earlier tally cell and
# 442 equals 'unsupported_count' in the displayed `d` - presumably the intended
# sources; TODO(review) confirm and compute from the objects instead.
2424+442

2866

In [17]:
# Display the distinct values collected in `l` (duplicates collapsed).
set(l)

{'',
 '$',
 '$$wl',
 '$wAHn',
 '$wflyftrz',
 "'",
 '21st-Ø³ÙŠÙ†ØªÙˆØ±ÙŠ',
 '2M',
 '<fAkwys',
 '<ksbrs',
 '<kshAnj',
 '<njlykywn',
 '<rks',
 '<ybwd',
 '<yfryA',
 '<yrv',
 '<yrvlynk',
 '<yrwys',
 '<ysynbAyfA',
 '>dylAyd',
 '>jAks',
 '>kwrywm',
 '>lmndynjr',
 '>my$',
 '>mynws',
 '>ndrwmydA',
 '>ntArktykA',
 '>wbAmA',
 '>wbynyd',
 '>wdysyws',
 '>wfys',
 '>wjAy',
 '>wksfAm',
 '>wkwr',
 '>wrAkl',
 '>wrlyAnz',
 '>wtkry',
 '>wzfyst',
 '>ydwl',
 'A',
 'A380',
 'AlAHtrAr',
 'AlbAndA',
 'AldEk',
 'AljrAbyAt',
 'AlkAkAw',
 'Alkrykyt',
 'AllqAHAt',
 'AllystyryA',
 'Almjrp',
 'Almtswqyn',
 'AlnybAly',
 'AlrAtnj',
 'Alrwkrz',
 'Alrwkyz',
 'AlsAlmwnylA',
 'Alsynyp',
 'AlzlAjAt',
 'Al|ybwd',
 'Amazon.com',
 'Armstrwng',
 'Arwyw',
 'AwrlyAnz',
 'Ayfl',
 'Ayfrtwn',
 'C',
 "Eto'o",
 'G7',
 'G7:',
 'H.I.V.',
 'H5N1',
 'I-95',
 'J.K.',
 'L.A',
 'N',
 'PS3',
 'R400',
 'SPACE.com:',
 'Sqy',
 'X',
 'bAbwn',
 'bAl$lAny',
 'bAlAHtrAr',
 'bAnAnAkwndA',
 'bArbArw',
 'bAsrz',
 'bAtrswn',
 'bAvfAyndr',
 'bb',
 'bftA