In [1]:
import re 
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
def read_data(file):
    data = []
    with open(file, 'r')as f:
        for line in f:
            line = line.strip()
            label = ' '.join(line[1:line.find("]")].strip().split())
            text = line[line.find("]")+1:].strip()
            data.append([label, text])
    return data

In [3]:
file = 'dataset1_es.txt'
data = read_data(file)
print("Number of instances: {}".format(len(data)))

Number of instances: 7480


### tokenización y la generación de las características de una oración 

In [4]:
def ngram(token, n): 
    output = []
    for i in range(n-1, len(token)): 
        ngram = ' '.join(token[i-n+1:i+1])
        output.append(ngram) 
    return output

def create_feature(text, nrange=(1, 1)):
    text_features = [] 
    text = text.lower() 
    text_alphanum = re.sub('[^a-z0-9#]', ' ', text)
    for n in range(nrange[0], nrange[1]+1): 
        text_features += ngram(text_alphanum.split(), n)    
    text_punc = re.sub('[a-z0-9]', ' ', text)
    text_features += ngram(text_punc.split(), 1)
    return Counter(text_features)

### Función para almacenar las etiquetas, que se basarán en emociones como Alegría, Miedo, Ira, etc.:

In [5]:
def convert_label(item, name): 
    #print('item: ', item)
    #print('name: ', name)
    items = list(map(float, item.split()))
    label = ""
    for idx in range(len(items)): 
        if items[idx] == 1: 
            label += name[idx] + " "
    
    return label.strip()

emotions = ["alegría", 'miedo', "ira", "tristeza", "disgusto", "vergüenza", "culpa"]

X_all = []
y_all = []
for label, text in data:
    y_all.append(convert_label(label, emotions))
    X_all.append(create_feature(text, nrange=(1, 4)))

### Separación de datos en conjuntos de entrenamiento y prueba

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.2, random_state = 123)

def train_test(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    train_acc = accuracy_score(y_train, clf.predict(X_train))
    test_acc = accuracy_score(y_test, clf.predict(X_test))
    return train_acc, test_acc

from sklearn.feature_extraction import DictVectorizer
vectorizer = DictVectorizer(sparse = True)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

### Entrenamiento con cuatro modelos de aprendizaje automático. Se imprime resultados para la selección del mejor modelo 

In [7]:
svc = SVC()
lsvc = LinearSVC(random_state=123)
rforest = RandomForestClassifier(random_state=123)
dtree = DecisionTreeClassifier()

clifs = [svc, lsvc, rforest, dtree]

# train and test them 
print("| {:25} | {} | {} |".format("Classifier", "Training Accuracy", "Test Accuracy"))
print("| {} | {} | {} |".format("-"*25, "-"*17, "-"*13))
for clf in clifs: 
    clf_name = clf.__class__.__name__
    train_acc, test_acc = train_test(clf, X_train, X_test, y_train, y_test)
    print("| {:25} | {:17.7f} | {:13.7f} |".format(clf_name, train_acc, test_acc))

#clf_name = svc.__class__.__name__
#train_acc, test_acc = train_test(svc, X_train, X_test, y_train, y_test)
#print("| {:25} | {:17.7f} | {:13.7f} |".format(clf_name, train_acc, test_acc))



| Classifier                | Training Accuracy | Test Accuracy |
| ------------------------- | ----------------- | ------------- |
| SVC                       |         0.9097594 |     0.4699198 |




| LinearSVC                 |         0.9988302 |     0.5514706 |
| RandomForestClassifier    |         0.9988302 |     0.5421123 |
| DecisionTreeClassifier    |         0.9988302 |     0.4318182 |


## Definición de la etiqueta emocional

In [10]:
l = ["alegría", 'miedo', "ira", "tristeza", "disgusto", "vergüenza", "culpa"]
l.sort()
label_freq = {}
for label, _ in data: 
    label_freq[label] = label_freq.get(label, 0) + 1

# print the labels and their counts in sorted order 
for l in sorted(label_freq, key=label_freq.get, reverse=True):
    print("{:10}({})  {}".format(convert_label(l, emotions), l, label_freq[l]))

alegría   (1. 0. 0. 0. 0. 0. 0.)  1084
ira       (0. 0. 1. 0. 0. 0. 0.)  1080
tristeza  (0. 0. 0. 1. 0. 0. 0.)  1079
miedo     (0. 1. 0. 0. 0. 0. 0.)  1078
disgusto  (0. 0. 0. 0. 1. 0. 0.)  1057
culpa     (0. 0. 0. 0. 0. 0. 1.)  1057
vergüenza (0. 0. 0. 0. 0. 1. 0.)  1045


## Definición del emoji y aplicación del modelo 

In [60]:
emoji_dict = {"alegría":"😂", "miedo":"😱", "ira":"😠", "tristeza":"😢", "disgusto":"😒", "vergüenza":"😳", "culpa":"😳"}
t1 = "alegría impresionante"
t2 = "Tengo miedo a los perros"
t3 = "Mi perro murió ayer"
t4 = "ya no te amo...!"
t5 = 'alegría alegría murió murió alegría alegría alegría'

texts = [t1, t2, t3, t4, t5]
for text in texts: 
    features = create_feature(text, nrange=(1, 4))
    features = vectorizer.transform(features)
    prediction = clf.predict(features)[0]
    print( text,emoji_dict[prediction], prediction)

alegría impresionante 😂 alegría
Tengo miedo a los perros 😱 miedo
Mi perro murió ayer 😢 tristeza
ya no te amo...! 😳 vergüenza
alegría alegría murió murió alegría alegría alegría 😢 tristeza
