In [43]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import KFold, cross_val_score, train_test_split, cross_val_predict
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from pandas import DataFrame
import numpy as np
from scipy import sparse
import re
from scipy.stats.stats import pearsonr   
import pandas as pd
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [2]:
df_anger_en_train = pd.read_csv("data/en/EI-reg-en_anger_train.txt", header=None, names=["text", "emotion", "intensity"], sep="	")
df_anger_en_dev = pd.read_csv("data/en/2018-EI-reg-En-anger-dev.txt", header=None, names=["text", "emotion", "intensity"], sep="	")
df_anger_es_train = pd.read_csv("data/es/2018-EI-reg-Es-anger-train.txt", header=None, names=["text", "emotion", "intensity"], sep="	")
df_anger_es_dev = pd.read_csv("data/es/2018-EI-reg-Es-anger-dev.txt", header=None, names=["text", "emotion", "intensity"], sep="	")

df_anger_en = df_anger_en_train.append([df_anger_en_dev])
df_anger_es = df_anger_es_train.append([df_anger_es_dev])

In [3]:
df_anger_en["emotion"] = pd.Categorical(df_anger_en["emotion"]).codes

In [4]:
df_anger_en.tail(10)

Unnamed: 0,text,emotion,intensity
2018-en-anger-dev-191,+ cant get to tell them what will offend them ...,0,0.29
2018-en-anger-dev-192,Seriously about to smack someone in the face 😵...,0,0.922
2018-en-anger-dev-193,Been at work for not even 4 hours and I've thr...,0,0.833
2018-en-anger-dev-194,I click on download on my PC. Message says 'Th...,0,0.859
2018-en-anger-dev-195,Why does @DANCEonFOX get rudely interrupted by...,0,0.875
2018-en-anger-dev-196,If I have to hear one more time how I am intim...,0,0.766
2018-en-anger-dev-197,i've been to 1 and 1/3 of my classes today and...,0,0.383
2018-en-anger-dev-198,quick note about insta stories how the f do yo...,0,0.5
2018-en-anger-dev-199,@AliceT120 @shaney_waney1 @shaney_waney1 u hor...,0,0.75
2018-en-anger-dev-200,#Twitter these days is just a massive critical...,0,0.719


In [5]:
# Pipeline function
def use_pipeline(X, y, pipeline):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=40)    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(pearsonr(y_test, y_pred))
    print(np.corrcoef(y_test, y_pred))
    

X_en = df_anger_en["text"]
y_en = df_anger_en["intensity"]
X_es = df_anger_es["text"]
y_es = df_anger_es["intensity"]
pipeline1 = Pipeline([
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer(use_idf=True)),
        ('classifier', LinearRegression())
    ])
# baseline for English - Anger
use_pipeline(X_en, y_en, pipeline1)
# baseline for Spanish - Anger
use_pipeline(X_es, y_es, pipeline1)

(0.49672225351908889, 5.0334462759154037e-31)
[[ 1.          0.49672225]
 [ 0.49672225  1.        ]]
(0.47509418260514341, 1.7598213856502596e-15)
[[ 1.          0.47509418]
 [ 0.47509418  1.        ]]


In [8]:
# English word2vec pre-trained word embeddings
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
model

<gensim.models.keyedvectors.KeyedVectors at 0x21dabde6be0>

In [9]:
# Spanish word2vec pre-trained word embeddings
model2 = gensim.models.KeyedVectors.load_word2vec_format('sbw_vectors.bin', binary=True)
model2

<gensim.models.keyedvectors.KeyedVectors at 0x21dabde6780>

In [56]:
# iemand een oplossing voor deze UnicodeDecodeError????
# Arabic word2vec pre-trained word embeddings
model3 = gensim.models.KeyedVectors.load_word2vec_format('wiki.ar.bin', binary=True, unicode_errors="replace")
model3

2017-10-15 18:13:37,873 : INFO : loading projection weights from wiki.ar.bin


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xba in position 0: invalid start byte

In [None]:
print(model.most_similar(positive=['woman', 'king'], negative=['man']))
print(model.similarity('woman', 'man'))

In [None]:
print(model2.most_similar(positive=['mujer', 'rey'], negative=['hombre']))
print(model2.similarity('mujer', 'hombre'))

In [None]:
##########################################################################################################
##########################################################################################################
# THE EMOTION PIPELINE BELOW IS ONLY FOR REFERENCE > you can use it to create new features based on this

class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key."""
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

class ItemExtractor(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        # change 'emotion' to key you want to use
        return [{'emotion': text}
                for text in posts]
    
# pipeline that incorporates both text and emotion features
pipeline2 = Pipeline([

    ('union', FeatureUnion(
        transformer_list=[
            ('text', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('tfidf', TfidfVectorizer(min_df=50)),
            ])),

            ('emotion', Pipeline([
                ('selector', ItemSelector(key='emotion')),
                ('stats', ItemExtractor()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ])),
        ],

        # give equal weights to text and emotion features
        transformer_weights={
            'text': 1.0,
            'emotion': 0.0,
            },
        )),
        ('classifier', LinearRegression())
        ])
