In [5]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from pandas import DataFrame
import numpy as np
from scipy import sparse
import re
from scipy.stats.stats import pearsonr   
import pandas as pd

In [7]:
df_anger = pd.read_csv("EI-reg-en_anger_train.txt", header=None, names=["text", "emotion", "intensity"], sep="	")
df_fear = pd.read_csv("EI-reg-en_fear_train.txt", header=None, names=["text", "emotion", "intensity"], sep="	")
df_joy = pd.read_csv("EI-reg-en_joy_train.txt", header=None, names=["text", "emotion", "intensity"], sep="	")
df_sadness = pd.read_csv("EI-reg-en_sadness_train.txt", header=None, names=["text", "emotion", "intensity"], sep="	")
df = df_anger.append([df_fear, df_joy, df_sadness])

In [8]:
df["emotion"] = pd.Categorical(df["emotion"]).codes

In [9]:
df.tail(10)

Unnamed: 0,text,emotion,intensity
41523,@ITdominiccoyle @IrishTimesBiz Between Barroso...,3,0.458
41524,"@pottermore : I can't find my patronus, the we...",3,0.729
41525,Nutella is pine green forget me nots are ivory...,3,0.125
41526,I was not made for this world. #empath #unhappy,3,0.75
41527,"She used to be beautiful, but she lived her li...",3,0.398
41528,Why does Candice constantly pout #GBBO 💄😒,3,0.396
41529,"@redBus_in #unhappy with #redbus CC, when I ta...",3,0.604
41530,"@AceOperative789 no pull him afew weeks ago, s...",3,0.479
41531,I'm buying art supplies and I'm debating how s...,3,0.375
41532,@sainsburys Could you ask your Chafford Hundre...,3,0.438


In [10]:
# PIPELINE WITHOUT INCORPORATING THE EMOTION FEATURE
pipeline = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer(use_idf=True)),
    ('classifier', LinearRegression())
])

In [11]:
X = df["text"]
y = df["intensity"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(pearsonr(y_test, y_pred))
print(np.corrcoef(y_test, y_pred))

(0.29013908027270746, 8.6589669175317977e-36)
[[ 1.          0.29013908]
 [ 0.29013908  1.        ]]


In [14]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key."""
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

class EmotionExtractor(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        return [{'emotion': text}
                for text in posts]
    
# pipeline that does incorporate both text and emotion features
pipeline2 = Pipeline([

    ('union', FeatureUnion(
        transformer_list=[
            ('text', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('tfidf', TfidfVectorizer(min_df=50)),
            ])),

            ('emotion', Pipeline([
                ('selector', ItemSelector(key='emotion')),
                ('stats', EmotionExtractor()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ])),
        ],

        # give equal weights to text and emotion features
        transformer_weights={
            'text': 0.5,
            'emotion': 0.5,
            },
        )),
        ('classifier', LinearRegression())
        ])
X = df[["emotion", "text"]]
y = df["intensity"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
pipeline2.fit(X_train, y_train)
y_pred = pipeline2.predict(X_test)

print(pearsonr(y_test, y_pred))
print(np.corrcoef(y_test, y_pred))


(0.37413374403823801, 4.1089960537085461e-60)
[[ 1.          0.37413374]
 [ 0.37413374  1.        ]]
