In [159]:
import logging
import pandas as pd
import numpy as np
from numpy import random
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

### Snippet of the data
Here we take a look of expert annotations data. We can see that a text is in "agreement throughout" doesn't always lead to its sentiment to be "positive". Also, we can see "constructive" texts mostly fall into our classifications of ERICs (that we'll be taking below) by looking at their sd_type.

In [118]:
df = pd.read_csv('./data/yahoodata/ydata-ynacc-v1_0_expert_annotations.tsv',sep='\t')
df.head()

Unnamed: 0,sdid,commentindex,headline,url,guid,commentid,timestamp,thumbs-up,thumbs-down,text,parentid,constructiveclass,sd_agreement,sd_type,sentiment,tone,commentagreement,topic,intendedaudience,persuasiveness
0,53971,2,Disneyland Worker Found Dead in Haunted Mansion,http://www.cosmopolitan.com/lifestyle/news/a56...,rjrPtwH5oVVuQnEXX3hf,00003n000000000000000000000000-ed2ae6d0-32ac-4...,1459917444,,,"These things happen , Every job has its dangers.",1459879464596-a3771c05-fd2e-4f44-a26a-23baec3b...,Constructive,,Positive/respectful,negative,,Disagreement with commenter,Off-topic with article,Reply to a specific commenter,Not persuasive
1,53971,0,Disneyland Worker Found Dead in Haunted Mansion,http://www.cosmopolitan.com/lifestyle/news/a56...,VaW6HEsuOFUAIBqjw1k~,1459879464596-a3771c05-fd2e-4f44-a26a-23baec3b...,1459879464,1.0,,Sad to hear such a bad thing. Very dangerous j...,,Constructive,,Positive/respectful,mixed,,,Off-topic with article,Broadcast message / general audience,Not persuasive
2,53971,1,Disneyland Worker Found Dead in Haunted Mansion,http://www.cosmopolitan.com/lifestyle/news/a56...,uwQePj970KaMZuW3~9Q9,00002n000000000000000000000000-1c30b878-b717-4...,1459881644,,,Yes..because too many houses in EU look like t...,1459879464596-a3771c05-fd2e-4f44-a26a-23baec3b...,Constructive,,Positive/respectful,neutral,Informative,,Off-topic with article,Reply to a specific commenter,Not persuasive
3,135929,0,This Old Navy Ad Featuring an Interracial Fami...,http://mic.com/articles/142323/this-old-navy-a...,fixyWJivQjEQtPLLVXsu,1462203719963-3eeffb02-faae-4b51-9174-704c57e6...,1462203719,18.0,3.0,"I am frankly quite SICK of the phrase ""shoved ...",,Not constructive,Agreement throughout,Off-topic/digression,negative,Mean,,Off-topic with article,Broadcast message / general audience,Persuasive
4,135929,1,This Old Navy Ad Featuring an Interracial Fami...,http://mic.com/articles/142323/this-old-navy-a...,_TDnK715vO5y0OzZz_n4,00002I000000000000000000000000-7ef2ac58-bd84-4...,1462204643,7.0,2.0,"Ya, I always wonder why the conservatives are ...",1462203719963-3eeffb02-faae-4b51-9174-704c57e6...,Not constructive,Agreement throughout,Off-topic/digression,neutral,Sarcastic,Agreement with commenter,Off-topic with article,Reply to a specific commenter,Not persuasive


In [119]:
df[df['constructiveclass']=="Constructive"].groupby('sd_type').count()

Unnamed: 0_level_0,sdid,commentindex,headline,url,guid,commentid,timestamp,thumbs-up,thumbs-down,text,parentid,constructiveclass,sd_agreement,sentiment,tone,commentagreement,topic,intendedaudience,persuasiveness
sd_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Argumentative (back and forth),5930,5930,5930,5930,5930,5930,5930,4125,3870,5930,5097,5930,5712,5900,5015,4339,1895,5892,5840
"Argumentative (back and forth),Flamewar (insulting)",477,477,477,477,477,477,477,317,323,477,414,477,477,475,443,361,113,472,472
"Argumentative (back and forth),Flamewar (insulting),Personal stories",34,34,34,34,34,34,34,26,22,34,31,34,20,34,31,30,6,34,30
"Argumentative (back and forth),Flamewar (insulting),Positive/respectful",7,7,7,7,7,7,7,7,7,7,6,7,7,7,6,2,0,7,7
"Argumentative (back and forth),NA",10,10,10,10,10,10,10,9,5,10,8,10,6,9,9,7,2,9,10
"Argumentative (back and forth),Off-topic/digression",603,603,603,603,603,603,603,405,346,603,526,603,491,598,506,479,443,598,569
"Argumentative (back and forth),Off-topic/digression,Flamewar (insulting)",35,35,35,35,35,35,35,26,27,35,32,35,35,35,34,22,12,35,35
"Argumentative (back and forth),Off-topic/digression,Flamewar (insulting),Personal stories",7,7,7,7,7,7,7,5,3,7,6,7,7,7,7,5,2,7,7
"Argumentative (back and forth),Off-topic/digression,Flamewar (insulting),Positive/respectful,Personal stories",8,8,8,8,8,8,8,6,1,8,7,8,8,8,8,7,4,8,8
"Argumentative (back and forth),Off-topic/digression,Personal stories",94,94,94,94,94,94,94,56,62,94,84,94,73,93,85,72,64,94,87


## Definition of ERICs
ERICs are characterized by argumentative, respectful exchanges containing persuasive, informative, and/or sympathetic comments. They tend to stay on topic with the original article and not to contain funny, mean, or sarcastic comments. We found differences between the distribution of annotations made by trained and untrained anno- tators, but high levels of agreement within each group, suggesting that crowdsourcing annotations for this task is reliable.

Now, we select the columns related to ERICs and mainly look at these.

In [120]:
df = df[['text','constructiveclass','sd_type','sentiment','persuasiveness','tone']]
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,text,constructiveclass,sd_type,sentiment,persuasiveness,tone
0,Yes..because too many houses in EU look like t...,Constructive,Positive/respectful,neutral,Not persuasive,Informative
1,"I am frankly quite SICK of the phrase ""shoved ...",Not constructive,Off-topic/digression,negative,Persuasive,Mean
2,"Ya, I always wonder why the conservatives are ...",Not constructive,Off-topic/digression,neutral,Not persuasive,Sarcastic
3,They are also places where you are supposed no...,Not constructive,Argumentative (back and forth),neutral,Persuasive,Sarcastic
4,"Stop trying to make sense, it only confuses pe...",Not constructive,Argumentative (back and forth),negative,Persuasive,Mean


## Classifying ERIC by certain fields
---
Next, we try to classify the ERIC by observing the certain attributes of each text. We are not selecting all columns because for instance, the sd_type columns contain more than one attribute of the certain text because this column represents the properties of the whole thread of dicussions and so every text belonged to that thread is labeled the same tag even if that comment does not correspond with the attribute it is labeled. Also the column 'topic' mostly contains 'off-topic' comments which seems to be strange so we are not taking this column into consideration. Thus, we ultimately decide to choose 'constructiveclass', 'persuasiveness', 'tone' as our standard to classify ERICs. An ERIC should have a constructive label, persuasive label and an Informative tone.

### Add the new column 'ERIC' to the data

In [121]:
df['ERIC'] = 0

df.loc[(df['constructiveclass'] == 'Constructive') 
       & (df['persuasiveness'] == 'Persuasive')
       &(df['tone'] == 'Informative'),"ERIC"] = 1



In [122]:
df[df['ERIC'] == 1]

Unnamed: 0,text,constructiveclass,sd_type,sentiment,persuasiveness,tone,ERIC
8,I know this was probably the best thing that e...,Constructive,Argumentative (back and forth),neutral,Persuasive,Informative,1
9,Ghrelin is produced by your fat cells. You can...,Constructive,Positive/respectful,neutral,Persuasive,Informative,1
30,I believe they are eaten in Venezuela? It's a ...,Constructive,Positive/respectful,neutral,Persuasive,Informative,1
37,HF and You've got to be kidding me.... Nelson ...,Constructive,Argumentative (back and forth),neutral,Persuasive,Informative,1
39,"So Ed - 12,000 - that's still FOUR TIMES more ...",Constructive,Argumentative (back and forth),negative,Persuasive,Informative,1
...,...,...,...,...,...,...,...
17415,"alex - Sorry, but you are wrong and show an in...",Constructive,Snarky/humorous,mixed,Persuasive,Informative,1
17428,When Nate Silver has Ms Clinton's chance of wi...,Constructive,Snarky/humorous,negative,Persuasive,Informative,1
17443,If any politician in my lifetime fits the defi...,Constructive,Argumentative (back and forth),negative,Persuasive,Informative,1
17451,"As a naturally aspirated Camaro SS owner, I ra...",Constructive,Positive/respectful,neutral,Persuasive,Informative,1


### Logistic Regression

In [152]:
#first we divide the data into training and testing sets
training_set, test_set = train_test_split(df, test_size=0.3, random_state=42)


#then preprocess the text with the rule of Bag of Words 
def words_in_texts(words, texts):
    '''
    Inputs:
        words (list-like): words to find
        texts (Series): strings to search in
    
    Output:
        NumPy array of 0s and 1s with shape (n, p) where n is the
        number of texts and p is the number of words.
    '''
    nested_arr = []
    for text in texts:
        arr = []
        for word in words:
            if word in text:
                arr.append(1)
            else:
                arr.append(0)
        nested_arr.append(arr)
    return nested_arr



In [153]:
some_words = ['please','thanks','suggest','advice','note']
X_train = words_in_texts(some_words,training_set.text)
Y_train = np.asarray(training_set.ERIC)


In [154]:
model = LogisticRegression()
model.fit(X_train,Y_train)
training_accuracy = model.score(X_train, Y_train)

print('Logistic Regression training_accuracy:',training_accuracy)
# Y_train_hat = model.predict(Y_train)


Logistic Regression training_accuracy: 0.9448857189171664


### Naive Bayes

In [163]:
X_train = training_set.text
Y_train = training_set.ERIC

#this time use tf-idf and vectorizer to preprocess text and use naive bayes as the model
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, Y_train)


X_test = test_set.text
Y_test = test_set.ERIC
y_pred = nb.predict(X_test)

print('Naive Bayes accuracy %s' % accuracy_score(y_pred, Y_test))

Naive Bayes accuracy 0.9457261724659607


## Sentiment Field Training and Evaluation
---

In [27]:
df['sentiment'].value_counts()

negative    10262
neutral      3443
mixed        2737
positive     1184
Name: sentiment, dtype: int64

Let's firstly take a look of how well we can predict the sentiment of a text.

In [28]:
sentiments = df['sentiment'].unique()

In [29]:
data = df[['text','sentiment']]

In [30]:
neg_data = data.groupby('sentiment').get_group('negative')
mix_data = data.groupby('sentiment').get_group('mixed')
neu_data = data.groupby('sentiment').get_group('neutral')
pos_data = data.groupby('sentiment').get_group('positive')
neg_data

Unnamed: 0,text,sentiment
1,"I am frankly quite SICK of the phrase ""shoved ...",negative
4,"Stop trying to make sense, it only confuses pe...",negative
7,"Look up teacher ""Michelle Yeh"" guys. Would've ...",negative
10,George H. Smith is/was a fool.,negative
11,George H. smith is a damn fool,negative
...,...,...
17621,"Thanks for providing oil corporation figures, ...",negative
17622,@USERNAME - I hope Miami doesn't drown. And I ...,negative
17623,Brazilian fans suck as much as their country.,negative
17624,Brazilian fans are just like fans of any other...,negative


In [9]:
X_train, X_test, y_train, y_test = [np.array([],dtype='str'), np.array([],dtype='str'),np.array([],dtype='str'),np.array([],dtype='str')]


Since negative sentiment texts are much more than other types of texts, we can't directly do train_test_split (because sometimes we may fail to choose from all 4 labels and resulting error in classification_report). We need to train_test_split from each type of sentiment and combine the training/test data/labels.

In [10]:
for item in [neg_data, mix_data, neu_data, pos_data]:
    X = item.text.to_numpy()
    y = item.sentiment.to_numpy()
    X_train_loc, X_test_loc, y_train_loc, y_test_loc = train_test_split(X, y, test_size=0.3, random_state = 42)
    X_train = np.concatenate((X_train_loc, X_train))
    X_test = np.concatenate((X_test_loc, X_test))
    y_train = np.concatenate((y_train_loc, y_train))
    y_test = np.concatenate((y_test_loc, y_test))

In [11]:
# sanity check for the train & test data
print(len(neg_data)+len(mix_data)+len(neu_data)+len(pos_data))
print(len(X_train)+len(X_test))

20749
20749


In [13]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred, target_names=sentiments))

accuracy 0.5357314918901558
              precision    recall  f1-score   support

    negative       0.67      0.00      0.00      1009
       mixed       0.53      0.99      0.69      3264
     neutral       0.57      0.07      0.12      1532
    positive       0.67      0.01      0.02       422

    accuracy                           0.54      6227
   macro avg       0.61      0.27      0.21      6227
weighted avg       0.57      0.54      0.40      6227

