In [6]:
import logging
import pandas as pd
import numpy as np
from numpy import random
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.metrics import classification_report
from keras.preprocessing.text import Tokenizer                    
from keras.preprocessing.sequence import pad_sequences
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

### Snippet of the data
Here we take a look of expert annotations data. We can see that a text is in "agreement throughout" doesn't always lead to its sentiment to be "positive". Also, we can see "constructive" texts mostly fall into our classifications of ERICs (that we'll be talking below) by looking at their sd_type.

In [3]:
df = pd.read_csv('./data/ydata-ynacc-v1_0/ydata-ynacc-v1_0_expert_annotations.tsv',sep='\t')
# df.head()

### Definition of ERICs
ERICs are characterized by argumentative, respectful exchanges containing persuasive, informative, and/or sympathetic comments. They tend to stay on topic with the original article and not to contain funny, mean, or sarcastic comments. We found differences between the distribution of annotations made by trained and untrained anno- tators, but high levels of agreement within each group, suggesting that crowdsourcing annotations for this task is reliable.

Now, we select the columns related to ERICs and mainly look at these.

In [136]:
df = df[['text','constructiveclass','sd_type','tone','sentiment','persuasiveness']]
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,text,constructiveclass,sd_type,tone,sentiment,persuasiveness
0,Yes..because too many houses in EU look like t...,Constructive,Positive/respectful,Informative,neutral,Not persuasive
1,"I am frankly quite SICK of the phrase ""shoved ...",Not constructive,Off-topic/digression,Mean,negative,Persuasive
2,"Ya, I always wonder why the conservatives are ...",Not constructive,Off-topic/digression,Sarcastic,neutral,Not persuasive
3,They are also places where you are supposed no...,Not constructive,Argumentative (back and forth),Sarcastic,neutral,Persuasive
4,"Stop trying to make sense, it only confuses pe...",Not constructive,Argumentative (back and forth),Mean,negative,Persuasive


In [143]:
df['tone'].unique()

array(['Informative', 'Mean', 'Sarcastic', 'Funny', 'Controversial,Mean',
       'Mean,Sarcastic', 'Controversial', 'Sympathetic,Sarcastic',
       'Informative,Sarcastic', 'Informative,Mean', 'Sympathetic',
       'Controversial,Mean,Sarcastic', 'Informative,Controversial',
       'Sarcastic,Funny', 'Controversial,Sarcastic,Funny',
       'Mean,Sarcastic,Funny', 'Controversial,Mean,Funny',
       'Controversial,Funny', 'Mean,Funny',
       'Informative,Sympathetic,Funny', 'Sympathetic,Controversial,Mean',
       'Informative,Sympathetic', 'Informative,Controversial,Mean',
       'Controversial,Sarcastic', 'Controversial,Mean,Sarcastic,Funny',
       'Sympathetic,Controversial', 'Sympathetic,Funny',
       'Controversial,NA', 'Informative,Controversial,Sarcastic',
       'Informative,NA', 'Informative,Funny', 'Sympathetic,Mean',
       'Informative,Controversial,Funny', 'NA,Funny',
       'Informative,Controversial,Mean,Sarcastic',
       'Informative,Sympathetic,Controversial',
      

#### Create a column of ERIC

In [140]:
df['ERIC']=-1 # false
df.loc[(df['sd_type'].str.contains("Off-topic/digression")==False & (df['sd_type'].str.contains('Positive')) | df['sd_type'].str.contains('Personal')) 
   & (df['persuasiveness']=='Persuasive') 
    & (df['tone'].str.contains('Informative') | df['tone'].str.contains('Controversial') | df['tone'].str.contains('Sympathetic'))
#     & (df['sentiment']=='neutral' | df['sentiment']=='positive')
    & (df['constructiveclass']=='Constructive'), ['ERIC']] = 1

# df.loc[df['ERIC'] != 'True', ['ERIC']] = 'False'

In [141]:
eric = df[df['ERIC'] == 1]
eric.info()
eric.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2474 entries, 8 to 17604
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   text               2474 non-null   object
 1   constructiveclass  2474 non-null   object
 2   sd_type            2474 non-null   object
 3   tone               2474 non-null   object
 4   sentiment          2474 non-null   object
 5   persuasiveness     2474 non-null   object
 6   ERIC               2474 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 154.6+ KB


Unnamed: 0,text,constructiveclass,sd_type,tone,sentiment,persuasiveness,ERIC
8,I know this was probably the best thing that e...,Constructive,Argumentative (back and forth),Informative,neutral,Persuasive,1
9,Ghrelin is produced by your fat cells. You can...,Constructive,Positive/respectful,Informative,neutral,Persuasive,1
30,I believe they are eaten in Venezuela? It's a ...,Constructive,Positive/respectful,Informative,neutral,Persuasive,1
37,HF and You've got to be kidding me.... Nelson ...,Constructive,Argumentative (back and forth),Informative,neutral,Persuasive,1
39,"So Ed - 12,000 - that's still FOUR TIMES more ...",Constructive,Argumentative (back and forth),Informative,negative,Persuasive,1


In [142]:
n_eric = len(np.array(eric.index))
n_eric

2474

In [164]:
df.loc[(df['sd_type'].str.contains("Off-topic/digression") | (df['sd_type'].str.contains('Flamewar'))) 
   & (df['persuasiveness']=='Not Persuasive') 
    & (df['tone'].str.contains('Mean') | df['tone'].str.contains('Sarcastic'))
    & (df['sentiment']=='negative')
    & (df['constructiveclass']!='Constructive'), ['ERIC']] = 0

In [165]:
noneric = df[df['ERIC'] == 0]

In [166]:
n_non = len(np.array(noneric.index))
n_non

3130

In [58]:
noneric_idx = np.array(noneric.index)
chosen_noneric_idx = np.random.choice(noneric_idx, n_eric, replace=False)
chosen_noneric_idx[:10]

array([15193, 10281, 14297,  6150,   569,  3548, 14991,  1510,  6629,
       11436])

In [59]:
sampled_noneric = df.iloc[chosen_noneric_idx]

In [167]:
chosen_data = pd.concat([eric, noneric])
print(chosen_data.ERIC.unique())
print(len(chosen_data))

[1 0]
5604


In [61]:
other_noneric_idx = np.array([i for i in noneric_idx if i not in chosen_noneric_idx])
other_noneric = df.iloc[other_noneric_idx]

Let's firstly take a look of how well we can predict the ERIC attribute of a text.

In [15]:
def split_data(X, y, df):
    data = df[[X, y]]
    labels = df[y].unique()
    
    X_train, X_test, y_train, y_test = [np.array([],dtype='str'), np.array([],dtype='str'),np.array([],dtype='str'),np.array([],dtype='str')]

    for l in labels:
        item = data.groupby(y).get_group(l)
        a = item[X].to_numpy()
        
        b = item[y].to_numpy()
        # ?? use dummies to convert the strings to matrix of features
#         b = item[y.name].str.get_dummies(sep=",").to_numpy()
        
        # there're cases where the number of rows/entries are fewer than 4, 
        # and will cause train_test_split to generate empty values.
        # so we sacrifice a little accuracy of our model and include those entries in both training and testing datasets.
        # in general, it won't affect too much because the number is small.
        if len(item) >= 4:
            X_train_loc, X_test_loc, y_train_loc, y_test_loc = train_test_split(a, b, test_size=0.3, random_state = 400)           
        else:
            X_train_loc, X_test_loc, y_train_loc, y_test_loc = [a, a, b, b]
        
        
        X_train = np.concatenate((X_train_loc, X_train))
        X_test = np.concatenate((X_test_loc, X_test))
        y_train = np.concatenate((y_train_loc, y_train))
        y_test = np.concatenate((y_test_loc, y_test))
    
#     le = preprocessing.LabelEncoder()
#     y_train = le.fit_transform(y_train)
#     y_test = le.transform(y_test)
    return X_train, X_test, y_train, y_test
        

In [185]:
# X_train, X_test, y_train, y_test = [np.array([],dtype='str'), np.array([],dtype='str'),np.array([],dtype='str'),np.array([],dtype='str')]
X_train, X_test, y_train, y_test = split_data('text', 'ERIC', chosen_data)


Since negative sentiment texts are much more than other types of texts, we can't directly do train_test_split (because sometimes we may fail to choose from all 4 labels and resulting error in classification_report). We need to train_test_split from each type of sentiment and combine the training/test data/labels.

### Run classifiers

In [21]:
# X_train[:10]

In [169]:
# from nltk.stem import WordNetLemmatizer
# loop over classifiers: Naive Bayes, Supported Vectors Machine, KNN
pipe_list = []
grid_search_list = []

In [184]:
for clf in [MultinomialNB(alpha=1, fit_prior=True), SVC(), KNeighborsClassifier(n_neighbors=9), LogisticRegression()]:
    pipe = Pipeline([('vect', CountVectorizer(stop_words='english',ngram_range=(3,3))),
                    ('tfidf', TfidfTransformer()),
                    ('clf', clf),
                  ])
    pipe.fit(X_train, y_train)
    pipe_list.append(pipe)
    print(type(pipe.named_steps['clf']))
    training_accuracy = pipe.score(X_train, y_train)
    print('Training_accuracy:',training_accuracy)
    y_pred = pipe.predict(X_test)
    
    unique, counts = np.unique(y_pred, return_counts=True)
#     print(unique)
#     print(counts)
    
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=['True','False']))

#     y_pred = pipe.predict(np.append(other_noneric.text, X_test))
#     y_true = np.append(other_noneric.ERIC, y_test)

#     print('accuracy %s' % accuracy_score(y_pred, y_true))
#     print(classification_report(y_true, y_pred, target_names=chosen_data.ERIC.unique()))

<class 'sklearn.naive_bayes.MultinomialNB'>
Training_accuracy: 0.9716981132075472
accuracy 0.6676575505350772
              precision    recall  f1-score   support

        True       0.64      0.95      0.76       939
       False       0.83      0.31      0.45       743

    accuracy                           0.67      1682
   macro avg       0.73      0.63      0.61      1682
weighted avg       0.72      0.67      0.63      1682

<class 'sklearn.svm._classes.SVC'>
Training_accuracy: 0.9724630290668026
accuracy 0.6617122473246135
              precision    recall  f1-score   support

        True       0.63      0.95      0.76       939
       False       0.83      0.30      0.44       743

    accuracy                           0.66      1682
   macro avg       0.73      0.62      0.60      1682
weighted avg       0.72      0.66      0.62      1682

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Training_accuracy: 0.5586435492095869
accuracy 0.5582639714625446
    

In [108]:
# idx = np.argsort(pipe_list[0]['tfidf'].idf_)[:50]

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

#### logistic regression

In [48]:
# preprocess the text with the rule of Bag of Words 
def words_in_texts(words, texts):
    '''
    Inputs:
        words (list-like): words to find
        texts (Series): strings to search in
    
    Output:
        NumPy array of 0s and 1s with shape (n, p) where n is the
        number of texts and p is the number of words.
    '''
    nested_arr = []
    for text in texts:
        arr = []
        for word in words:
            if word in text:
                arr.append(1)
            else:
                arr.append(0)
        nested_arr.append(arr)
    return nested_arr

In [70]:
# # w/ preprocess
some_words = ['please','thanks','suggest','advice','note']
X_train = words_in_texts(some_words,X_train)
X_test = words_in_texts(some_words,X_test)
# y_train = np.asarray(y_train)

In [71]:
# model = LogisticRegression()
# model.fit(X_train, y_train)
# training_accuracy = model.score(X_train, y_train)

# print('Logistic Regression training_accuracy:',training_accuracy)
# y_pred = model.predict(X_test)

# test_accuracy = model.score(y_test,y_pred)

# print('Logistic Regression accuracy:',test_accuracy)

### CNN

In [171]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1                          

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)



In [75]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [82]:
# X_train[:10]

In [172]:
from keras.models import Sequential
from keras import layers
embedding_dim = 100

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(X_train, y_train,
                    epochs=10,
                    validation_data=(X_test, y_test),
                    batch_size=10)

Train on 3922 samples, validate on 1682 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Naive Bayes w/ gridSearch
pipe = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', MultinomialNB()),
                  ])
params = {'vect__min_df': np.linspace(0.005, 0.05, 5),
            'vect__ngram_range': ((1, 1),(1, 2),(2, 2)),  # unigrams or bigrams
            'tfidf__use_idf': (True, False),
            'clf__alpha': np.logspace(0,1,10),
            'clf__fit_prior': (True, False),
            }
search = GridSearchCV(pipe, param_grid=params)
search.fit(X_train, y_train)
grid_search_list.append(search)
print("Best parameter values:")
for param in search.best_params_.items():
    print(param)
print("CV Score using best parameter values:", search.best_score_)

In [None]:
# SVM w/ gridSearch
pipe = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SVC()),
                  ])
params = {'vect__min_df': np.linspace(0.005, 0.05, 5),
            'vect__ngram_range': ((1, 1),(1, 2),(2, 2)),  # unigrams or bigrams
            'tfidf__use_idf': (True, False),
            'clf__kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
            'clf__gamma': ('scale', 'auto'),
            }
search = GridSearchCV(pipe, param_grid=params)
search.fit(X_train, y_train)
grid_search_list.append(search)
print("Best parameter values:")
for param in search.best_params_.items():
    print(param)
print("CV Score using best parameter values:", search.best_score_)

In [None]:
# KNN w/ gridSearch 

pipe2 = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', KNeighborsClassifier()),
              ])
# pipe2.fit(X_train, y_train)
# print(pipe2.named_steps)
# y_pred = pipe2.predict(X_test)

# print('accuracy %s' % accuracy_score(y_pred, y_test))
# print(classification_report(y_test, y_pred, target_names=df.sentiment.unique()))

params = {
            'vect__min_df': np.linspace(0.005, 0.05, 5),
            'vect__ngram_range': ((1, 1),(1, 2),(2, 2)),  # unigrams or bigrams
            'tfidf__use_idf': (True, False),
            'clf__n_neighbors': (5,6,7,8,9),
            'clf__weights': ('uniform', 'distance')
#             'clf__fit_prior': (True, False),
            }
search = GridSearchCV(pipe2, param_grid=params)
search.fit(X_train, y_train)
grid_search_list.append(search)
print("Best parameter values:")
for param in search.best_params_.items():
    print(param)
print("CV Score using best parameter values:", search.best_score_)

In [None]:
type(pipe2.named_steps['clf'])

In [None]:
type_dum = df['sd_type'].str.get_dummies(sep=",")
type_dum_arr = type_dum.to_numpy()

In [None]:
type_dum_name = np.array(type_dum.columns)

### Feature Extraction for ERIC and non-ERIC

In [109]:
eric.head()

Unnamed: 0,text,constructiveclass,sd_type,tone,sentiment,persuasiveness,ERIC
8,I know this was probably the best thing that e...,Constructive,Argumentative (back and forth),Informative,neutral,Persuasive,1
9,Ghrelin is produced by your fat cells. You can...,Constructive,Positive/respectful,Informative,neutral,Persuasive,1
30,I believe they are eaten in Venezuela? It's a ...,Constructive,Positive/respectful,Informative,neutral,Persuasive,1
37,HF and You've got to be kidding me.... Nelson ...,Constructive,Argumentative (back and forth),Informative,neutral,Persuasive,1
39,"So Ed - 12,000 - that's still FOUR TIMES more ...",Constructive,Argumentative (back and forth),Informative,negative,Persuasive,1


In [110]:
noneric.head()

Unnamed: 0,text,constructiveclass,sd_type,tone,sentiment,persuasiveness,ERIC
0,Yes..because too many houses in EU look like t...,Constructive,Positive/respectful,Informative,neutral,Not persuasive,0
1,"I am frankly quite SICK of the phrase ""shoved ...",Not constructive,Off-topic/digression,Mean,negative,Persuasive,0
2,"Ya, I always wonder why the conservatives are ...",Not constructive,Off-topic/digression,Sarcastic,neutral,Not persuasive,0
3,They are also places where you are supposed no...,Not constructive,Argumentative (back and forth),Sarcastic,neutral,Persuasive,0
4,"Stop trying to make sense, it only confuses pe...",Not constructive,Argumentative (back and forth),Mean,negative,Persuasive,0


In [177]:
# extract featured phrases for ERICs
pipe_eric = Pipeline([('vect', CountVectorizer(stop_words='english',ngram_range=(3,3))),
               ('tfidf', TfidfTransformer())])
pipe_eric.fit(eric.text)
eric_idx = np.argsort(pipe_eric['tfidf'].idf_)[:50]
feat = pipe_eric['vect'].get_feature_names()
eric_feat = [feat[i] for i in eric_idx]
print(eric_feat)

['black lives matter', 'techniques knowing world', 'difference right wrong', 'knowing world overlap', 'self defense scenario', 'step right direction', 'living paycheck paycheck', '16 year old', 'locker rooms bathrooms', 'prohibits government prohibiting', 'government prohibiting free', 'saying black people', '10 20 percent', 'paying fair share', 'young earth creationism', 'prohibiting free exercise', 'false sense security', 'free exercise religion', 'right act way', 'gay marriage supreme', 'completely different epistemological', 'hope door time', 'throwing away billions', 'liberal religionists gentle', 'door time comes', 'black people didn', 'people food stamps', 'companies time fail', 'just don believe', 'white black people', 'don want change', 'green energy companies', 'time comes ban', 'stamps cost money', 'act way people', 'won happen like', 'letting isis expand', 'live let live', 'gun violence america', 'time fail costs', 'different epistemological techniques', 'syria libya cost',

In [178]:
# extract featured phrases for non-ERICs
pipe_noneric = Pipeline([('vect', CountVectorizer(stop_words='english',ngram_range=(3,3))),
               ('tfidf', TfidfTransformer())])
pipe_noneric.fit(noneric.text)
noneric_idx = np.argsort(pipe_noneric['tfidf'].idf_)[:50]
feat2 = pipe_noneric['vect'].get_feature_names()
noneric_feat = [feat2[i] for i in noneric_idx]
print(noneric_feat)

['new york city', 'world better place', 'inbred racist idiots', 'spoken like true', 'comments just continue', 'mom horrible person', 'piece dogshit god', 'black lives matter', 'independently probably destroyed', 'poor excuse human', 'makes feel better', 'probably destroyed remaining', 'remaining common sense', 'excuse human dung', 'image reality knows', 'inability think independently', 'hate inability think', 'destroyed remaining common', 'known multi account', 'think independently probably', 'feeling hate inability', 'called dominant group', 'leader free world', 'proceeds received free', 'calling don forget', 'nimrods fail ways', 'got head start', 'gotta play like', 'group got head', 'doesn suggest rape', 'dominant group got', '40 years ago', 'wall hope sticks', 'head start proceeds', 'start proceeds received', 'like typical liberals', 'let know ll', 'school improve life', 'people skin look', 'george bush lied', 'life outside employment', 'ways school improve', 'like tomato sauce', 'c

In [179]:
print([i for i in eric_feat if i in noneric_feat])

['black lives matter', 'makes feel better']
