In [49]:

# %load ./src/semeval-pan-2019-random-baseline.py
#!/usr/bin/env python

"""Random baseline for the PAN19 hyperpartisan news detection task"""
# Version: 2018-09-24

# Parameters:
# --inputDataset=<directory>
#   Directory that contains the articles XML file with the articles for which a prediction should be made.
# --outputDir=<directory>
#   Directory to which the predictions will be written. Will be created if it does not exist.

from __future__ import division

import gc
import getopt
import os
import random
import sys
import xml.sax

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from article import Article
import matplotlib.pyplot as plt

# Seed the stdlib RNG used by the SAX handler for its random predictions.
random.seed(42)
# File name of the predictions file that main() writes inside the output directory.
runOutputFileName = "prediction.txt"
# Global registry, article id -> Article, filled by the SAX handler while parsing.
articles = {}

########## SAX ##########

class HyperpartisanNewsRandomPredictor(xml.sax.ContentHandler):
    """SAX handler that collects article data and emits a random prediction.

    For every ``<article>`` start tag the handler creates (or reuses) the
    matching entry in the module-level ``articles`` dict, copies the known XML
    attributes onto it and writes one ``"<id> <prediction> <confidence>"`` line
    to ``outFile``. ``<p>``, ``<q>`` and ``<a>`` start tags increment counters
    on the most recently seen article, and character data is appended to that
    article's ``text`` list.
    """

    # XML attribute name -> name of the Article attribute that receives its value.
    _ATTRIBUTE_FIELDS = (
        ("hyperpartisan", "hyperpartisan"),
        ("bias", "bias"),
        ("title", "title"),
        ("labeled-by", "labeled"),
        ("published-at", "published_at"),
    )

    # Element name -> name of the Article counter incremented for that element.
    _ELEMENT_COUNTERS = {
        "p": "count_paragraphs",
        "q": "count_quotes",
        "a": "count_urls",
    }

    def __init__(self, outFile):
        """Store the writable prediction file; no article is current yet."""
        xml.sax.ContentHandler.__init__(self)
        self.outFile = outFile
        # "null" is the sentinel meaning "no <article> element seen so far".
        self.previousId = "null"

    def startElement(self, name, attrs):
        """Handle a start tag: register articles and count structural elements."""
        if name == "article":
            articleId = attrs.getValue("id") # id of the article for which hyperpartisanship should be predicted
            self.previousId = articleId
            if articleId not in articles:
                articles[articleId] = Article(articleId)
            article = articles[articleId]
            # Copy only the attributes that are actually present on this tag.
            for xmlName, fieldName in self._ATTRIBUTE_FIELDS:
                if xmlName in attrs.keys():
                    setattr(article, fieldName, attrs.getValue(xmlName))
            # Keep the order choice() -> random(): it fixes the seeded sequence.
            prediction = random.choice(["true", "false"]) # random prediction
            confidence = random.random() # random confidence value for prediction
            # output format per line: "<article id> <prediction>[ <confidence>]"
            #   - prediction is either "true" (hyperpartisan) or "false" (not hyperpartisan)
            #   - confidence is an optional value to describe the confidence of the predictor in the prediction---the higher, the more confident
            self.outFile.write(articleId + " " + prediction + " " + str(confidence) + "\n")
        elif name in self._ELEMENT_COUNTERS:
            # A structural element before any <article> has nothing to be
            # counted against; skip it, mirroring the guard in characters(),
            # instead of failing on articles["null"].
            if self.previousId == "null":
                return
            article = articles[self.previousId]
            counterName = self._ELEMENT_COUNTERS[name]
            setattr(article, counterName, getattr(article, counterName) + 1)

    def characters(self, content):
        """Append a chunk of character data to the current article's text list."""
        try:
            if self.previousId != "null":
                articles[self.previousId].text.append(content)
        except AttributeError:
            # Best effort: report the article whose ``text`` cannot be appended to.
            print(self.previousId)


########## MAIN ##########

def extract_features(filename):
    """Fit a bag-of-words vocabulary on the lines of a word-list file.

    Args:
        filename: Path of a text file, decoded as ISO-8859-1. Every line is
            passed to the vectorizer as one document.

    Returns:
        The fitted ``CountVectorizer``.
    """
    # Read inside a context manager so the file handle is closed right away
    # instead of being left to the garbage collector.
    with open(filename, encoding="ISO-8859-1") as lexicon_file:
        lines = lexicon_file.readlines()
    vectorizer = CountVectorizer()
    vectorizer.fit(lines)
    return vectorizer


def main(inputDataset, outputDir):
    """Parse every XML file in ``inputDataset`` and write random predictions.

    Args:
        inputDataset: Directory containing the article ``.xml`` files. Files
            with any other extension are ignored.
        outputDir: Directory that receives the predictions file
            (``runOutputFileName``). It is created if it does not exist.

    Side effects:
        Fills the module-level ``articles`` dict through the SAX handler and
        prints each parsed file name.
    """
    # The documented contract is that the output directory is created on
    # demand; without this, open() below fails on a fresh checkout.
    os.makedirs(outputDir, exist_ok=True)

    with open(os.path.join(outputDir, runOutputFileName), 'w') as outFile:
        for fileName in os.listdir(inputDataset):
            if not fileName.endswith(".xml"):
                continue
            with open(os.path.join(inputDataset, fileName)) as inputRunFile:
                print(fileName)
                xml.sax.parse(inputRunFile, HyperpartisanNewsRandomPredictor(outFile))

    print("The predictions have been written to the output folder.")


if __name__ == '__main__':
    # Parse the XML files; this fills the global ``articles`` dict and writes
    # the random-baseline predictions.
    main("./data/test", "./output")

    # One vocabulary per lexicon. Each vocabulary becomes one contiguous group
    # of columns in X, in the order hedges, negatives, positives.
    hedges_vectorizer = extract_features("./lists/hedges.txt")
    boosters_vectorizer = extract_features("./lists/boosters.txt")  # fitted but not used: boosters feature is disabled below
    negatives_vectorizer = extract_features("./lists/opinion-lexicon-English/negative-words.txt")
    positives_vectorizer = extract_features("./lists/opinion-lexicon-English/positive-words.txt")
    gc.collect()

    def _lexicon_counts(vectorizer, chunks):
        """Return a 1-D array with one total count per vocabulary term."""
        # Sum the sparse matrix column-wise instead of densifying it and
        # adding the rows with the built-in sum(). The result always has one
        # entry per vocabulary term, so an article without text chunks should
        # give a zero vector rather than the scalar 0 that sum() returned.
        return np.asarray(vectorizer.transform(chunks).sum(axis=0)).ravel()

    X = []
    y = []
    for articleid in articles:
        article = articles[articleid]
        text = article.text  # list of character chunks collected by the SAX handler
        article.hedges = _lexicon_counts(hedges_vectorizer, text)
        article.negatives = _lexicon_counts(negatives_vectorizer, text)
        article.positives = _lexicon_counts(positives_vectorizer, text)
        #articles[articleid].boosters = sum(boosters_vectorizer.transform(text).toarray())
        features = np.concatenate((article.hedges, article.negatives,
                                   article.positives), axis=None)

        X.append(features)
        # Label 1 only for articles explicitly marked hyperpartisan="true".
        y.append(1 if article.hyperpartisan == "true" else 0)
    # One collection after the loop instead of a full GC pass per article.
    gc.collect()
    X = np.array(X)
    y = np.array(y)


articles-training.xml
articles-training-text.xml
The predictions have been written to the output folder.


In [17]:

def model(x):
    """Logistic sigmoid of ``x``: ``1 / (1 + e**-x)``, applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator


In [36]:
#all
# Least-squares regression on every lexicon feature; predictions above
# `threshold` are treated as the hyperpartisan class.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
reg = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
threshold = 0.5
print(classification_report(y_test, y_pred > threshold))
print(accuracy_score(y_test, y_pred > threshold))
# Plot outputs
# x axis: total lexicon count per test article. The prediction line is drawn
# in increasing x order; plotting in sample order makes it zig-zag.
feature_totals = np.sum(X_test, axis=1)
plot_order = np.argsort(feature_totals)
plt.scatter(feature_totals, y_test,  color='black')
plt.plot(feature_totals[plot_order], y_pred[plot_order], color='blue', linewidth=1)
plt.xlabel("total lexicon count per article")
plt.ylabel("label (black) / predicted score (blue)")
plt.xticks(())
plt.yticks(())
plt.title("regression on all features")
plt.show()


              precision    recall  f1-score   support

           0       0.53      0.64      0.58        64
           1       0.65      0.54      0.59        79

   micro avg       0.59      0.59      0.59       143
   macro avg       0.59      0.59      0.59       143
weighted avg       0.60      0.59      0.59       143

0.5874125874125874


In [37]:
#hedges
# The hedge counts occupy the first block of columns in X.
# len(vocabulary_) is the number of features and, unlike get_feature_names(),
# exists in both old and current scikit-learn releases.
hedges_index = len(hedges_vectorizer.vocabulary_)
x_hedges = X[:, 0:hedges_index]
X_train, X_test, y_train, y_test = train_test_split(x_hedges, y, test_size=0.33, random_state=42)
reg = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
threshold = 0.5
print(classification_report(y_test, y_pred > threshold))
print(accuracy_score(y_test, y_pred > threshold))
print(x_hedges.shape)

# Draw the prediction line in increasing x order; plotting in sample order
# makes it zig-zag.
feature_totals = np.sum(X_test, axis=1)
plot_order = np.argsort(feature_totals)
plt.scatter(feature_totals, y_test,  color='black')
plt.plot(feature_totals[plot_order], y_pred[plot_order], color='blue', linewidth=1)
plt.xlabel("total hedge count per article")
plt.ylabel("label (black) / predicted score (blue)")
plt.xticks(())
plt.yticks(())
plt.title("hedges regression")
plt.show()


              precision    recall  f1-score   support

           0       0.44      0.55      0.49        64
           1       0.55      0.44      0.49        79

   micro avg       0.49      0.49      0.49       143
   macro avg       0.49      0.49      0.49       143
weighted avg       0.50      0.49      0.49       143

0.48951048951048953
(431, 120)


In [39]:
#negatives
# The negative-word counts follow the hedge columns in X.
# len(vocabulary_) is the number of features and, unlike get_feature_names(),
# exists in both old and current scikit-learn releases.
negatives_index = (len(negatives_vectorizer.vocabulary_)
                   + len(hedges_vectorizer.vocabulary_))
x_negative = X[:, hedges_index: negatives_index]
X_train, X_test, y_train, y_test = train_test_split(x_negative, y, test_size=0.33, random_state=42)
reg = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
threshold = 0.5
print(classification_report(y_test, y_pred > threshold))
print(accuracy_score(y_test, y_pred > threshold))
print(x_negative.shape)
# Draw the prediction line in increasing x order; plotting in sample order
# makes it zig-zag.
feature_totals = np.sum(X_test, axis=1)
plot_order = np.argsort(feature_totals)
plt.scatter(feature_totals, y_test,  color='black')
plt.plot(feature_totals[plot_order], y_pred[plot_order], color='blue', linewidth=1)
plt.xlabel("total negative-word count per article")
plt.ylabel("label (black) / predicted score (blue)")
plt.xticks(())
plt.yticks(())
plt.title("negatives regression")
plt.show()



              precision    recall  f1-score   support

           0       0.52      0.75      0.62        64
           1       0.69      0.44      0.54        79

   micro avg       0.58      0.58      0.58       143
   macro avg       0.60      0.60      0.58       143
weighted avg       0.61      0.58      0.57       143

0.5804195804195804
(431, 4809)


In [40]:
#positive
# The positive-word counts are the last block of columns in X.
# len(vocabulary_) is the number of features and, unlike get_feature_names(),
# exists in both old and current scikit-learn releases.
positives_index = negatives_index + len(positives_vectorizer.vocabulary_)
x_positive = X[:, negatives_index: positives_index]
X_train, X_test, y_train, y_test = train_test_split(x_positive, y, test_size=0.33, random_state=42)
reg = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
threshold = 0.5
print(classification_report(y_test, y_pred > threshold))
print(accuracy_score(y_test,y_pred > threshold))
print(x_positive.shape)

# Draw the prediction line in increasing x order; plotting in sample order
# makes it zig-zag.
feature_totals = np.sum(X_test, axis=1)
plot_order = np.argsort(feature_totals)
plt.scatter(feature_totals, y_test,  color='black')
plt.plot(feature_totals[plot_order], y_pred[plot_order], color='blue', linewidth=1)
plt.xlabel("total positive-word count per article")
plt.ylabel("label (black) / predicted score (blue)")
plt.xticks(())
plt.yticks(())
plt.title("positives regression")
plt.show()


              precision    recall  f1-score   support

           0       0.53      0.77      0.63        64
           1       0.71      0.46      0.55        79

   micro avg       0.59      0.59      0.59       143
   macro avg       0.62      0.61      0.59       143
weighted avg       0.63      0.59      0.59       143

0.5944055944055944
(431, 2021)


In [41]:
# Number of hedge features; vocabulary_ works across scikit-learn versions.
len(hedges_vectorizer.vocabulary_)

120

In [42]:
# Number of positive-word features; vocabulary_ works across scikit-learn versions.
len(positives_vectorizer.vocabulary_)

2021

In [43]:
# Number of negative-word features; vocabulary_ works across scikit-learn versions.
len(negatives_vectorizer.vocabulary_)

4809

In [44]:
#combined negatives and positives
# Every column after the hedge block: negative-word and positive-word counts.
x_combined = X[:, hedges_index: ]
X_train, X_test, y_train, y_test = train_test_split(x_combined, y, test_size=0.33, random_state=42)
reg = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
threshold = 0.5
print(classification_report(y_test, y_pred > threshold))
print(accuracy_score(y_test, y_pred > threshold))
print(x_combined.shape)

# Draw the prediction line in increasing x order; plotting in sample order
# makes it zig-zag.
feature_totals = np.sum(X_test, axis=1)
plot_order = np.argsort(feature_totals)
plt.scatter(feature_totals, y_test,  color='black')
plt.plot(feature_totals[plot_order], y_pred[plot_order], color='blue', linewidth=1)
plt.xlabel("total negative + positive word count per article")
plt.ylabel("label (black) / predicted score (blue)")
plt.xticks(())
plt.yticks(())
plt.title("combined regression")
plt.show()




              precision    recall  f1-score   support

           0       0.54      0.69      0.61        64
           1       0.68      0.53      0.60        79

   micro avg       0.60      0.60      0.60       143
   macro avg       0.61      0.61      0.60       143
weighted avg       0.62      0.60      0.60       143

0.6013986013986014
(431, 6830)


In [50]:
# Number of parsed articles; each contributes one row to X and one label to y.
len(articles)

431

In [51]:
# Number of articles labelled hyperpartisan (entries of y equal to 1).
np.count_nonzero(y)

230

In [52]:
# Number of articles not labelled hyperpartisan. Computed from y instead of a
# count copied by hand from another cell's output, which goes stale when the
# data changes.
len(articles) - np.count_nonzero(y)

201

In [None]:
# TODO: add unigrams + bigrams, the new dictionary (lueke) and wordnet effect (64+). Save the features as a sparse matrix and then pickle it. Add an SVM with a linear kernel.
# Use lueke for sentiment analysis.
# 1. convert everything to unigrams(tfidf)
# 2. for every group of features "class" have a model (LR and SVM) 
# 3. have 1 model that combines all features (LR and SVM)