In [1]:
# Mount Google Drive in the Colab runtime at /content/drive; the dataset paths
# configured below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
from pathlib import Path
import numpy as np
import pandas as pd
import os

In [0]:
# Locations of the dataset files on the mounted Google Drive.
train_folder = "/content/drive/My Drive/emnlp/datasets/train-articles" # check that the path to the datasets folder is correct,
dev_folder = "/content/drive/My Drive/emnlp/datasets/dev-articles"     # if not adjust these variables accordingly
train_labels_folder = "/content/drive/My Drive/emnlp/datasets/train-labels-SLC"
# NOTE(review): dev_template_labels_file is assigned a different path in the next
# cell, so when cells run top to bottom the value below is never read.
dev_template_labels_file = "/content/drive/My Drive/emnlp/datasets/dev.template-output-SLC.out"


In [0]:
# Replaces the dev_template_labels_file path assigned in the configuration cell
# above; when cells run top to bottom this is the path the later cells read.
dev_template_labels_file = "/content/drive/My Drive/emnlp/datasets/dev-template-output-SLC"

In [36]:
# Sanity check: number of directory entries in the training articles folder
# and in the training labels folder.
len(os.listdir(train_folder)),len(os.listdir(train_labels_folder))

(350, 350)

In [0]:
import glob
import os.path
import numpy as np
import sys


def read_articles_from_file_list(folder_name, file_pattern="*.txt"):
    """
    Read every article file of a folder, treating each line as one sentence.

    Files in `folder_name` whose name matches `file_pattern` are visited in
    sorted path order. The article id is the file's base name up to its first
    ".", with the first 7 characters dropped (e.g. "article123.txt" -> "123").
    Sentence ids are the 1-based line numbers, stored as strings.

    Returns three parallel lists: article ids, sentence ids and the sentence
    texts with trailing whitespace removed.
    """
    article_id_list, sentence_id_list, sentence_list = [], [], []
    search_pattern = os.path.join(folder_name, file_pattern)
    for path in sorted(glob.glob(search_pattern)):
        stem = os.path.basename(path).split(".")[0]
        article_id = stem[7:]
        with open(path, "r", encoding="utf-8") as handle:
            rows = handle.readlines()
        for line_number, row in enumerate(rows, start=1):
            article_id_list.append(article_id)
            sentence_id_list.append(str(line_number))
            sentence_list.append(row.rstrip())
    return article_id_list, sentence_id_list, sentence_list


def are_ids_aligned(article_id_list, sentence_id_list,
                    reference_article_id_list, reference_sentence_id_list):
    """
    check whether the two lists of ids of the articles and the sentences are aligned

    The four sequences are compared position by position: each article id must
    equal the reference article id and each sentence id must equal the
    reference sentence id. The sequences must also all have the same number of
    entries.

    Returns True when everything matches. On the first problem found, prints
    an "ERROR: ..." line describing it and returns False.
    """
    from itertools import zip_longest

    # zip() would stop at the shortest sequence and report sequences of
    # different lengths as aligned; zip_longest pads the shorter ones with a
    # private sentinel so the length mismatch can be detected.
    missing = object()
    for art, ref_art, sent, ref_sent in zip_longest(article_id_list, reference_article_id_list,
                                                    sentence_id_list, reference_sentence_id_list,
                                                    fillvalue=missing):
        if any(value is missing for value in (art, ref_art, sent, ref_sent)):
            print("ERROR: the lists of article ids and sentence ids do not have the same length")
            return False
        if art != ref_art:
            print("ERROR: article ids do not match: article id = %s, reference article id = %s"%(art, ref_art))
            return False
        if sent != ref_sent:
            print("ERROR: sentence ids do not match: article id:%s,%s sentence id:%s,%s" %(art, ref_art, sent, ref_sent))
            return False
    return True


def read_predictions_from_file(filename):
    """
    Read a tab-separated label file with three fields per line:
    article id, sentence id and label.

    Trailing whitespace is stripped from every line and lines that are empty
    after stripping are skipped.

    Returns three parallel lists of strings: article ids, sentence ids, labels.

    Raises ValueError (naming the file and line number) when a non-empty line
    does not contain exactly three tab-separated fields.
    """
    articles_id, sentence_id_list, gold_labels = ([], [], [])
    # utf-8 is given explicitly, as in read_articles_from_file_list, so the
    # result does not depend on the platform's default encoding.
    with open(filename, "r", encoding="utf-8") as f:
        for line_number, row in enumerate(f, 1):
            row = row.rstrip()
            if not row:
                # e.g. a blank line at the end of the file
                continue
            fields = row.split("\t")
            if len(fields) != 3:
                raise ValueError("%s, line %d: expected 3 tab-separated fields, found %d"
                                 % (filename, line_number, len(fields)))
            article_id, sentence_id, gold_label = fields
            articles_id.append(article_id)
            sentence_id_list.append(sentence_id)
            gold_labels.append(gold_label)
    return articles_id, sentence_id_list, gold_labels


def read_predictions_from_file_list(folder_name, file_pattern):
    """
    Read all label files of a folder and concatenate their contents.

    Files in `folder_name` whose name matches `file_pattern` are read with
    read_predictions_from_file in sorted path order.

    Returns three parallel lists: article ids, sentence ids and labels,
    accumulated over all matching files.
    """
    articles_id, sentence_id_list, gold_labels = [], [], []
    label_paths = sorted(glob.glob(os.path.join(folder_name, file_pattern)))
    for label_path in label_paths:
        file_art_ids, file_sent_ids, file_labels = read_predictions_from_file(label_path)
        articles_id.extend(file_art_ids)
        sentence_id_list.extend(file_sent_ids)
        gold_labels.extend(file_labels)
    return articles_id, sentence_id_list, gold_labels


In [0]:

# Load the training sentences from the *.txt files in the train folder, together
# with the article id and sentence id of every sentence.
train_article_ids, train_sentence_ids, sentence_list = read_articles_from_file_list(train_folder)

# Load the gold labels, article ids and sentence ids from the *.task-SLC.labels
# files in the train labels folder.
reference_articles_id, reference_sentence_id_list, gold_labels = read_predictions_from_file_list(
    train_labels_folder, "*.task-SLC.labels")


In [85]:
# Sanity check: number of training sentences vs. number of gold labels.
# `ds` is only created in a later cell, so using it here raises NameError on a
# fresh kernel; sentence_list (which `ds` is built from) has the same length.
len(sentence_list),len(gold_labels)

(16965, 16965)

In [0]:
# Pair every training sentence with its gold label in a two-column DataFrame
# ('sentence', 'labels') and save it as a CSV file on Drive.
tdf=pd.DataFrame(list(zip(sentence_list,gold_labels)),columns=['sentence','labels'])
tdf.to_csv('/content/drive/My Drive/emnlp/datasets/traindf.csv')

In [39]:
# Sanity check: number of article ids read from the articles vs. from the label files.
len(train_article_ids),len(reference_articles_id)

(16965, 16965)

In [0]:
# Check that the article ids and sentence ids read from the training articles
# match, position by position, the ids read from the gold label files; exit
# with a message if they do not, otherwise report how much data was loaded.
if not are_ids_aligned(train_article_ids, train_sentence_ids, reference_articles_id, reference_sentence_id_list):
    sys.exit("Exiting: training set article ids and gold labels are not aligned")
print("Loaded %d sentences from %d articles" % (len(sentence_list), len(set(train_article_ids))))

In [0]:
# Load the development sentences and the ids/labels of the dev template file.
# Note: reference_articles_id and reference_sentence_id_list are rebound here,
# so from this point on they hold the development ids, not the training ids.
# NOTE(review): the same two calls are repeated in the prediction cell further down.
dev_article_id_list, dev_sentence_id_list, dev_sentence_list = read_articles_from_file_list(dev_folder)
reference_articles_id, reference_sentence_id_list, dev_labels = read_predictions_from_file(dev_template_labels_file)


In [42]:
# Sanity check: number of development sentences vs. number of rows in the template file.
len(dev_article_id_list),len(reference_articles_id)

(2235, 2235)

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [0]:
# Build the feature matrix: a single feature per training sentence, its length
# in characters, reshaped into one column (one row per sentence).
ds = np.array([ len(sentence) for sentence in sentence_list ]).reshape(-1, 1)
# Rebind gold_labels from a Python list to a NumPy array of label strings.
gold_labels=np.array(gold_labels)

In [97]:
# Class distribution: the distinct gold labels and how often each one occurs.
np.unique(gold_labels,return_counts=True)

(array(['non-propaganda', 'propaganda'], dtype='<U14'), array([12245,  4720]))

In [94]:
# Hold out part of the training data for evaluation. random_state is fixed so
# the split is reproducible; the split proportions are left at the library defaults.
X_train,X_test, y_train,y_test=train_test_split(ds,gold_labels,random_state=42)
# Display the shapes of the four resulting arrays.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((12723, 1), (4242, 1), (12723,), (4242,))

In [95]:
# Train an L2-penalised logistic regression on the single sentence-length
# feature built above. class_weight='balanced' is requested because the two
# labels are not equally frequent in the training data.
model = LogisticRegression(penalty='l2', class_weight='balanced', solver="lbfgs")
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [96]:
# Predict labels for the held-out split and print per-class precision, recall and F1.
test_preds=model.predict(X_test)
print (classification_report(y_test,test_preds))

                precision    recall  f1-score   support

non-propaganda       0.78      0.67      0.72      3035
    propaganda       0.39      0.53      0.45      1207

      accuracy                           0.63      4242
     macro avg       0.59      0.60      0.59      4242
  weighted avg       0.67      0.63      0.64      4242



In [0]:
# Read the development sentences and the ids listed in the template file, and
# stop if the two sets of ids are not aligned position by position.
# The labels read from the template (dev_labels) are not used in this cell.
dev_article_id_list, dev_sentence_id_list, dev_sentence_list = read_articles_from_file_list(dev_folder)
reference_articles_id, reference_sentence_id_list, dev_labels = read_predictions_from_file(dev_template_labels_file)
if not are_ids_aligned(dev_article_id_list, dev_sentence_id_list, reference_articles_id, reference_sentence_id_list):
    sys.exit("Exiting: development set article ids and gold labels are not aligned")

# Compute the same single feature as for training (sentence length in
# characters, one column) and predict a label for every development sentence.
dev = np.array([ len(sentence) for sentence in dev_sentence_list ]).reshape(-1, 1)
predictions = model.predict(dev)

In [0]:
# Destination of the prediction file on the mounted Google Drive.
task_SLC_output_file = "/content/drive/My Drive/emnlp/baseline-output-SLC1.txt"
# Write one line per development sentence: article id, sentence id and the
# predicted label, separated by tabs.
with open(task_SLC_output_file, "w") as fout:
    for article_id, sentence_id, prediction in zip(dev_article_id_list, dev_sentence_id_list, predictions):
        fout.write("%s\t%s\t%s\n" % (article_id, sentence_id, prediction))
print("Predictions written to file " + task_SLC_output_file)