In [1]:
import pandas as pd
from nltk import tokenize
from tqdm import tqdm

import seaborn as sns
from matplotlib import pyplot as plt



1. Content metadata
    - date, scrap_time, scraping_date, time, author, description, headline, subtitle
    - master_id, master_news_id, url, source
2. AWS MTurk label information
    - `mturk`: 0 or 1
    - `coder.number`: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
    - `n_of_labels`: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
    - survey_date
    - country1, country2, list_of_countries_answered
3. columns for training
    - labelled, sentiment, sentiment.average, sentence_ner
    - grouped, entity_check, entity_validity
    - input.text.clean, input_text, title, category, content, lowercase_content

In [2]:
# Load the training CSV and show its (rows, columns) shape and column names.
# NOTE(review): the recorded output of this cell ends with the tail of a
# warning. If that is pandas' mixed-type DtypeWarning, consider passing an
# explicit dtype= (or low_memory=False) to read_csv — confirm by re-running.
train_df = pd.read_csv('data/peaceindexChallenge_clean_train.csv', encoding='utf-8')
print(train_df.shape)
print(train_df.columns)

(20788, 34)
Index(['X1', 'master_id', 'master_news_id', 'url', 'source', 'country1',
       'country2', 'list_of_countries_answered', 'date', 'input_text', 'mturk',
       'survey_date', 'labelled', 'sentiment', 'n_of_labels', 'grouped',
       'entity_check', 'title', 'subtitle', 'content', 'category',
       'lowercase_content', 'sentence_ner', 'scrap_time', 'scraping_date',
       'time', 'author', 'description', 'headline', 'entity_validity',
       'sentiment.average', 'coder.number', 'train', 'input.text.clean'],
      dtype='object')


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Load the test CSV and show its (rows, columns) shape.
test_df = pd.read_csv('data/peaceindexChallenge_clean_test.csv', encoding='utf-8')
print(test_df.shape)

(8905, 34)


In [4]:
# Count the missing values in each column of the training frame.
train_df.isnull().sum()

X1                                0
master_id                         0
master_news_id                    0
url                               1
source                          405
country1                          0
country2                          0
list_of_countries_answered     5166
date                              0
input_text                        0
mturk                             0
survey_date                     405
labelled                          0
sentiment                        42
n_of_labels                       0
grouped                         405
entity_check                    405
title                             0
subtitle                      14074
content                         405
category                         62
lowercase_content               405
sentence_ner                    863
scrap_time                    12334
scraping_date                 19102
time                          20043
author                        16442
description                 

In [5]:
def preprocess_dataset(df):
    """Explode article-level rows into one row per sentence.

    Each article's ``input_text`` is split with NLTK's sentence tokenizer and
    every resulting sentence inherits the article's ``master_news_id`` and a
    three-way label derived from ``sentiment.average``.

    Args:
        df: DataFrame containing at least the columns ``master_news_id``,
            ``input_text`` and ``sentiment.average``.

    Returns:
        A new DataFrame with columns ``id``, ``sentence`` and ``sentiment``,
        where ``sentiment`` is -1 for a negative average, 1 for a positive
        average and 0 otherwise. A NaN average is neither below nor above
        zero, so it also ends up as 0.
    """
    def _polarity(score):
        # Sign of the averaged score; anything that is neither < 0 nor > 0 is 0.
        if score < 0:
            return -1
        if score > 0:
            return 1
        return 0

    needed_columns = ['master_news_id', 'input_text', 'sentiment.average']
    records = []

    for _, article in tqdm(df[needed_columns].iterrows()):
        sentences = tokenize.sent_tokenize(article['input_text'])
        article_id = article['master_news_id']
        label = _polarity(article['sentiment.average'])

        # Every sentence carries its parent article's id and label.
        records.extend((article_id, sentence, label) for sentence in sentences)

    return pd.DataFrame(records, columns=['id', 'sentence', 'sentiment'])

In [18]:
# Create dataset for irrelevance and sentiment
def create_dataset(raw_df, tag='irrelevant'):
    """Build a task-specific label set from a sentence-level frame.

    Rows containing any missing value are dropped first. The ``sentiment``
    column is then re-encoded according to ``tag``:

    * ``'irrelevant'``: -1 and 1 both become 1, 0 stays 0. Any value outside
      {-1, 0, 1} is absent from the mapping and therefore becomes NaN.
    * ``'sentiment'``: rows whose sentiment is 0 are removed and negative
      values are rewritten to 0; positive values are left as they are.
    * any other tag: the NaN-free rows are returned without re-encoding.

    Args:
        raw_df: DataFrame with a ``sentiment`` column. It is never modified.
        tag: Which label encoding to produce (see above).

    Returns:
        A new, independent DataFrame. The original index labels of the
        surviving rows are kept (the index is not reset).
    """
    # Take an explicit copy so the assignments below write to a frame we own,
    # rather than to an object pandas still tracks as derived from raw_df
    # (which triggers SettingWithCopyWarning on pre-Copy-on-Write pandas).
    new_df = raw_df.dropna(how='any').copy()

    if tag == 'irrelevant':
        new_df['sentiment'] = new_df['sentiment'].map({-1: 1, 0: 0, 1: 1})

    elif tag == 'sentiment':
        # Boolean filtering yields another derived frame; copy again before
        # the in-place .loc write.
        new_df = new_df[new_df['sentiment'] != 0].copy()
        new_df.loc[new_df['sentiment'] < 0, 'sentiment'] = 0

    return new_df

In [24]:
# Train set: split every article into sentences, derive the two label
# encodings, and write each one to disk as a tab-separated file.
raw_df = preprocess_dataset(train_df)
print(train_df.shape, raw_df.shape)

# Same sentence-level frame, two different encodings of the sentiment column.
irre_df = create_dataset(raw_df, tag='irrelevant')
sent_df = create_dataset(raw_df, tag='sentiment')

# Sanity check: row counts and the distinct label values in each set.
print(irre_df.shape, irre_df['sentiment'].unique())
print(sent_df.shape, sent_df['sentiment'].unique())

# NOTE(review): both files are tab-separated, but the sentiment file is saved
# with a .csv extension while the irrelevance file uses .tsv — confirm which
# names downstream readers expect before renaming. The DataFrame index is
# written as the first column because index=False is not passed.
irre_df.to_csv('data/irrelevant_train.tsv', sep='\t')
sent_df.to_csv('data/sentiment_train.csv', sep='\t')

20788it [00:03, 5850.23it/s]


(20788, 34) (50342, 3)
(50342, 3) [1 0]
(41109, 3) [0 1]


In [23]:
# Test set: same pipeline as the train cell, applied to test_df.
# NOTE(review): this cell rebinds raw_df, irre_df and sent_df, so their
# contents depend on which of the two export cells ran last; the recorded
# execution counts (In [23] here, In [24] for the train cell) show the cells
# were last run in the opposite order to how they appear.
raw_df = preprocess_dataset(test_df)
print(test_df.shape, raw_df.shape)

# Same sentence-level frame, two different encodings of the sentiment column.
irre_df = create_dataset(raw_df, tag='irrelevant')
sent_df = create_dataset(raw_df, tag='sentiment')

# Sanity check: row counts and the distinct label values in each set.
print(irre_df.shape, irre_df['sentiment'].unique())
print(sent_df.shape, sent_df['sentiment'].unique())

# NOTE(review): both files are tab-separated, but the sentiment file is saved
# with a .csv extension while the irrelevance file uses .tsv — confirm which
# names downstream readers expect before renaming. The DataFrame index is
# written as the first column because index=False is not passed.
irre_df.to_csv('data/irrelevant_test.tsv', sep='\t')
sent_df.to_csv('data/sentiment_test.csv', sep='\t')

8905it [00:01, 5954.54it/s]


(8905, 34) (21473, 3)
(21473, 3) [1 0]
(17628, 3) [0 1]
