In [1]:
import pandas as pd
import os
import math
from bag_of_words import tf_idf_vectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import *
from nltk.stem.porter import *
from nltk.tokenize import RegexpTokenizer
import numpy as np
from numpy import linalg as LA
# nltk.download('all')

# Loading data sets

In [None]:
train_headline = pd.read_csv(os.path.join('train_stances.csv'))
train_body = pd.read_csv(os.path.join('train_bodies.csv'))
test_headline = pd.read_csv(os.path.join('competition_test_stances.csv'))
test_body = pd.read_csv(os.path.join('competition_test_bodies.csv'))

In [None]:
train = train_headline.join(train_body.set_index('Body ID'), on='Body ID')
test = test_headline.join(test_body.set_index('Body ID'), on='Body ID')

In [None]:
def create_val(train):
    train_subs = []
    train_subs.append(train.query('Stance == "agree"'))
    train_subs.append(train.query('Stance == "disagree"'))
    train_subs.append(train.query('Stance == "discuss"'))
    train_subs.append(train.query('Stance == "unrelated"'))
       
    trains = []
    val_subs = []
    for train_sub in train_subs:
        length = len(train_sub)
        if length > 0:
            index = math.ceil(length * 0.1)
            val_subs.append(train_sub[:index])
            trains.append(train_sub[index:])            
    return pd.concat(trains), pd.concat(val_subs)

In [None]:
train, val = create_val(train.copy())

In [None]:
train.head()

In [None]:
len(val) / len(train)

In [None]:
print(len(val.query('Stance == "agree"')) / len(val))
print(len(val.query('Stance == "disagree"')) / len(val))
print(len(val.query('Stance == "discuss"')) / len(val))
print(len(val.query('Stance == "unrelated"')) / len(val))

In [None]:
print(len(train.query('Stance == "agree"')) / len(train))
print(len(train.query('Stance == "disagree"')) / len(train))
print(len(train.query('Stance == "discuss"')) / len(train))
print(len(train.query('Stance == "unrelated"')) / len(train))

# Clean headlines and bodies

In [None]:
def tokenize(data):
    stemmer = PorterStemmer()
    stopwords_english = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    
    tokenized_words = []
    for word in tokenizer.tokenize(data):
        word = stemmer.stem(word.lower())
        if word not in stopwords_english and word.isalpha():
            tokenized_words.append(word)
    return tokenized_words

In [None]:
def clean_sentences_for_row(row):
    row['Headline'] = ' '.join(tokenize(row['Headline']))
    row['articleBody'] = ' '.join(tokenize(row['articleBody']))
    return row

In [None]:
train = train.apply(lambda row: clean_sentences_for_row(row), axis=1)
val = val.apply(lambda row: clean_sentences_for_row(row), axis=1)
test = test.apply(lambda row: clean_sentences_for_row(row), axis=1)

# Calculating tf idf matrix for all data sets

In [None]:
# train = pd.read_csv(os.path.join('train_features.csv'))
# val = pd.read_csv(os.path.join('val_features.csv'))
# test = pd.read_csv(os.path.join('test_features.csv'))

In [None]:
train_headlines = list(set(train['Headline']))
train_bodies = list(set(train['articleBody']))
train_sentences = train_headlines + train_bodies

In [None]:
val_headlines = list(set(val['Headline']))
val_bodies = list(set(val['articleBody']))
val_sentences = val_headlines + val_bodies

In [None]:
test_headlines = list(set(test['Headline']))
test_bodies = list(set(test['articleBody']))
test_sentences = test_headlines + test_bodies

In [None]:
vectorizer = tf_idf_vectorizer()
vectorizer.fit(train_sentences)

In [None]:
tf_idf_train_headlines = vectorizer.transform(train_headlines)
tf_idf_train_bodies = vectorizer.transform(train_bodies)
tf_idf_train = pd.concat([tf_idf_train_headlines, tf_idf_train_bodies])
tf_idf_train.to_csv('tf_idf_train.csv')

In [None]:
tf_idf_val_headlines = vectorizer.transform(val_headlines)
tf_idf_val_bodies  = vectorizer.transform(val_bodies)
tf_idf_val = pd.concat([tf_idf_val_headlines, tf_idf_val_bodies])
tf_idf_val.to_csv('tf_idf_val.csv')

In [None]:
tf_idf_test_headlines = vectorizer.transform(test_headlines)
tf_idf_test_bodies = vectorizer.transform(test_bodies)
tf_idf_test = pd.concat([tf_idf_test_headlines, tf_idf_test_bodies])
tf_idf_test.to_csv('tf_idf_test.csv')

# Calculating tf idf cosine similarity

In [None]:
def cosine_similarity(vec1, vec2):
    return np.vdot(vec1, vec2) / (LA.norm(vec1) * LA.norm(vec2))

In [None]:
tf_idf_train = pd.read_csv(os.path.join('tf_idf_train.csv'))
tf_idf_val = pd.read_csv(os.path.join('tf_idf_val.csv'))
tf_idf_test = pd.read_csv(os.path.join('tf_idf_test.csv'))

In [None]:
tf_idf_train = tf_idf_train.set_index('sentence')
tf_idf_val = tf_idf_val.set_index('sentence')
tf_idf_test = tf_idf_test.set_index('sentence')

In [None]:
def calc_cos_sim_features_for_row(row, tf_idf_martix):
    headline = row['Headline']
    body = row['articleBody']
    headline_vec = tf_idf_martix.loc[tf_idf_martix.index==headline].values[0]
    body_vec = tf_idf_martix.loc[tf_idf_martix.index==body].values[0]
    row['headline_vec'] = headline_vec
    row['body_vec'] = body_vec
    row['tf_idf_cos_sim'] = cosine_similarity(headline_vec, body_vec)
    return row

In [None]:
train = train.apply(lambda row : calc_cos_sim_features_for_row(row, tf_idf_train), axis=1)
val = val.apply(lambda row : calc_cos_sim_features_for_row(row, tf_idf_val), axis=1)
test = test.apply(lambda row : calc_cos_sim_features_for_row(row, tf_idf_test), axis=1)

In [None]:
train.head()

In [None]:
train.to_csv('train_features.csv', index = False)
val.to_csv('val_features.csv', index = False)
test.to_csv('test_features.csv', index = False)

# Calculating LM for headline and body for all datasets

In [2]:
from LM import Unigram
from Interpolated_LM import interpolated_lm
from LM import Ngram

In [3]:
train = pd.read_csv(os.path.join('train_features.csv'))
val = pd.read_csv(os.path.join('val_features.csv'))
test = pd.read_csv(os.path.join('test_features.csv'))

In [4]:
def inject_oovs(data):
    seen = set()
    data_with_oovs = []
    for word in data:
        if word in seen:
            data_with_oovs.append(word)
        else:
            data_with_oovs.append('oov')
            seen.add(word)
    return data_with_oovs

In [5]:
def replace_oovs(lm, data):
    return [word if word in lm.vocab else 'oov' for word in data]

In [6]:
def perplexity(lm, data):
    log_prob = 0.0
    order = lm.order - 1
    for i in range(order, len(data)):
        word = data[i]
        history = data[i-order:i]
        prob = lm.probability(word, *history)
        log_prob += math.log(prob) if prob > 0.00 else float('-inf')
    return math.exp(-log_prob / (len(data) - order))

In [7]:
def if_normalize(lm):
    return 1 - sum([lm.probability(word) for word in lm.vocab]) <= 0.0001

In [8]:
def KL_divergence(row, lm_headline, lm_body):
    res = 0
    order = lm_headline.order - 1
    headline = row['Headline'].split()
    body = row['articleBody'].split()
    lm_headline = interpolated_lm(Unigram(headline, 0.1), lm_headline, 0.9)
    lm_body = interpolated_lm(Unigram(body, 0.1), lm_body, 0.9)

    for i in range(order, len(headline)):
        word = headline[i]
        history = headline[i-order:i]
        res -= lm_headline.probability(word, *history) * math.log(lm_body.probability(word, *history))
    row['KL_divergence'] = res
    return row

In [9]:
def generate_collection_lm(data):
    headlines = ' '.join(list(set(data['Headline']))).split()
    bodies = ' '.join(list(set(data['articleBody']))).split()
    sentences = headlines + bodies   

    headlines = inject_oovs(headlines)
    bodies = inject_oovs(bodies)
    sentences = inject_oovs(sentences)
    
    lm_headlines = Unigram(headlines, 0.1)
    lm_body = Unigram(bodies, 0.1)
    return lm_headlines, lm_body

In [10]:
lm_train_headlines, lm_train_body = generate_collection_lm(train)
lm_val_headlines, lm_val_body = generate_collection_lm(val)
lm_test_headlines, lm_test_body = generate_collection_lm(test)

In [11]:
train = train.apply(lambda row: KL_divergence(row, lm_train_headlines, lm_train_body), axis=1)
val = val.apply(lambda row: KL_divergence(row, lm_val_headlines, lm_val_body), axis=1)
test = test.apply(lambda row: KL_divergence(row, lm_test_headlines, lm_test_body), axis=1)

In [12]:
train.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody,headline_vec,body_vec,tf_idf_cos_sim,tf_idf_eucliden_dis,tf_idf_Manhattan_dis,common_words_count,KL_divergence
0,willi nelson dead internet freak anoth celebr ...,2186,agree,hoax went viral internet ha left fan one count...,"[ 0. 0. 0. ..., 0. 0. 0.]","[ 0. 0. 0. ..., 0. 0. 0.]",0.251449,38.429421,298.491765,4,7.199204
1,heartbroken girl spend week kfc get dump,1225,agree,year old woman chengdu china southwest sichuan...,"[ 0. 0. 0. ..., 0. 0. 0.]","[ 0. 0. 0. ..., 0. 0. 0.]",0.119022,53.745376,474.133136,4,7.104116
2,small meteorit strike nicaragua capit citi man...,961,agree,loud boom heard saturday night resid nicaragua...,"[ 0. 0. 0. ..., 0. 0. 0.]","[ 0. 0. 0. ..., 0. 0. 0.]",0.223101,31.472045,254.703593,6,4.808862
3,miss jetlin rais fear new style attack libyan ...,1197,agree,u offici tuesday express fear islamist milit a...,"[ 0. 0. 0. ..., 0. 0. 0.]","[ 0. 0. 0. ..., 0. 0. 0.]",0.266811,39.087583,334.98387,10,5.264663
4,angri mob cut rapist peni meat cleaver viewer ...,1959,agree,man hi peni cut angri mob attempt rape teenag ...,"[ 0. 0. 0. ..., 0. 0. 0.]","[ 0. 0. 5.10957524 ..., 0. ...",0.307222,49.897797,483.570265,6,5.996599


In [13]:
train.to_csv('train_features.csv', index = False)
val.to_csv('val_features.csv', index = False)
test.to_csv('test_features.csv', index = False)

# Custom Feature Engineering

In [None]:
train = pd.read_csv(os.path.join('train_features.csv'))
val = pd.read_csv(os.path.join('val_features.csv'))
test = pd.read_csv(os.path.join('test_features.csv'))

In [None]:
tf_idf_train = pd.read_csv(os.path.join('tf_idf_train.csv'))
tf_idf_val = pd.read_csv(os.path.join('tf_idf_val.csv'))
tf_idf_test = pd.read_csv(os.path.join('tf_idf_test.csv'))

In [None]:
tf_idf_train = tf_idf_train.set_index('sentence')
tf_idf_val = tf_idf_val.set_index('sentence')
tf_idf_test = tf_idf_test.set_index('sentence')

In [None]:
train.head()

In [None]:
def count_common_words(headline, body):
    return sum([min(headline.count(word), body.count(word)) for word in set(headline.split())])

In [None]:
def euclidean_distance(vec1, vec2):
    return np.linalg.norm(vec1 - vec2)

In [None]:
def Manhattan_distance(vec1, vec2):
    return sum(np.abs(vec1 - vec2))

In [None]:
def create_more_features(row, tf_idf_martix):
    headline = row['Headline']
    body = row['articleBody']
    headline_vec = tf_idf_martix.loc[tf_idf_martix.index==headline].values[0]
    body_vec = tf_idf_martix.loc[tf_idf_martix.index==body].values[0]
    row['tf_idf_eucliden_dis'] = euclidean_distance(headline_vec, body_vec)
    row['tf_idf_Manhattan_dis'] = Manhattan_distance(headline_vec, body_vec)
    row['common_words_count'] = count_common_words(headline, body)
    return row

In [None]:
train = train.apply(lambda row : create_more_features(row, tf_idf_train), axis=1)
val = val.apply(lambda row : create_more_features(row, tf_idf_val), axis=1)
test = test.apply(lambda row : create_more_features(row, tf_idf_test), axis=1)

In [None]:
train.to_csv('train_features.csv', index = False)
val.to_csv('val_features.csv', index = False)
test.to_csv('test_features.csv', index = False)

# Plot feature values distribution

In [None]:
train = pd.read_csv(os.path.join('train_features.csv'))
val = pd.read_csv(os.path.join('val_features.csv'))
test = pd.read_csv(os.path.join('test_features.csv'))

In [None]:
train.head()

In [None]:
train_subs = []
train_subs.append(train.query('Stance == "agree"'))
train_subs.append(train.query('Stance == "discuss"'))
train_subs.append(train.query('Stance == "disagree"'))
train_subs.append(train.query('Stance == "unrelated"'))

In [None]:
import matplotlib.pyplot as plt
def plot_distribution(data, column, category):
    data[column].hist(normed=True)
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.title('Investigating the distribution of ' + column + ' for ' + str(category))
    plt.show()

def plot_distribution_for_each_category(datas, column):
    for data in datas:
        category = set(data['Stance'])
        plot_distribution(data, column, category)

In [None]:
plot_distribution(train_subs[3], 'common_words_count', 'agree')

In [None]:
plot_distribution_for_each_category(train_subs, 'common_words_count')