In [1]:
import tensorflow_hub as hub
import pickle
import numpy as np
import xgboost as xgb
from clean import *
from CountFeatureGenerator import *
from TfidfFeatureGenerator import *
from SvdFeatureGenerator import *
from Word2VecFeatureGenerator import *
from SentimentFeatureGenerator import *
from score import report_score, LABELS, score_submission
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
open_process_rewrite('fnc-1/competition_test_bodies.csv')
open_process_rewrite('fnc-1/competition_test_stances_unlabeled.csv')

In [4]:
test_bodies = pd.read_csv("fnc-1/competition_test_bodies_processed.csv")
test_unlabeled_stances = pd.read_csv("fnc-1/competition_test_stances_unlabeled_processed.csv")
test_stances = pd.read_csv("fnc-1/competition_test_stances.csv")

In [5]:
test_df = pd.merge(test_unlabeled_stances, test_bodies, how='left', on='Body ID')

In [6]:
def add_embedding(emb, bodies, stances):
    body_embedding = emb(bodies["articleBody"].astype(str)).numpy()
    headline_embedding = emb(stances["Headline"].astype(str)).numpy()
    body_aug   = pd.concat([bodies["Body ID"],  
                          pd.DataFrame(body_embedding)],     
                         axis=1)
    stance_aug = pd.concat([stances[["Body ID", "Stance"]], 
                          pd.DataFrame(headline_embedding)], 
                         axis=1)
    return body_aug, stance_aug

In [7]:
use_base = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
test_bodies_aug,  test_stances_aug  = add_embedding(use_base, 
                                                    test_bodies,  
                                                    test_stances)

In [8]:
def make_X_y(bodies_aug, stances_aug):
    X = pd.merge(bodies_aug, stances_aug, how='left', on='Body ID')
    y = X["Stance"]
    X.drop(columns=['Body ID','Stance'], inplace=True)
    return X, y

In [9]:
test_X, test_y = make_X_y(test_bodies_aug, test_stances_aug)

In [10]:
from joblib import dump, load

In [11]:
binary_clf = load('binary.joblib')
binary_pred_y = binary_clf.predict(test_X)

In [40]:
sum(test_y!='unrelated')

7064

In [13]:
sum(binary_pred_y)

3505

In [15]:
related_test_df = test_df[binary_pred_y==1]
related_test_df.shape

(3505, 3)

In [16]:
related_test_df.head()

Unnamed: 0,Headline,Body ID,articleBody
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,Dave Morin's social networking company Path is...
93,Florida woman gets third breast surgically imp...,1128,"A WOMAN has spent $20,000 on surgery to get a ..."
107,"Report: Joan Rivers doctor took selfie, began ...",2467,A Facebook post By Tikal goldie showed a image...
129,‘Three-boobed’ woman: They’re not fake,1617,The woman who claimed she had a third breast h...
155,Is 'hands up' in Hong Kong really the same as ...,1947,A man scheduled to fly from New York to Dallas...


In [17]:
print("generate unigram")
related_test_df["Headline_unigram"] = related_test_df["Headline"].map(lambda x: preprocess_data(x))
related_test_df["articleBody_unigram"] = related_test_df["articleBody"].map(lambda x: preprocess_data(x))

print("generate bigram")
related_test_df["Headline_bigram"] = related_test_df["Headline_unigram"].map(lambda x: getBigram(x))
related_test_df["articleBody_bigram"] = related_test_df["articleBody_unigram"].map(lambda x: getBigram(x))

print("generate trigram")
related_test_df["Headline_trigram"] = related_test_df["Headline_unigram"].map(lambda x: getTrigram(x))
related_test_df["articleBody_trigram"] = related_test_df["articleBody_unigram"].map(lambda x: getTrigram(x))      

generate unigram
generate bigram
generate trigram


In [18]:
related_test_df.shape

(3505, 9)

In [19]:
generators = [
    CountFeatureGenerator(),
    TfidfFeatureGenerator(),
    SvdFeatureGenerator(),
    Word2VecFeatureGenerator(),
    SentimentFeatureGenerator()
]

for g in generators:
    g.process(related_test_df, header='competition_test')

generate counting features
basic counting features for training saved in feature_pkl/competition_test.basic.pkl
xHeadlineTfidf.shape: (3505, 205006)
headline tfidf features of data set saved in feature_pkl/competition_test.headline.tfidf.pkl
xBodyTfidf.shape: (3505, 205006)
body tfidf features of data set saved in feature_pkl/competition_test.body.tfidf.pkl
simTfidf.shape: (3505, 1)
tfidf sim. features of data set saved in feature_pkl/competition_test.sim.tfidf.pkl
xHeadlineTfidf.shape: (3505, 205006)
xBodyTfidf.shape: (3505, 205006)
simTfidf.shape: (3505, 1)
xHeadlineTfidf.shape: (3505, 205006)
xHeadlineSvd.shape: (3505, 50)
headline svd features of data set saved in feature_pkl/competition_test.headline.svd.pkl
xBodySvd.shape: (3505, 50)
body svd features of training set saved in feature_pkl/competition_test.body.svd.pkl
simSvd.shape: (3505, 1)
svd sim. features of data set saved in feature_pkl/competition_test.sim.svd.pkl
generating word2vec features
model loaded
Headline_unigram_ar

In [20]:
related_test_features = []
for g in generators:
    features = g.read(header='competition_test')
    if g.name()=='tfidfFeatureGenerator':
        related_test_features.append(features[-1])
    else:
        for f in features:
            related_test_features.append(f)
related_test_X = np.hstack(related_test_features)
print(related_test_X.shape)

feature names:  ['count_of_Headline_unigram', 'count_of_unique_Headline_unigram', 'ratio_of_unique_Headline_unigram', 'count_of_Headline_bigram', 'count_of_unique_Headline_bigram', 'ratio_of_unique_Headline_bigram', 'count_of_Headline_trigram', 'count_of_unique_Headline_trigram', 'ratio_of_unique_Headline_trigram', 'count_of_articleBody_unigram', 'count_of_unique_articleBody_unigram', 'ratio_of_unique_articleBody_unigram', 'count_of_articleBody_bigram', 'count_of_unique_articleBody_bigram', 'ratio_of_unique_articleBody_bigram', 'count_of_articleBody_trigram', 'count_of_unique_articleBody_trigram', 'ratio_of_unique_articleBody_trigram', 'count_of_Headline_unigram_in_articleBody', 'ratio_of_Headline_unigram_in_articleBody', 'count_of_Headline_bigram_in_articleBody', 'ratio_of_Headline_bigram_in_articleBody', 'count_of_Headline_trigram_in_articleBody', 'ratio_of_Headline_trigram_in_articleBody', 'len_sent_Headline', 'len_sent_articleBody', 'fake_exist', 'fraud_exist', 'hoax_exist', 'false

In [21]:
relation_clf = load('relation.joblib')
relation_pred_y = relation_clf.predict(related_test_X)

In [22]:
relation_pred_y.shape

(3505,)

In [24]:
prediction = binary_pred_y

In [25]:
prediction[prediction==0]=3

In [27]:
prediction[prediction==1] = relation_pred_y

In [31]:
predicted = [LABELS[int(a)] for a in prediction]

In [35]:
report_score(test_y, predicted)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    117    |     5     |    608    |   1173    |
-------------------------------------------------------------
| disagree  |    30     |     4     |    171    |    492    |
-------------------------------------------------------------
|  discuss  |    352    |    15     |   1700    |   2397    |
-------------------------------------------------------------
| unrelated |    103    |    10     |    390    |   17846   |
-------------------------------------------------------------
Score: 6577.75 out of 11651.25	(56.4553159532239%)


56.4553159532239