In [1]:
import tensorflow_hub as hub

In [2]:
# Load version 4 of the Universal Sentence Encoder from TensorFlow Hub; it is
# used below as the callable that turns text into fixed-length embeddings.
use_base = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [3]:
import pandas as pd

In [4]:
# Read the FNC-1 CSV files: article bodies and headline/stance pairs for the
# training split, then the same two tables for the competition test split.
train_bodies, train_stances = (
    pd.read_csv(csv_path)
    for csv_path in ("fnc-1/train_bodies.csv", "fnc-1/train_stances.csv")
)
test_bodies, test_stances = (
    pd.read_csv(csv_path)
    for csv_path in ("fnc-1/competition_test_bodies.csv",
                     "fnc-1/competition_test_stances.csv")
)

In [5]:
# Preview the first rows of the training article bodies (Body ID + articleBody).
train_bodies.head()

Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [6]:
# Preview the first rows of the training headlines with their Body ID and Stance.
train_stances.head()

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [7]:
def add_embedding(emb, bodies, stances):
    """Attach sentence embeddings to the body and stance tables.

    Parameters
    ----------
    emb : callable
        Text encoder. It is called once with the article bodies and once with
        the headlines, and must return an object exposing ``.numpy()`` that
        yields a 2-D array with one row per input text.
    bodies : pandas.DataFrame
        Must contain the columns ``"Body ID"`` and ``"articleBody"``.
    stances : pandas.DataFrame
        Must contain the columns ``"Headline"``, ``"Body ID"`` and ``"Stance"``.

    Returns
    -------
    (body_aug, stance_aug) : tuple of pandas.DataFrame
        ``body_aug`` holds ``"Body ID"`` followed by the body embedding
        columns; ``stance_aug`` holds ``"Body ID"``, ``"Stance"`` followed by
        the headline embedding columns. Embedding columns are named by their
        integer position. Both frames keep the index of their input frame.
    """
    # NOTE: astype(str) renders missing values as the literal text "nan",
    # which is then embedded like any other string.
    body_embedding = emb(bodies["articleBody"].astype(str)).numpy()
    headline_embedding = emb(stances["Headline"].astype(str)).numpy()

    # pd.concat(axis=1) aligns rows by index label, so the embedding frames
    # must carry the same index as the frames they are glued to. Without this,
    # any input whose index is not 0..n-1 (filtered, sampled, shuffled) would
    # produce misaligned rows padded with NaN.
    body_vectors = pd.DataFrame(body_embedding, index=bodies.index)
    headline_vectors = pd.DataFrame(headline_embedding, index=stances.index)

    body_aug = pd.concat([bodies["Body ID"], body_vectors], axis=1)
    stance_aug = pd.concat([stances[["Body ID", "Stance"]], headline_vectors],
                           axis=1)
    return body_aug, stance_aug

In [8]:
# Embed bodies and headlines for both splits with the Universal Sentence Encoder.
train_bodies_aug, train_stances_aug = add_embedding(
    emb=use_base, bodies=train_bodies, stances=train_stances
)
test_bodies_aug, test_stances_aug = add_embedding(
    emb=use_base, bodies=test_bodies, stances=test_stances
)

In [9]:
# Check the stance table after augmentation: Body ID, Stance, then the headline embedding columns.
train_stances_aug.head()

Unnamed: 0,Body ID,Stance,0,1,2,3,4,5,6,7,...,502,503,504,505,506,507,508,509,510,511
0,712,unrelated,-0.03998,0.077764,0.032448,0.00234,0.052369,-0.061372,0.067031,0.013967,...,0.057494,0.031326,-0.062627,-0.017168,-0.020941,0.083314,0.008403,-0.018552,-0.032492,-0.080363
1,158,agree,0.046992,0.010744,0.069049,-0.022822,-0.006118,0.049886,0.040966,-0.007423,...,0.040342,0.060714,0.000363,-0.031237,-0.06786,0.070658,-0.016111,0.028931,-0.041213,-0.068572
2,137,unrelated,0.018255,-0.049816,-0.023277,-0.048364,0.067178,-0.043995,-0.019505,0.00171,...,0.052702,0.065237,0.026011,-0.067817,-0.013193,0.071012,0.00812,0.01918,0.083779,0.038927
3,1034,unrelated,0.003002,-0.047788,0.030941,0.007057,-0.020168,-0.06858,-0.013878,-0.05577,...,-0.010687,0.061648,0.068334,0.002518,0.058192,0.008357,-0.071766,0.047078,0.070513,0.032552
4,1923,disagree,-0.032554,0.083519,0.017997,-0.048322,-0.012216,-0.057962,-0.010599,0.040906,...,0.009376,0.057961,-0.056759,-0.027848,0.053993,-0.058927,-0.049857,-0.060633,0.008571,-0.030744


In [10]:
# Check the body table after augmentation: Body ID, then the body embedding columns.
train_bodies_aug.head()

Unnamed: 0,Body ID,0,1,2,3,4,5,6,7,8,...,502,503,504,505,506,507,508,509,510,511
0,0,-0.051975,0.061438,-0.037152,-0.058729,0.059308,-0.057099,0.017761,-0.041944,0.062315,...,0.032628,-0.064906,0.015322,3.1e-05,-0.026529,0.061622,-0.047899,0.041718,-0.064322,-0.064834
1,4,-0.023436,0.046326,-0.028285,0.017151,0.048567,-0.036017,0.028506,0.000864,0.071112,...,0.062298,-0.073567,0.010405,-0.07255,0.05963,-0.055734,-0.061896,0.055291,0.014313,0.050452
2,5,-0.019251,-0.058158,0.058223,-0.019478,-0.052088,0.042392,0.016966,-0.003825,-0.046936,...,0.023925,-0.066423,-0.013784,0.02556,0.026624,-0.064867,-0.062649,-0.045024,-0.019334,-0.065035
3,6,0.019407,0.052048,-0.015479,-0.030376,-0.057135,-0.015682,0.004288,0.032956,-0.056941,...,0.021446,-0.057244,0.046186,-0.032939,0.023157,-0.057158,0.014345,0.057102,-0.056127,-0.05721
4,7,0.022549,0.052593,-0.041177,-0.056396,0.057507,0.054077,0.055402,-0.017496,-0.057149,...,-0.002091,-0.058937,-0.009348,-0.052749,-0.01838,-0.058195,0.019914,0.052852,-0.056883,-0.058933


In [11]:
def make_X_y(bodies_aug, stances_aug):
    """Join body and headline embeddings into a feature matrix and a label vector.

    Parameters
    ----------
    bodies_aug : pandas.DataFrame
        ``"Body ID"`` plus body embedding columns.
    stances_aug : pandas.DataFrame
        ``"Body ID"``, ``"Stance"`` plus headline embedding columns.

    Returns
    -------
    (X, y) : tuple
        ``X`` has one row per (body, headline) pair with the body embedding
        columns followed by the headline embedding columns; embedding column
        names shared by both inputs receive pandas' default ``_x`` (body) and
        ``_y`` (headline) suffixes. ``y`` is the matching ``"Stance"`` column.
        Both carry a fresh 0..n-1 index.
    """
    # An inner join keeps only real (body, headline) pairs. A left join from
    # the bodies would also emit a row for every body that has no headline,
    # with NaN headline features and a NaN label that downstream label
    # encoding could silently treat as a valid class. Rows keep the order of
    # the body table's keys, so the result is unchanged whenever every body
    # has at least one stance.
    merged = pd.merge(bodies_aug, stances_aug, how='inner', on='Body ID')
    y = merged["Stance"]
    X = merged.drop(columns=['Body ID', 'Stance'])
    return X, y

In [12]:
# Build the feature matrix / stance labels for the training and test splits.
train_X, train_y = make_X_y(bodies_aug=train_bodies_aug,
                            stances_aug=train_stances_aug)
test_X, test_y = make_X_y(bodies_aug=test_bodies_aug,
                          stances_aug=test_stances_aug)

In [13]:
# Inspect the merged features: body embedding columns (_x) followed by headline embedding columns (_y).
train_X.head()

Unnamed: 0,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,...,502_y,503_y,504_y,505_y,506_y,507_y,508_y,509_y,510_y,511_y
0,-0.051975,0.061438,-0.037152,-0.058729,0.059308,-0.057099,0.017761,-0.041944,0.062315,0.017917,...,0.050829,0.070283,0.020578,-0.059297,0.048319,-0.024017,-0.020586,0.011433,-0.050691,0.023712
1,-0.051975,0.061438,-0.037152,-0.058729,0.059308,-0.057099,0.017761,-0.041944,0.062315,0.017917,...,0.035925,0.057652,-0.021223,-0.025872,0.027794,-0.082535,0.00776,-0.065484,0.079216,0.029912
2,-0.051975,0.061438,-0.037152,-0.058729,0.059308,-0.057099,0.017761,-0.041944,0.062315,0.017917,...,-0.016254,0.067857,0.059119,0.029891,-0.005045,-0.039295,0.017845,0.064382,-0.002041,-0.017721
3,-0.051975,0.061438,-0.037152,-0.058729,0.059308,-0.057099,0.017761,-0.041944,0.062315,0.017917,...,0.041267,0.059182,0.051431,-0.031225,0.008105,-0.052372,0.040708,0.035619,-0.038925,0.024641
4,-0.051975,0.061438,-0.037152,-0.058729,0.059308,-0.057099,0.017761,-0.041944,0.062315,0.017917,...,-0.03939,0.060164,0.061696,0.06783,0.072534,0.057049,-0.072158,-0.059757,-0.051678,-0.072506


In [14]:
# List the distinct stance labels present in the training target.
train_y.unique()

array(['unrelated', 'agree', 'discuss', 'disagree'], dtype=object)

In [15]:
# Collapse the stance labels into a binary target:
# 0 for 'unrelated', 1 for every other stance.
oht_train_y = pd.Series([int(stance != 'unrelated') for stance in train_y])

In [16]:
# Sum of the binary labels, i.e. the number of training rows whose stance is not 'unrelated'.
sum(oht_train_y)

13427

In [17]:
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier

In [18]:
# 5-fold cross-validation of a gradient-boosted classifier on the binary
# target (0 = 'unrelated', 1 = any other stance). The classifier from the
# fold with the highest validation accuracy is kept in `best_clf`.
#
# NOTE(review): KFold splits individual rows, and train_X repeats the same
# body embedding on every row that pairs that body with a headline, so one
# article body can sit in both the training and the validation part of a
# fold. The fold accuracies may therefore be optimistic; a split grouped by
# Body ID would avoid this — TODO confirm and revisit.
kf = KFold(n_splits=5, shuffle=True, random_state = 1415926)
train_cv_acc = []  # validation accuracy of each fold, in split order
best_score = 0
for train_idx, val_idx in kf.split(train_X):
    # kf.split yields positional indices, hence .iloc on both frames.
    train_data = train_X.iloc[train_idx]
    train_label = oht_train_y.iloc[train_idx]
    val_data = train_X.iloc[val_idx]
    val_label = oht_train_y.iloc[val_idx]
    clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)
    clf.fit(train_data, train_label)
    pred_label = clf.predict(val_data)
    score = accuracy_score(val_label, pred_label)
    # Record every fold's score; previously the list was created but never
    # filled, so no aggregate CV estimate was available after the loop.
    train_cv_acc.append(score)
    if score>best_score:
        best_score = score
        best_clf=clf
    print("accuracy = ", score)
print("mean CV accuracy = ", np.mean(train_cv_acc), "+/-", np.std(train_cv_acc))

      Iter       Train Loss   Remaining Time 
         1           1.1432           25.27m
         2           1.1210           22.36m
         3           1.1026           22.06m
         4           1.0896           21.12m
         5           1.0746           20.59m
         6           1.0607           20.16m
         7           1.0423           19.77m
         8           1.0348           19.46m
         9           1.0197           19.23m
        10           1.0107           18.95m
        20           0.9363           17.40m
        30           0.8792           16.31m
        40           0.8391           15.16m
        50           0.8078           14.27m
        60           0.7800           13.32m
        70           0.7476           12.37m
        80           0.7200           11.41m
        90           0.6937           10.46m
       100           0.6687            9.51m
       200           0.4680            0.00s
accuracy =  0.9333666833416708
      Iter       Train 

In [19]:
from joblib import dump, load

In [20]:
# Persist the classifier from the best-scoring CV fold to 'binary.joblib'.
dump(best_clf,'binary.joblib')

['binary.joblib']