In [0]:
# Mounting Drive
# Mounts Google Drive at /gdrive; the google.colab import only exists on Colab.
# NOTE(review): the data-loading cells in this notebook read from
# ../input/fnc-stance/ rather than /gdrive -- confirm this mount is still needed.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
# Importing Libraries
# Download the NLTK resources named below (stopwords, punkt, wordnet).
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# Install the transformer packages before simpletransformers is imported further down.
# NOTE(review): neither install is version-pinned, so a fresh run may pick up
# different releases than the ones this notebook was written against.
!pip install --upgrade transformers
!pip install simpletransformers
import re
import string
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout, MaxPooling1D, Concatenate, Input, Flatten, Conv1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from simpletransformers.classification.classification_model import ClassificationModel
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.decomposition import NMF
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import train_test_split
from sklearn import feature_extraction
from gensim import models,corpora
from nltk.corpus import stopwords

In [None]:
%%writefile setup.sh

# Point the build at the CUDA 10.1 toolkit directory.
export CUDA_HOME=/usr/local/cuda-10.1
# Fetch NVIDIA apex and install it from the local checkout with the
# --cpp_ext and --cuda_ext build options enabled.
git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

In [None]:
# Run the setup.sh script written by the previous cell (clones and installs NVIDIA apex).
!sh setup.sh

In [None]:
# Loading Data
# Read the training bodies and stances, then attach each stance row to its
# article body through the shared "Body ID" column.
body = pd.read_csv("../input/fnc-stance/train_bodies.csv")
stances = pd.read_csv("../input/fnc-stance/train_stances.csv")
data = stances.merge(body, on="Body ID")

# Two-step filter: first drop the "unrelated" rows, then drop the "discuss"
# rows from what is left.
data_related = data.loc[data["Stance"].ne("unrelated")]
ag_data = data_related.loc[data_related["Stance"].ne("discuss")]
ag_data.head()

In [None]:
# Number of training rows left after removing "unrelated" and "discuss" stances.
print(len(ag_data))

In [None]:
# X: two-column array of (Headline, articleBody) pairs; Y: the matching stance strings.
X=ag_data[["Headline","articleBody"]].values
Y=ag_data["Stance"].values

In [None]:
# Read the competition test bodies and stances and join them on "Body ID",
# mirroring how the training data is assembled.
test_body = pd.read_csv("../input/fnc-stance/competition_test_bodies.csv")
test_stances = pd.read_csv("../input/fnc-stance/competition_test_stances.csv")
test_data = test_stances.merge(test_body, on="Body ID")

# Same two-step filter as for training: remove "unrelated", then "discuss".
test_data_related = test_data.loc[test_data["Stance"].ne("unrelated")]
test_ag_data = test_data_related.loc[test_data_related["Stance"].ne("discuss")]
test_ag_data.head()

In [None]:
# Encode the string stance labels as integers.
# The encoder is fitted from the source column instead of from Y itself so this
# cell is idempotent: re-running it never re-fits the encoder on already-encoded
# integers, which would make the later le.transform() call on the string test
# labels fail. On a first run the result is identical, because Y was built from
# the same column.
le = LabelEncoder()
Y = le.fit_transform(ag_data["Stance"].values)

# Hold out 20% of the pairs for validation; the fixed random_state keeps the
# split reproducible across runs.
X_train_head, X_val_head, y_train, y_val = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

In [None]:
def _to_pair_frame(pairs, labels):
    """Build a frame with text_a/text_b from columns 0/1 of `pairs`, plus `labels`."""
    return pd.DataFrame({"text_a": pairs[:, 0], "text_b": pairs[:, 1], "labels": labels})


# Sentence-pair frames for the training and validation splits:
# headline goes to text_a, article body to text_b, encoded stance to labels.
X_train = _to_pair_frame(X_train_head, y_train)
X_val = _to_pair_frame(X_val_head, y_val)

In [None]:
# Training configuration handed to ClassificationModel via `args`.
# NOTE(review): the per-option notes below follow the option names; confirm the
# exact semantics against the installed simpletransformers version.
train_args ={
    'learning_rate':1e-5,
    'num_train_epochs': 10,
    'reprocess_input_data': True,
    'process_count': 10,
    'train_batch_size': 10,
    'eval_batch_size': 10,
    'max_seq_length': 512,
    'fp16': True,  # presumably why NVIDIA apex is built by setup.sh -- TODO confirm
    'save_steps': 4800,  # NOTE(review): magic number; document how it was chosen
    'evaluate_during_training': True,  # the train_model() call passes eval_df=X_val for this
    'evaluate_during_training_verbose': True,
    'save_model_every_epoch': False,
    'save_eval_checkpoints': False,
    'overwrite_output_dir': True
}
# RoBERTa-base classifier with two output labels; the F1 report at the end of
# the notebook names the two classes "agree" and "disagree".
model = ClassificationModel('roberta', 'roberta-base', num_labels=2,  args=train_args)

In [None]:
# Fine-tune on the training pairs. X_val is passed as eval_df because
# train_args sets 'evaluate_during_training' to True.
model.train_model(X_train, eval_df=X_val)

In [None]:
# (Headline, articleBody) pairs and string stance labels for the filtered test set.
X_test = test_ag_data[["Headline","articleBody"]].values
Y_test = test_ag_data["Stance"].values
# X_test stays a NumPy array: the prediction cell calls X_test.tolist() on it.
# The DataFrame conversion below is therefore left disabled.
#X_test = pd.DataFrame({"text_a":X_test[:,0],"text_b":X_test[:,1]})

In [None]:
# Preview the first five headline/body pairs.
# X_test is a NumPy array (built with .values), which has no .head() method,
# so slice it instead of calling the DataFrame-only accessor.
X_test[:5]

In [None]:
# Run the fine-tuned model on the test pairs, passed as a list of
# [headline, body] lists. The call returns two values, unpacked here as
# `predictions` (compared against the encoded labels in the F1 report) and `raw`.
# NOTE(review): `raw` is presumably the raw model outputs -- confirm against the
# simpletransformers documentation.
predictions,raw = model.predict(X_test.tolist())

In [None]:
# Encode the test stance strings with the encoder fitted on the training labels.
# Transforming from the source column (rather than from Y_test itself) keeps
# this cell safe to re-run: feeding already-encoded integers back into
# le.transform() would raise on unseen labels.
Y_test = le.transform(test_ag_data["Stance"].values)

In [None]:
from sklearn.metrics import f1_score

def calculate_f1_scores(y_true, y_predicted):
    """Format macro and per-class F1 scores as a multi-line percentage report.

    Args:
        y_true: Ground-truth encoded labels.
        y_predicted: Predicted encoded labels.

    Returns:
        A string with three lines ("F1 macro", "F1 agree", "F1 disagree"),
        each a percentage with three decimals followed by "% " and a newline.
        Label 0 is reported as "agree" and label 1 as "disagree".
    """
    per_class = f1_score(y_true, y_predicted, average=None, labels=[0, 1])
    macro = f1_score(y_true, y_predicted, average='macro')

    # One entry per report line; the shared "% \n" suffix is appended on join.
    report_lines = [
        "F1 macro: {:.3f}".format(macro * 100),
        "F1 agree: {:.3f}".format(per_class[0] * 100),
        "F1 disagree: {:.3f}".format(per_class[1] * 100),
    ]
    return "".join(line + "% \n" for line in report_lines)

# Print the report: a bare expression would display the string's repr with
# literal "\n" escapes instead of one score per line.
print(calculate_f1_scores(Y_test, predictions))