<a href="https://colab.research.google.com/github/MayyarAA/NaturalLangaugeProcessingBERT/blob/main/MSCI598ProjectBERTV2Complex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [2]:
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd
import matplotlib.pyplot as plt
import sklearn    
from sklearn.model_selection import train_test_split

In [None]:
### using simple BERT
!pip install simpletransformers
from simpletransformers.classification import (
    ClassificationModel, ClassificationArgs
)

# Taking in the csv files

In [3]:
from google.colab import drive
drive.mount('/content/drive')     


Mounted at /content/drive


In [5]:
train_body = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/train_bodies.csv")
train_stance = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/train_stances.csv")

In [None]:
train_body.head(5)

In [None]:
train_stance.head(5)

In [None]:
train_stance.loc[train_stance['Body ID']==712].head(2)

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1787,Seth Rogen to Play Apple’s Steve Wozniak,712,discuss


# Merge the stance & body on Body ID

In [7]:
full_sb = pd.merge(train_stance,train_body,on="Body ID", how='left')
full_sb.head(1)

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...


# Set up for Simple BERT

# Setup pearson_corr & spearman_corr for Simple BERT

In [9]:
from scipy.stats import pearsonr, spearmanr

def pearson_corr(preds, labels): return pearsonr(preds, labels)[0]

def spearman_corr(preds, labels): return spearmanr(preds, labels)[0]

# Trim the size of the articleBody 

In [10]:
print(len(full_sb.iloc[0]['articleBody']))

1093


In [11]:
#make full_sb shorter body

full_sb['articleBody'] = full_sb['articleBody'].str.slice(0,50)
full_sb.head(1)

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...


In [12]:

#smaller_sb = full_sb.iloc[0:1000]
smaller_sb = full_sb
print(len(smaller_sb))

49972


In [13]:
train_df_simple = smaller_sb[['Headline', 'articleBody','Stance']].copy()

In [14]:
#turn stance into integer value 
train_df_simple['Stance'] = train_df_simple['Stance'].replace({'unrelated':0,'agree': 1, 'disagree':2 , 'discuss':3})
print(train_df_simple.iloc[0])

Headline       Police find mass graves with at least '15 bodi...
articleBody    Danny Boyle is directing the untitled film\n\n...
Stance                                                         0
Name: 0, dtype: object


# Create BERT Simple Model


In [None]:
# Create a ClassificationModel
model_args = {
    'fps16':True,
    'num_train_epochs': 5,
    'learning_rate': 1e-5,
    'overwrite_output_dir': True,
    'reprocess_input_data':True,
    'process_count':10,
    #'train_batch_size':6,
    'train_batch_size':100,
    'eval_batch_size':100,
    'max_seq_length':50,
    'output_dir' :'/content/drive/MyDrive/Colab Notebooks/MSCI598ProjectOutput'
}

model = ClassificationModel("roberta", "roberta-base",use_cuda=True,num_labels=4,args=model_args)
#model = ClassificationModel("roberta", "roberta-base",use_cuda=True,num_labels=4,args=model_args)
#model = ClassificationModel("roberta", "/content/drive/MyDrive/Colab Notebooks/MSCI598ProjectOutput/",use_cuda=True,num_labels=4,args=model_args)


In [17]:
train_df_simple.columns = ['text_a',"text_b","labels"]

In [None]:
#model.train_model(train_df=train_df_simple, eval_df=eval_df, pearson_corr=pearson_corr, spearman_corr=spearman_corr)
model.train_model(train_df_simple, pearson_corr=pearson_corr, spearman_corr=spearman_corr)

# Test Simple BERT Model


## Prepare Testing Dataframe


In [20]:
def generate_df_from_csv(path):
  df = pd.read_csv(path)
  return df

def produce_set_for_simpleBERT(stance,body):
  full_sb = pd.merge(stance,body,on="Body ID",how='left')
  #print(len(full_sb.columns))
  #print(full_sb.head(4))
  #make full_sb shorter body
  full_sb['articleBody'] = full_sb['articleBody'].str.slice(0,50)
  df_simple= full_sb[['Headline', 'articleBody','Stance']].copy()
  df_simple['Stance'] = df_simple['Stance'].replace({'unrelated':0,'agree': 1, 'disagree':2 , 'discuss':3})
  df_simple.columns = ['text_a',"text_b","labels"]
  return df_simple,full_sb

def df_to_listOflist(df):
  df_2d = df[['text_a','text_b']].copy()
  listOfList = df_2d.to_numpy().tolist()
  return listOfList;

In [21]:
stance_comp = generate_df_from_csv("/content/drive/MyDrive/Colab Notebooks/competition_test_stances.csv")
print((stance_comp.head(2)))

                                            Headline  Body ID     Stance
0  Ferguson riots: Pregnant woman loses eye after...     2008  unrelated
1  Crazy Conservatives Are Sure a Gitmo Detainee ...     1550  unrelated


In [22]:
body_comp = generate_df_from_csv("/content/drive/MyDrive/Colab Notebooks/competition_test_bodies.csv")
print((body_comp.head(2)))

   Body ID                                        articleBody
0        1  Al-Sisi has denied Israeli reports stating tha...
1        2  A bereaved Afghan mother took revenge on the T...


In [23]:
df_comp,df_full_everything = produce_set_for_simpleBERT(stance_comp,body_comp)
df_comp.head(5)

Unnamed: 0,text_a,text_b,labels
0,Ferguson riots: Pregnant woman loses eye after...,A RESPECTED senior French police officer inves...,0
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,Dave Morin's social networking company Path is...,0
2,A Russian Guy Says His Justin Bieber Ringtone ...,A bereaved Afghan mother took revenge on the T...,0
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",Hewlett-Packard is officially splitting in two...,0
4,Argentina's President Adopts Boy to End Werewo...,An airline passenger headed to Dallas was remo...,0


In [24]:
print(len(df_comp))

25413


In [25]:
comp_list = df_to_listOflist(df_comp)
#print(comp_list.pop())

## Testing BERT Simple Model


In [26]:
# test the model
predictions, raw_outputs = model.predict(comp_list)

  0%|          | 0/25413 [00:00<?, ?it/s]

  0%|          | 0/255 [00:00<?, ?it/s]

In [None]:
print(f'predictions {predictions.flat[1]} + predictions length {len(predictions)} of type {type(predictions)}')

predictions 0 + predictions length 25413 of type <class 'numpy.ndarray'>


## Determine Testing Accuracy of BERT Simple Model


In [27]:
import numpy as np
def turn_list_to_df(listObj):
  #df = pd.DataFrame([["a","a"]],columns=['A','B'])
  df = pd.DataFrame()
  # for listRow in listObj:
  #   df.loc[len(df)] = listRow
  #while count < len(listObj):
  count=0
  while count < 10:
    dfFromList = pd.DataFrame()
    dfFromList['a'] = listObj[count][0]
    dfFromList['b'] = listObj[count][1]
    #print(listObj[count])
    #print(dfFromList)
    #df.loc[len(df)] = listObj[count]
    df= df.append(dfFromList)    
    count= count+1
  return df;
def determine_acc_simple_bert(listObj,pred):
  df = turn_list_to_df(listObj)
  df = pd.DataFrame(pred)
  pred_df = pd.DataFrame(pred)
  #df['prediction'] = pred.tolist()
  df = df.join(pred_df)
  print(df.head(0))
  # df['prediction'] = df['prediction'].replace({0:'unrelated',1:'agree', 2:'disagree' , 3:'discuss'})
  # df['correct'] = np.where(df['stance']==df['pred'],1,0)
  # acc = (df['correct'].sum()/(len(df)))
  # return acc
  return 0

def determine_acc_simple_bert(listObj,pred):
  dfStart = pd.DataFrame(listObj, columns=list('xy'))  
  pred_df = pd.DataFrame(pred)
  #df['prediction'] = pred.tolist()
  #dfStart = df.join(pred_df)
  predMini = pred[:49964]
  dfStart['pred']= predMini
  print(dfStart.columns)
  #print(dfStart.head(0))
  # df['prediction'] = df['prediction'].replace({0:'unrelated',1:'agree', 2:'disagree' , 3:'discuss'})
   #dfStart['correct'] = np.where(dfStart['stance']==dfStart['pred'],1,0)
   #acc = (df['correct'].sum()/(len(df)))
  # return acc
  return 0

def determine_acc_simple_bert_v2(df,pred):
  #predMini = pred[:49971]
  #dfMini = df[:49971]
  dfMini = df
  #df['pred']= predMini
  dfMini['pred'] = pred
  dfMini['correct'] = np.where(dfMini['labels']==dfMini['pred'],1,0)
  acc = (dfMini['correct'].sum()/(len(dfMini)))
  return acc

In [None]:
print(f'len of comp_list is {len(comp_list)} len of predictions {len(predictions)}')

len of comp_list is 25413 len of predictions 25413


## Geting Testing Accuracy for BERT Simple Model using the Testing DF & predictions array


In [28]:
acc2 = determine_acc_simple_bert_v2(df_comp,predictions)
print(f'acc {acc2}')

acc 0.8409868964703104


## Determining the score of the BERT Simple Model

In [29]:
LABELS = ['agree', 'disagree', 'discuss', 'unrelated']
LABELS_RELATED = ['unrelated','related']
RELATED = LABELS[0:3]

def score_submission(gold_labels, test_labels):
    score = 0.0
    cm = [[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0]]

    for i, (g, t) in enumerate(zip(gold_labels, test_labels)):
        g_stance, t_stance = g, t
        if g_stance == t_stance:
            score += 0.25
            if g_stance != 'unrelated':
                score += 0.50
        if g_stance in RELATED and t_stance in RELATED:
            score += 0.25

        cm[LABELS.index(g_stance)][LABELS.index(t_stance)] += 1

    return score, cm


def print_confusion_matrix(cm):
    lines = []
    header = "|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|".format('', *LABELS)
    line_len = len(header)
    lines.append("-"*line_len)
    lines.append(header)
    lines.append("-"*line_len)

    hit = 0
    total = 0
    for i, row in enumerate(cm):
        hit += row[i]
        total += sum(row)
        lines.append("|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|".format(LABELS[i],
                                                                   *row))
        lines.append("-"*line_len)
    print('\n'.join(lines))


def report_score(actual,predicted):
    score,cm = score_submission(actual,predicted)
    best_score, _ = score_submission(actual,actual)

    print_confusion_matrix(cm)
    print("Score: " +str(score) + " out of " + str(best_score) + "\t("+str(score*100/best_score) + "%)")
    return score*100/best_score

In [30]:
def convert_int_encoding_to_string_df(df):
  df_copy = df;
  df_copy['labels'] = df_copy['labels'].replace({0:'unrelated',1:'agree', 2:'disagree' , 3:'discuss'})
  df_copy['pred'] = df_copy['pred'].replace({0:'unrelated',1:'agree', 2:'disagree' , 3:'discuss'})
  return df_copy

In [31]:

df_comp_encoded = convert_int_encoding_to_string_df(df_comp)
labels_list_comp = df_comp_encoded['labels'].tolist()
pred_list_comp = df_comp_encoded['pred'].tolist()


In [32]:
print(report_score(labels_list_comp,pred_list_comp))

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    916    |    14     |    653    |    320    |
-------------------------------------------------------------
| disagree  |    238    |    19     |    270    |    170    |
-------------------------------------------------------------
|  discuss  |    722    |    19     |   3171    |    552    |
-------------------------------------------------------------
| unrelated |    266    |     7     |    810    |   17266   |
-------------------------------------------------------------
Score: 8901.5 out of 11651.25	(76.3995279476451%)
76.3995279476451


## Create CSV output of the model prediction for codalab

In [33]:
import csv
from google.colab import files
def generate_answer_csv(stances, predicted):
    stances['pred'] = predicted;  
    path= '/content/drive/MyDrive/Colab Notebooks/MSCI598ProjectOutput/answer.csv'
    with open(str(path), 'w', encoding='UTF8') as f:
        writer = csv.writer(f)
        # write the header
        header = ['Headline', 'Body ID', 'Stance']
        writer.writerow(header)

        for index, row in stances.iterrows():            
            headline = row['Headline']
            body_id = row['Body ID']
            stance = row['pred']
            writer.writerow([headline, body_id, stance])
    files.download(path)

In [37]:
import csv
from google.colab import files
def generate_answer_csv_v2(stances, predicted):
    stances = stances.drop(columns=['Stance','articleBody'])
    stances['Stance'] = predicted;  
    #stances = stances.drop(columns=['pred'])
    path= '/content/drive/MyDrive/Colab Notebooks/MSCI598ProjectOutput/answer.csv'    
    header = ['Headline', 'Body ID', 'Stance']        
    df_for_print = stances;
    #df_for_print = df_for_print.drop(columns=['Stance','articleBody'])
    print(len(df_for_print))
    df_for_print.to_csv(path,index=False)
    files.download(path)

In [None]:
print(f'len of df_full_everything {len(df_full_everything)} + len of pred_list_comp {len(pred_list_comp)}')

len of df_full_everything 25413 + len of pred_list_comp 25413


In [38]:
#generate_answer_csv(df_full_everything,pred_list_comp)
generate_answer_csv_v2(df_full_everything,pred_list_comp)

25413


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
print(df_full_everything.head(2))

                                            Headline  Body ID     Stance  \
0  Ferguson riots: Pregnant woman loses eye after...     2008  unrelated   
1  Apple Stores to install safes to secure gold A...     2008  unrelated   

                                         articleBody       pred  
0  A RESPECTED senior French police officer inves...      agree  
1  A RESPECTED senior French police officer inves...  unrelated  


In [None]:
print(pred_list_comp[2])

agree


##Test on unlabelled comp

In [None]:
## on unlabelled comp

In [None]:
stance_comp_unlabelled = generate_df_from_csv("/content/drive/MyDrive/Colab Notebooks/competition_test_stances_unlabeled.csv")
body_comp_unlabelled = generate_df_from_csv("/content/drive/MyDrive/Colab Notebooks/competition_test_bodies.csv")

In [None]:
print(body_comp_unlabelled.head(1))

   Body ID                                        articleBody
0        1  Al-Sisi has denied Israeli reports stating tha...


In [None]:
def produce_set_for_simpleBERT_unlabelled(stance,body):
  full_sb = pd.merge(stance,body,on="Body ID",how='left')
  #print(len(full_sb.columns))
  #print(full_sb.head(4))
  #make full_sb shorter body
  full_sb['articleBody'] = full_sb['articleBody'].str.slice(0,50)
  df_simple= full_sb[['Headline', 'articleBody']].copy()
  #df_simple['Stance'] = df_simple['Stance'].replace({'unrelated':0,'agree': 1, 'disagree':2 , 'discuss':3})
  df_simple.columns = ['text_a',"text_b"]
  return df_simple,full_sb

def convert_int_encoding_to_string_df_unlabelled(df):
  df_copy = df;  
  df_copy['pred'] = df_copy['pred'].replace({0:'unrelated',1:'agree', 2:'disagree' , 3:'discuss'})
  return df_copy

import csv
from google.colab import files
def generate_answer_csv_v2_unlabelled(stances, predicted):
    stances = stances.drop(columns=['articleBody'])
    stances['Stance'] = predicted;  
    #stances = stances.drop(columns=['pred'])
    path= '/content/drive/MyDrive/Colab Notebooks/MSCI598ProjectOutput/answer.csv'    
    header = ['Headline', 'Body ID', 'Stance']        
    df_for_print = stances;
    #df_for_print = df_for_print.drop(columns=['Stance','articleBody'])
    print(len(df_for_print))
    df_for_print.to_csv(path,index=False)
    files.download(path)

In [None]:
import numpy as np
def determine_acc_simple_bert_v3_unlabelled(df,pred):
  #predMini = pred[:49971]
  #dfMini = df[:49971]
  dfMini = df
  #df['pred']= predMini
  dfMini['pred'] = pred
  dfMini['correct'] = np.where(dfMini['labels']==dfMini['pred'],1,0)
  acc = (dfMini['correct'].sum()/(len(dfMini)))
  return acc

In [None]:
df_comp_unlabelled,df_full_everything_unlabelled = produce_set_for_simpleBERT_unlabelled(stance_comp_unlabelled,body_comp_unlabelled)
df_comp_unlabelled.head(5)

Unnamed: 0,text_a,text_b
0,Ferguson riots: Pregnant woman loses eye after...,A RESPECTED senior French police officer inves...
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,Dave Morin's social networking company Path is...
2,A Russian Guy Says His Justin Bieber Ringtone ...,A bereaved Afghan mother took revenge on the T...
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",Hewlett-Packard is officially splitting in two...
4,Argentina's President Adopts Boy to End Werewo...,An airline passenger headed to Dallas was remo...


In [None]:
comp_list_unlabelled = df_to_listOflist(df_comp_unlabelled)
#print(comp_list.pop())

# test the model
predictions_unlabelled, raw_outputs_unlabelled = model.predict(comp_list_unlabelled)

#generate_answer_csv_v2(df_full_everything_unlabelled,pred_list_comp)

  0%|          | 0/25413 [00:00<?, ?it/s]

  0%|          | 0/255 [00:00<?, ?it/s]

In [None]:
df_comp_unlabelled['pred'] = predictions_unlabelled
df_comp_encoded_unlabelled = convert_int_encoding_to_string_df_unlabelled(df_comp_unlabelled)
#labels_list_comp = df_comp_encoded_unlabelled['labels'].tolist()
pred_list_comp_unlabelled = df_comp_encoded_unlabelled['pred'].tolist()

In [None]:
generate_answer_csv_v2_unlabelled(df_full_everything_unlabelled,pred_list_comp_unlabelled)

25413


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
determine_acc_simple_bert_v3_unlabelled(df_full_everything_unlabelled,pred_list_comp_unlabelled)

KeyError: ignored

In [None]:
report_score(df_full_everything_unlabelled,pred_list_comp_unlabelled)

NameError: ignored

In [None]:
# Preparing eval data
eval_data = [
    [
        "Theoden was the king of Rohan",
        "Gimli's preferred weapon was a battle axe",
        1,
    ],
    [
        "Merry was the king of Rohan",
        "Legolas was taller than Gimli",
        0,
    ],
]
eval_df = pd.DataFrame(eval_data)
eval_df.columns = ["text_a", "text_b", "labels"]

In [None]:
print(f'result {result}')
print(f'model_outputs {model_outputs}')

result {'mcc': 0.0, 'eval_loss': 3.824929714202881}
model_outputs [[ 5.89453125 -1.75097656 -1.97949219 -2.04882812]
 [ 5.76953125 -1.7265625  -2.00390625 -1.94335938]]


In [None]:
# Make predictions with the model
predictions, raw_outputs = model.predict(
    [
        [
            "Legolas was an expert archer",
            "Legolas was taller than Gimli",
        ],[
            "Legolas was an expert archer",
            "Legolas was an expert archer",
        ],[
           "Theoden was the king of Rohan",
           "Gimli's preferred weapon was a battle axe"
           ]
    ]
)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
print(f'predictions now {predictions}')

predictions [3 3 0]


In [None]:
model.train_model(train_df, eval_df=eval_df, pearson_corr=pearson_corr, spearman_corr=spearman_corr)