<a href="https://colab.research.google.com/github/MichelleFn/Finetuning-greek-bert-for-goverment-gazetes/blob/main/Binary_text_classification_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


**Installing libraries:**




In [None]:
!pip install unicodedata
!pip install transformers
!pip install torch
!pip install tensorflow
!pip install nlpaug

In [None]:
import unicodedata
from transformers import AutoTokenizer, TFAutoModel
import os, glob
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import tensorflow as tf
from keras import backend as K
from imblearn.over_sampling import RandomOverSampler
import nlpaug.augmenter.word as naw
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

**Mounting Google Drive and pointing to the folder that holds the CSV files**

In [None]:
# Mount Google Drive under /content/drive so the CSV files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')
# Directory containing the labelled CSV files; the csv_merge class reads this
# module-level name when it builds file paths.
folder='/content/drive/My Drive/csv/csv/'

**Loading the BERT tokenizer and the TensorFlow model**

In [None]:
# Tokenizer and TensorFlow encoder are loaded from the same Greek uncased BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/bert-base-greek-uncased-v1")
# hidden_dropout_prob=0.5 is forwarded to from_pretrained; presumably it overrides
# the checkpoint's dropout setting -- TODO confirm against the model config.
bert = TFAutoModel.from_pretrained("nlpaueb/bert-base-greek-uncased-v1",hidden_dropout_prob=0.5)
# nlpaug contextual word-embedding augmenter built on the same checkpoint, set to "substitute".
# NOTE(review): `aug` is not referenced anywhere else in the visible notebook.
aug = naw.ContextualWordEmbsAug(model_path='nlpaueb/bert-base-greek-uncased-v1', action="substitute")

**Stop words:**

In [None]:
# Base list: NLTK's Greek stop words (requires the 'stopwords' corpus downloaded above).
stop_words = stopwords.words('greek')

# Extra stop words that the NLTK list does not include.
new_stopwords = [
    'της', 'τη', 'τους', 'ένας', 'ενός', 'ένα', 'μια', 'μιας', 'μιαν',
    'αυτός', 'αυτή', 'αυτό', 'αυτοί', 'αυτά', 'αυτών', 'ούτος', 'αυτούς',
    'εκείνος', 'εκείνη', 'εκείνο', 'εκείνοι', 'εκείνων', 'εκείνα',
    'ποιος', 'ποια', 'ποιοι', 'ποιων', 'ποιους', 'ποιες', 'πια',
    'είμαι', 'είναι', 'είμαστε', 'είστε', 'εγώ', 'εσύ', 'εμείς', 'εσείς',
    'άλλος', 'άλλη', 'άλλο', 'άλλων', 'άλλους', 'άλλα',
    'κατ’', 'ως', 'ή', 'ούτε', 'ποτέ', 'πότε', 'προς', 'πρός', 'υπέρ',
    'άμα', 'πέρι', 'οπως', 'όπως', 'από', 'ενώ', 'συν', 'πώς', 'εάν',
    'προ', 'μη', 'ίσως', 'κάθε', 'καθε', 'ότι', 'ό,τι', 'όσο', 'στα', 'στους',
]

# Append in place so `stop_words` stays a single list holding both sources.
stop_words += new_stopwords
print(stop_words)

**Class for reading csv files and creating datasets:**

In [None]:
class csv_merge:
    """Load the labelled CSV files and turn them into model-ready DataFrames.

    Preprocessing renames ``RawParagraph`` to ``text`` (lower-cased, stop words
    removed) and ``RespAPrediction`` to ``label`` (0 = Non-RespA, 1 = RespA).
    File paths are resolved against the module-level ``folder``.
    """

    # Textual labels used in the CSV files and the class id each one maps to.
    _LABEL_IDS = {'Non-RespA': 0, 'RespA': 1}

    # Files (without extension) loaded by csv_dataset() when no names are given.
    # This is the active list from the original notebook; the other CSV names
    # that were commented out there stay excluded.
    _DEFAULT_CSV_NAMES = ('csva', 'csvb', 'csvc', 'csvd', 'csve', 'csv0',
                          'csv1', 'csv3', 'csv4', 'csv5', 'csv6', 'csv7')

    def __init__(self):
        pass

    def create_dataset(self):
        """Merge every ``*.csv`` in ``folder`` into one preprocessed DataFrame.

        Returns:
            pandas.DataFrame: all rows of all files (fresh 0..n-1 index) after
            ``preprocess_data``; columns other than the two renamed ones are kept.

        Raises:
            ValueError: if ``folder`` contains no CSV file.
        """
        # glob returns files in arbitrary order; sorting keeps the row order (and
        # therefore any seeded split of the merged data) reproducible between runs.
        csv_paths = sorted(glob.glob(os.path.join(folder, "*.csv")))
        if not csv_paths:
            raise ValueError("No CSV files found in " + repr(folder))

        merged = pd.concat(map(pd.read_csv, csv_paths), ignore_index=True)
        return self.preprocess_data(merged)

    def _label_to_id(self, value):
        """Convert one raw label cell to its integer class id.

        Surrounding whitespace (spaces, tabs, newlines) is ignored, so every
        spelling variant of 'RespA' / 'Non-RespA' is accepted. Values that are
        already numeric are passed through ``int``; anything else raises
        ``ValueError`` exactly like the previous ``astype(int)`` did.
        """
        if isinstance(value, str):
            value = value.strip()
            if value in self._LABEL_IDS:
                return self._LABEL_IDS[value]
        return int(value)

    def preprocess_data(self, df, stop_word_list=None):
        """Clean the text column, encode the labels and rename both columns.

        Args:
            df: DataFrame with ``RawParagraph`` and ``RespAPrediction`` columns.
                The caller's frame is left untouched.
            stop_word_list: optional iterable of words to drop from the text.
                Defaults to the module-level ``stop_words``.

        Returns:
            pandas.DataFrame: copy of ``df`` with ``text`` and ``label`` columns.
        """
        # A set gives O(1) membership tests instead of scanning the list per word.
        blocked = set(stop_words if stop_word_list is None else stop_word_list)

        df = df.copy()

        # Lower-case everything because the checkpoint is uncased; a missing
        # paragraph becomes an empty string instead of crashing on .split() later.
        df['RawParagraph'] = df['RawParagraph'].fillna('').astype(str).str.lower()

        # Replace newline characters with spaces in every string cell.
        df = df.replace(r'\n', ' ', regex=True)

        df['RespAPrediction'] = df['RespAPrediction'].map(self._label_to_id).astype(int)

        df['RawParagraph'] = df['RawParagraph'].map(
            lambda paragraph: ' '.join(w for w in paragraph.split() if w not in blocked)
        )

        return df.rename(columns={'RawParagraph': 'text', 'RespAPrediction': 'label'})

    def csv_dataset(self, names=None):
        """Load each CSV separately and return one preprocessed frame per file.

        Args:
            names: optional iterable of file names without the ``.csv`` suffix.
                Defaults to the notebook's original list of twelve files.

        Returns:
            list[pandas.DataFrame]: frames in the same order as ``names``.
        """
        if names is None:
            names = self._DEFAULT_CSV_NAMES

        # Only these two columns are needed downstream.
        col_list = ['RawParagraph', 'RespAPrediction']

        return [
            self.preprocess_data(
                pd.read_csv(os.path.join(folder, name + ".csv"), usecols=col_list)
            )
            for name in names
        ]


**Creating list of datasets**

In [None]:
# This cell rebinds the class name to an instance. Without the guard, running the
# cell a second time calls the instance and fails with "object is not callable".
# Only instantiate while the name still refers to the class.
if isinstance(csv_merge, type):
    csv_merge = csv_merge()

In [None]:
 #Calling the methods and creating a list 
# One preprocessed DataFrame per CSV file, in the order used inside csv_dataset().
dataframes_list=csv_merge.csv_dataset()
# 'text' column of the first per-file frame.
# NOTE(review): `text` is not read again at module level in the visible notebook.
text=dataframes_list[0]['text']

In [None]:
# Render the sixth per-file frame (index 5) as notebook output.
display(dataframes_list[5])


**Encoding input text:**

In [None]:
def encode_bert(text, seq_len=256, bert_tokenizer=None):
    """Tokenize a collection of sentences into fixed-length BERT inputs.

    Args:
        text: iterable of strings (list, NumPy array or pandas Series).
        seq_len: length every sequence is padded or truncated to. The default
            (256) is the value that used to be hard-coded here.
        bert_tokenizer: callable with the Hugging Face tokenizer ``__call__``
            interface. Defaults to the module-level ``tokenizer``.

    Returns:
        dict: ``{'input_ids': ..., 'attention_mask': ...}``, each an int32
        NumPy array of shape ``(len(text), seq_len)``. int32 matches the dtype
        declared by the model's input layers (the arrays used to be float64).
    """
    tok = tokenizer if bert_tokenizer is None else bert_tokenizer
    sentences = list(text)

    # Tokenizers reject an empty batch; return correctly shaped empty arrays instead.
    if not sentences:
        return {
            'input_ids': np.zeros((0, seq_len), dtype=np.int32),
            'attention_mask': np.zeros((0, seq_len), dtype=np.int32),
        }

    # One batched call replaces the per-sentence encode_plus loop. Because of
    # padding='max_length', every row still comes back with exactly seq_len entries.
    encoded = tok(
        sentences,
        max_length=seq_len,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='np',
    )

    return {
        'input_ids': np.asarray(encoded['input_ids'], dtype=np.int32),
        'attention_mask': np.asarray(encoded['attention_mask'], dtype=np.int32),
    }

**Creating data combinations**

In [None]:
def create_dataset_combinations(fold_loop_number,all_csvs): #Splitting csvs into train and test
  """Build one train/test combination of FEKs for leave-one-file-out cross-validation.

  The dataset at position ``fold_loop_number`` becomes the test set; all the
  other datasets are concatenated, in their original order, into the training set.

  Args:
      fold_loop_number: zero-based position of the dataset to hold out.
      all_csvs: iterable of datasets, each exposing ``['text'].values`` and
          ``['label'].values`` (e.g. the DataFrames returned by ``csv_dataset``).

  Returns:
      tuple: ``(x_test, y_test, x_train, y_train)``.

  Raises:
      IndexError: if ``fold_loop_number`` does not address a dataset. Previously
          the function silently returned ``0`` for both test values.
      ValueError: raised by ``np.concatenate`` when no dataset is left for training.
  """
  frames = list(all_csvs)
  if not 0 <= fold_loop_number < len(frames):
    raise IndexError(
        'fold_loop_number %r is out of range for %d datasets' % (fold_loop_number, len(frames))
    )

  print('FEK',fold_loop_number,'used for testing, the rest for training')

  held_out = frames[fold_loop_number]
  x_test = held_out['text'].values
  y_test = held_out['label'].values

  # Everything except the held-out file, order preserved.
  training = [frame for position, frame in enumerate(frames) if position != fold_loop_number]
  x_train = np.concatenate([frame['text'].values for frame in training], axis=0)
  y_train = np.concatenate([frame['label'].values for frame in training], axis=0)

  return x_test,y_test,x_train,y_train

**Creating model**:

In [None]:
def create_model(optimizer,loss,acc,seq_len=256): #Bert model
  """Build and compile the BERT-based classifier.

  Architecture: encoder output at index 1 -> Dense(512, relu) -> Dropout(0.4)
  -> Dense(2, sigmoid). The two output units are meant to be trained against
  one-hot labels of width 2.

  Args:
      optimizer: Keras optimizer passed to ``model.compile``.
      loss: Keras loss passed to ``model.compile``.
      acc: Keras metric passed to ``model.compile``.
      seq_len: length of the token sequences the model accepts. It must equal
          the length used when encoding the text; the default (256) is the
          value that used to be hard-coded here.

  Returns:
      tf.keras.Model: compiled model taking ``input_ids`` and ``attention_mask``.

  Note:
      The encoder is the module-level ``bert`` object. Every model built by this
      function shares (and fine-tunes) that same object, so weights learned by
      an earlier model are still there when the next one is created.
  """
  input_ids=tf.keras.layers.Input(shape=(seq_len,),name='input_ids',dtype='int32')
  mask=tf.keras.layers.Input(shape=(seq_len,),name='attention_mask',dtype='int32')

  # Element 1 of the encoder output is used as the sentence representation
  # (the pooled output in Hugging Face's TF BERT implementation).
  pooled=bert.bert(input_ids,attention_mask=mask)[1]

  hidden=tf.keras.layers.Dense(512,activation='relu')(pooled)
  hidden=tf.keras.layers.Dropout(0.4)(hidden)

  outputs=tf.keras.layers.Dense(2,activation='sigmoid',name='outputs')(hidden)

  model=tf.keras.Model(inputs=[input_ids,mask],outputs=outputs)
  model.compile(optimizer=optimizer, loss=loss, metrics=[acc])
  return model

In [None]:
# `lr=` and `decay=` are legacy optimizer arguments that current Keras optimizers
# reject. The legacy `decay` meant lr_t = lr / (1 + decay * step); InverseTimeDecay
# with decay_steps=1 expresses exactly the same schedule through the supported API.
optimizer= tf.keras.optimizers.Adam(
    learning_rate=tf.keras.optimizers.schedules.InverseTimeDecay(
        initial_learning_rate=0.0001, decay_steps=1, decay_rate=1e-6
    )
)
# Binary cross-entropy applied element-wise to the two sigmoid outputs.
loss=tf.keras.losses.BinaryCrossentropy()
# Reported under the name 'accuracy' in the training logs.
acc=tf.keras.metrics.BinaryAccuracy('accuracy')


**10-fold cross-validation (one FEK held out for testing per fold):**

In [None]:
# Width of the one-hot labels; must equal the number of output units of create_model().
NUM_CLASSES = 2

fold_results = []  # [loss, accuracy] returned by model.evaluate for every fold

# Every FEK is used exactly once as the test set. The previous loop overwrote the
# loop variable with 4, so all iterations tested on the same file.
for fold in range(len(dataframes_list)):
  tf.keras.backend.clear_session()

  # create_model() wraps the module-level `bert`, whose weights are updated by
  # fit(). Reload the pretrained checkpoint and use a fresh optimizer so that no
  # fold starts from weights (or optimizer state) already trained on another
  # fold's data.
  bert = TFAutoModel.from_pretrained("nlpaueb/bert-base-greek-uncased-v1",hidden_dropout_prob=0.5)
  fold_optimizer = type(optimizer).from_config(optimizer.get_config())

  x_test,y_test,x_train,y_train= create_dataset_combinations(fold,dataframes_list)

  print('Size of training set: ',len(x_train))
  print('Size of test set: ', len(x_test))

  # One-hot encode with a fixed width. Deriving the width from y.max()+1 yields a
  # single column whenever a fold contains only class 0, which no longer matches
  # the two output units of the model.
  val_inputs=encode_bert(x_test)
  test_labels=np.eye(NUM_CLASSES)[y_test]

  train_inputs=encode_bert(x_train)
  train_label_array=y_train
  train_labels=np.eye(NUM_CLASSES)[y_train]

  model = create_model(fold_optimizer,loss,acc)
  # NOTE: the held-out FEK is also used as validation data during fit().
  model.fit(train_inputs, train_labels,epochs=3,batch_size=16, validation_data=(val_inputs,test_labels))
  results=model.evaluate(val_inputs,test_labels)
  fold_results.append(results)

print('Mean [loss, accuracy] over', len(fold_results), 'folds:', np.mean(fold_results, axis=0))



  
  

  

**Stratified 10-fold cross-validation:**

In [None]:
# Merge every CSV file found in the Drive folder into one preprocessed DataFrame.
df= csv_merge.create_dataset()

In [None]:
# Inputs (preprocessed paragraph text) and targets (0/1 labels) of the merged dataset.
X_data=df['text']
Y_data=df['label']

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
# 10-fold stratified cross-validation: the complete dataset is split into train and test once per fold
skf = StratifiedKFold(n_splits=10,shuffle=True, random_state=1)

# Width of the one-hot labels; must equal the number of output units of create_model().
NUM_CLASSES = 2

score=[]  # [loss, accuracy] returned by model.evaluate for every fold
for a, (train_index, test_index) in enumerate(skf.split(X_data, Y_data)):

    print(f"Generating Inputs for fold {a}")

    # split() yields positions, so select rows positionally; label-based []
    # only happens to work while the index is exactly 0..n-1.
    train_X, test_X = X_data.iloc[train_index], X_data.iloc[test_index]
    train_y, test_y = Y_data.iloc[train_index], Y_data.iloc[test_index]

    # One-hot encode with a fixed width instead of y.max()+1, which collapses to
    # a single column if a split ever contains only class 0.
    train_inputs=encode_bert(train_X)
    train_label_array=train_y.values
    train_labels=np.eye(NUM_CLASSES)[train_label_array]

    val_inputs=encode_bert(test_X)
    test_label_array=test_y.values
    test_labels=np.eye(NUM_CLASSES)[test_label_array]

    # create_model() wraps the module-level `bert`, whose weights are updated by
    # fit(). Without a reload, later folds start from an encoder that has already
    # been trained on their test samples. The optimizer is recreated as well so
    # its state and step counter do not carry over.
    bert = TFAutoModel.from_pretrained("nlpaueb/bert-base-greek-uncased-v1",hidden_dropout_prob=0.5)
    fold_optimizer = type(optimizer).from_config(optimizer.get_config())

    model = create_model(fold_optimizer,loss,acc)
    # NOTE: patience equals the number of epochs, so this callback cannot stop
    # a run early; it is kept to preserve the original configuration.
    early_stopping=tf.keras.callbacks.EarlyStopping(patience=5)
    model.fit(train_inputs, train_labels,epochs=5,batch_size=8,
              validation_data=(val_inputs,test_labels),callbacks=[early_stopping])
    results=model.evaluate(val_inputs,test_labels)
    score.append(results)

print('Mean [loss, accuracy] over', len(score), 'folds:', np.mean(score, axis=0))

    

In [None]:
#for testing purposes: the first eight feks (indices 0-7) for training, the ninth (index 8) for testing
# NOTE: neither encode_bert() nor create_model() reads this module-level name, so
# assigning it does not change the sequence length they use.
seq_len= 512

# Width of the one-hot labels; must equal the number of output units of create_model().
NUM_CLASSES = 2

d3 = pd.concat(dataframes_list[:8])
display(d3.head())  # preview only; printing the whole frame floods the output
train_inputs=encode_bert(d3['text'])
print({name: values.shape for name, values in train_inputs.items()})
train_y=d3['label']

label_array=train_y.values
print('Number of training labels:', len(label_array))

train_label_array=train_y
train_labels=np.eye(NUM_CLASSES)[label_array]
print('Shape of one-hot training labels:', train_labels.shape)

# The test labels must come from the same frame as the test inputs. They used to
# be taken from the training frame d3, which neither matches in length nor in content.
test_inputs=encode_bert(dataframes_list[8]['text'])
test_y=dataframes_list[8]['label']
test_label_array=test_y
test_labels=np.eye(NUM_CLASSES)[test_y.values]

# Start from the pretrained checkpoint and a fresh optimizer instead of whatever
# state earlier cells left in the module-level `bert` and `optimizer`.
bert = TFAutoModel.from_pretrained("nlpaueb/bert-base-greek-uncased-v1",hidden_dropout_prob=0.5)
run_optimizer = type(optimizer).from_config(optimizer.get_config())

early_stopping=tf.keras.callbacks.EarlyStopping(patience=5)
model = create_model(run_optimizer,loss,acc)
model.fit(train_inputs, train_labels,epochs=5,batch_size=8,
              validation_data=(test_inputs,test_labels),callbacks=[early_stopping])
# Evaluate on this cell's own test inputs; `val_inputs` was a leftover from the
# cross-validation cell above.
results=model.evaluate(test_inputs,test_labels)
