# This notebook holds the classes that implement the classification models.

This notebook contains these classes:

**CModel** is a parent class of machine learning models. Defines the interface for training and prediction.
* **CRandom_Forest** is a child class that implements the random forest classifier and its parameters.
* **CGausian_Naive_Bayes** implements the Naive Bayes classifier.
  
**CNeuralNetwork** is a parent class of neural network classifiers. Defines the interface for training and prediction.

* **CLSTM** implements the LSTM neural network and its parameters.
* **CCNN** implements the convolutional neural network and its parameters.

**Firstly, we import all necessary libraries**

In [None]:
import math
import time
import pandas as pd
import numpy as np
import sklearn as skl
import tensorflow as tf
import matplotlib.pyplot as plt
import keras
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB 
from tensorflow.keras.layers import TextVectorization
from keras.models import Sequential 
from keras.layers import Activation, Dense,Flatten, LSTM, Dropout 
from keras.layers import Input, InputLayer
from tensorflow.keras.layers import Embedding
import keras.callbacks
%matplotlib inline 

In [None]:
class CModel:

    '''
    This superclass defines the interface for the machine learning model classes used in experiments.

    Attributes
    ----------
    Xtrain: pandas.Series
            Preprocessed training input samples of articles.
    Xtest: pandas.Series
           Preprocessed test input samples of articles.
    ytrain: pandas.Series
            Training target values used for training the model.
    param_comb: sklearn.model_selection.ParameterGrid
                Hyperparameters that are being tuned during training.
                Not set by this class; subclasses are expected to define it.
    best_hyper_params: dict
                       Set of hyperparameters for the given model that reached the highest score during training.
                       Not set by this class; subclasses are expected to store it after tuning.

    Methods
    -------
    tune_hyperparameters():
        Abstract method that finds the best hyperparameters for the provided data and saves them.
        Returns: None

    make_prediction():
        Abstract method that does prediction on the provided test data using the best hyperparameters.
        Returns: numpy.ndarray
    '''

    def __init__(self,Xtrain:list,Xtest:list,ytrain:pd.Series) -> None:

        '''
        Constructor of superclass. Stores input data to internal parameters.
        Parameters
        ----------
        Xtrain: pandas.Series
                Preprocessed training input samples of articles.
        Xtest: pandas.Series
               Preprocessed test input samples of articles.

        ytrain: pandas.Series
                Training target values used for training the model.
        Returns
        -------
        None
        '''
        self.Xtrain = Xtrain
        self.Xtest = Xtest
        self.ytrain = ytrain

    # The two interface methods live at class level (not nested inside
    # __init__), so that they are real methods of every instance.
    def tune_hyperparameters(self)->None:
        '''
        Abstract method that tunes the hyperparameters of the given ML model and stores the best combination.

        Returns
        -------
        None

        Raises
        ------
        NotImplementedError
            Always; a subclass has to provide the implementation.
        '''
        raise NotImplementedError('tune_hyperparameters() must be implemented by a subclass')

    def make_prediction(self) -> np.ndarray:
        '''
        Abstract method that trains a model with the best hyperparameters and makes a prediction on test data.

        Returns
        -------
        numpy.ndarray
                Calculated prediction on test data.

        Raises
        ------
        NotImplementedError
            Always; a subclass has to provide the implementation.
        '''
        raise NotImplementedError('make_prediction() must be implemented by a subclass')
    
    
    

In [None]:
class CRandom_Forest(CModel):
    '''
    This class holds the random forest ML model. It is a child class of CModel.

    Attributes
    ----------
    Xtrain: pandas.Series
            Preprocessed training input samples of articles.
    Xtest: pandas.Series
           Preprocessed test input samples of articles.
    ytrain: pandas.Series
            Training target values used for training the model.
    param_comb: sklearn.model_selection.ParameterGrid
                Hyperparameters that are being tuned during training
                (n_estimators, max_depth, bootstrap).
    best_hyper_params: dict
                       Hyperparameter combination that reached the highest score during tuning.
                       Exists only after tune_hyperparameters() has been called.

    Methods
    -------
    tune_hyperparameters():
        Evaluates every hyperparameter combination with 5-fold cross-validation and saves the best one.
        Returns None

    make_prediction():
        Tunes the hyperparameters, trains the best model and predicts on the test data.
        Returns numpy.ndarray

    make_prediciton():
        Misspelled name of make_prediction(), kept for existing callers.
        Returns numpy.ndarray
    '''
    def __init__(self,*args)-> None:
        '''
        Subclass constructor of the random forest model, requires the same arguments as the constructor of class CModel.

        Parameters
        ----------
        Xtrain: pandas.Series
                Preprocessed training input samples of articles.
        Xtest: pandas.Series
                Preprocessed test input samples of articles.
        ytrain: pandas.Series
                Training target values used for training the model.

        Returns
        -------
        None
        '''
        super().__init__(*args)

        self.param_comb = ParameterGrid({'n_estimators' : range(150,200,10), 'max_depth' : range(5,8),
                         'bootstrap' : [True, False]})

    def tune_hyperparameters(self) -> None:
        '''
        Evaluates the random forest model with all hyperparameter combinations defined in param_comb.
        Finally, it saves the combination that reached the best mean balanced accuracy in 5-fold validation.

        Returns
        -------
        None
        '''
        val_acc = []
        for params in self.param_comb:
            # The grid keys are exactly the constructor keywords being tuned.
            clf_forest = RandomForestClassifier(n_jobs=-1, **params)
            # No fit() on the full training set here: cross_val_score fits its
            # own clones of the estimator per fold, so a prior fit is wasted work.
            fold_scores = cross_val_score(clf_forest, self.Xtrain, self.ytrain, cv=5,
                                          scoring='balanced_accuracy', n_jobs=-1)
            val_acc.append(np.mean(fold_scores))
        self.best_hyper_params = self.param_comb[int(np.argmax(val_acc))]

    def make_prediction(self) -> np.ndarray:
        '''
        Tunes the hyperparameters, creates a new random forest model with the best combination, trains it
        on train data and calculates the prediction for the test set.

        Returns
        -------
        numpy.ndarray
            Calculated prediction on test data.
        '''
        self.tune_hyperparameters()

        best_forest = RandomForestClassifier(n_jobs=-1, **self.best_hyper_params)
        best_forest.fit(self.Xtrain, self.ytrain)
        return best_forest.predict(self.Xtest)

    def make_prediciton(self) -> np.ndarray:
        '''
        Backward-compatible (misspelled) name of make_prediction().

        Returns
        -------
        numpy.ndarray
            Calculated prediction on test data.
        '''
        return self.make_prediction()
        
       
  

In [None]:
class CGausian_Naive_Bayes(CModel):
    '''
    This class holds the Gaussian Naive Bayes ML model. It is a child class of CModel.

    Attributes
    ----------
    Xtrain: pandas.Series
            Preprocessed training input samples of articles.
    Xtest: pandas.Series
           Preprocessed test input samples of articles.
    ytrain: pandas.Series
            Training target values used for training the model.
    param_comb: sklearn.model_selection.ParameterGrid
                Hyperparameters that are being tuned during training (var_smoothing).
    best_hyper_params: dict
                       Hyperparameter combination that reached the highest score during tuning.
                       Exists only after tune_hyperparameters() has been called.

    Methods
    -------
    tune_hyperparameters():
        Evaluates every hyperparameter combination with 5-fold cross-validation and saves the best one.
        Returns None

    make_prediction():
        Tunes the hyperparameters, trains the best model and predicts on the test data.
        Returns numpy.ndarray

    make_prediciton():
        Misspelled name of make_prediction(), kept for existing callers.
        Returns numpy.ndarray
    '''

    def __init__(self,*args)-> None:
        '''
        Subclass constructor of the Naive Bayes model, requires the same arguments as the constructor of class CModel.
        Parameters
        ----------
        Xtrain: pandas.Series
                Preprocessed training input samples of articles.
        Xtest: pandas.Series
                Preprocessed test input samples of articles.
        ytrain: pandas.Series
                Training target values used for training the model.

        Returns
        -------
        None
        '''
        super().__init__(*args)

        self.param_comb = ParameterGrid({ 'var_smoothing':np.logspace(-12, -6, num=6, base=10)})

    def tune_hyperparameters(self)-> None:
        '''
        Evaluates the Gaussian Naive Bayes model with all hyperparameter combinations defined in param_comb.
        Finally, it saves the combination that reached the best mean balanced accuracy in 5-fold validation.

        Returns
        -------
        None
        '''
        val_acc = []

        for params in self.param_comb:
            # The only grid key is the constructor keyword var_smoothing.
            clf_GaussianNB = GaussianNB(**params)
            # No fit() on the full training set here: cross_val_score fits its
            # own clones of the estimator per fold, so a prior fit is wasted work.
            fold_scores = cross_val_score(clf_GaussianNB, self.Xtrain, self.ytrain, cv=5,
                                          scoring='balanced_accuracy', n_jobs=-1)
            val_acc.append(np.mean(fold_scores))
        self.best_hyper_params = self.param_comb[int(np.argmax(val_acc))]

    def make_prediction(self)-> np.ndarray:
        '''
        Tunes the hyperparameters, creates a new Gaussian Naive Bayes model with the best combination,
        trains it on train data and calculates the prediction for the test set.

        Returns
        -------
        numpy.ndarray
            Calculated prediction on test data.
        '''
        self.tune_hyperparameters()

        best_nb = GaussianNB(**self.best_hyper_params)
        best_nb.fit(self.Xtrain, self.ytrain)
        return best_nb.predict(self.Xtest)

    def make_prediciton(self)-> np.ndarray:
        '''
        Backward-compatible (misspelled) name of make_prediction().

        Returns
        -------
        numpy.ndarray
            Calculated prediction on test data.
        '''
        return self.make_prediction()

      
    

In [None]:
class CNeuralNetwork:
    '''
    This superclass defines the interface for the neural network model classes used in experiments.

    Attributes
    ----------
    Xtrain: pandas.Series
            Preprocessed training input samples of articles.
    Xtest: pandas.Series
            Preprocessed test input samples of articles.
    ytrain: pandas.Series
            Training target values used for training the model.
    embedding_layer: keras.layers.Embedding
            Pre-made embedding layer in preprocessing.
    input_length: int
            Maximal size of one article.
    preprocessing: string
            Current preprocessing, used in the experiment.
    log_path: string
            File into which the chosen hyperparameters are appended.
    dataset_path: string
            Relative path to folder which stores the results.
    labels_count: int
            Number of neurons in output layer (set to 1).
    loss: string
            Loss function used in model training (set to 'binary_crossentropy').
    name: string
            Name of neural network model (empty here, set by subclasses).
    num_epochs: tf.Tensor
            Number of training epochs (constant 15).
    batch_size: tf.Tensor
            Training batch size (constant 32).
    param_comb: sklearn.model_selection.ParameterGrid
            Hyperparameter grid. Not set by this class; subclasses are expected to define it.
    best_hyper_params: dict
            Best hyperparameter combination. Exists only after train_network() has been called.

    Methods
    -------
    plot_loss(): Plots training and validation loss over training epochs.
    Returns: None

    plot_accuracy(): Plots training and validation accuracy over training epochs.
    Returns: None

    plot_performance(): Converts the hyperparameters to a file-name friendly string and calls plot_accuracy and plot_loss.
    Returns: None

    train_network(): Tunes the hyperparameters of the network using k-fold validation. Selects the best hyperparameter combination and saves it.
    Returns: tuple of the per-fold validation accuracies and losses of the last evaluated combination

    make_prediction(): Firstly, it tunes the model using the train_network function. Then trains the model with the best set of hyperparameters. And returns the prediction on test data.
    Returns: numpy.ndarray

    make_prediciton(): Misspelled name of make_prediction(), kept for existing callers.
    Returns: numpy.ndarray

    create_model(parameters): Abstract method that in each subclass builds a model with the given hyperparameters.
    Returns: keras.Sequential
    '''
    def __init__(self,Xtrain:np.array,Xtest:np.array,ytrain:np.array, embedding_layer:keras.layers.Embedding, input_length: int,preprocessing:str, log_path:str,dataset_path:str)-> None:
        '''
        Constructor of superclass. Stores input data to internal parameters.
        Parameters
        ----------
        Xtrain: pandas.Series
                Preprocessed training input samples of articles.
        Xtest: pandas.Series
                Preprocessed test input samples of articles.
        ytrain: pandas.Series
                Training target values used for training the model.
        embedding_layer: keras.layers.Embedding
                Pre-made embedding layer in preprocessing.
        input_length: int
                Maximal size of one article.
        preprocessing: string
                Current preprocessing, used in the experiment.
        log_path: string
                File into which the chosen hyperparameters are appended.
        dataset_path: string
                Relative path to folder which stores the results.

        Returns
        -------
        None
        '''
        self.Xtrain = Xtrain
        self.Xtest = Xtest
        self.ytrain = ytrain
        self.preprocessing = preprocessing
        self.embedding_layer = embedding_layer
        self.labels_count = 1
        self.loss = 'binary_crossentropy'
        self.input_length = input_length
        self.log_path = log_path
        self.dataset_path = dataset_path
        self.name = ''
        # NOTE(review): plain Python ints would presumably work as well for
        # epochs/batch size; kept as tensors to preserve the attribute types.
        self.num_epochs = tf.constant(15)
        self.batch_size = tf.constant(32)

    def plot_loss(self,history:tf.keras.callbacks.History,params: str)-> None:
        '''
        Plots loss values reached over training and saves the figure as a jpg file.
        Parameters
        ----------
        history: tensorflow.keras.callbacks.History
                Training history of the model.
        params: string
                Hyperparameters already converted to a string; used as part of the file name.

        Returns
        -------
        None
        '''
        # Start from a clean pyplot state.
        plt.cla()
        plt.close()

        loss = history.history['loss']
        val_loss = history.history['val_loss']
        epochs = range(1,self.num_epochs+1)

        # The 'figures/' folder under dataset_path is assumed to exist already.
        cm_path = self.dataset_path + 'figures/' + self.name+ ' '+ self.preprocessing+ ' ' + params +'loss.jpg'
        plt.plot(epochs,loss,'b', label = 'Training loss')
        plt.plot(epochs,val_loss,'orange', label = 'Validation loss')
        plt.xlabel('Epochs')
        plt.ylabel('loss')
        plt.legend()
        plt.savefig(cm_path)
        plt.cla()
        plt.close()

    def plot_accuracy(self,history:tf.keras.callbacks.History,params: str) -> None:
        '''
        Plots the accuracy values reached over training and saves the figure as a jpg file.

        Parameters
        ----------
        history: tensorflow.keras.callbacks.History
                Training history of the model.
        params: string
                Hyperparameters already converted to a string; used as part of the file name.

        Returns
        -------
        None
        '''
        # Start from a clean pyplot state.
        plt.cla()
        plt.close()
        # The 'figures/' folder under dataset_path is assumed to exist already.
        cm_path = self.dataset_path  + 'figures/'+self.name+ '_'+ self.preprocessing+ '_' + params +'accuracy.jpg'
        # The 'acc'/'val_acc' keys come from metrics=["acc"] used at compile time.
        acc = history.history['acc']
        val_acc = history.history['val_acc']
        epochs = range(1,self.num_epochs+1)
        plt.plot(epochs,acc,'b', label = 'Training acc')
        plt.plot(epochs,val_acc,'orange', label = 'Validation acc')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.savefig(cm_path)
        plt.cla()
        plt.close()

    def plot_performance(self,history:tf.keras.callbacks.History,params:dict) ->None:
        '''
        Draws the accuracy and the loss reached during training and saves both figures.

        Parameters
        ----------
        history: tensorflow.keras.callbacks.History
                Training history of the model.
        params: dict
                One hyperparameter combination; its string form becomes part of the figure file names.

        Returns
        -------
        None
        '''
        # Strip dict punctuation and turn spaces into underscores so the
        # combination can be embedded in a file name.
        params = str(params)
        params = params.replace('\'','').replace('{','').replace('}','').replace(',','').replace(':','').replace(' ','_')

        self.plot_accuracy(history,params)
        self.plot_loss(history,params)

    def train_network(self)->tuple:
        '''
        Tunes the hyperparameters of the network using 5-fold validation. Selects the combination with
        the highest mean validation accuracy (averaged over all folds and epochs), stores it in
        best_hyper_params and appends it to the log file.

        Returns
        -------
        tuple
            (acc_per_fold, loss_per_fold): per-fold validation accuracy and loss histories
            of the last evaluated hyperparameter combination.
        '''
        kfold = KFold(n_splits=5, shuffle=True)
        val_acc = []
        # Convert the targets once instead of on every fold.
        targets = np.array(self.ytrain)

        for params in self.param_comb:
            acc_per_fold = []
            loss_per_fold = []
            for train, val in kfold.split(self.Xtrain, self.ytrain):

                model = self.create_model(params)
                model.compile(loss=self.loss, optimizer="adam", metrics=["acc"])
                history = model.fit(self.Xtrain[train], targets[train],
                                    validation_data=(self.Xtrain[val], targets[val]),
                                    batch_size=self.batch_size, epochs=self.num_epochs, verbose=0)
                acc_per_fold.append(history.history['val_acc'])
                loss_per_fold.append(history.history['val_loss'])

            # Only the history of the last fold is plotted for each combination.
            self.plot_performance(history,params)

            val_acc.append(np.mean(acc_per_fold))

        self.best_hyper_params = self.param_comb[np.argmax(val_acc)]
        # The with-statement closes the file; no explicit close() is needed.
        with open(self.log_path,'a') as f:
            f.write(str(self.best_hyper_params))
            f.write('\n')

        return acc_per_fold,loss_per_fold

    def make_prediction(self)->np.ndarray:
        '''
        Firstly, it tunes the model using the train_network function.
        Then trains the model with the best set of hyperparameters. And returns the prediction on test data.

        Returns
        -------
        numpy.ndarray
            Rounded model outputs for the test data.
        '''
        self.train_network()
        model = self.create_model(self.best_hyper_params)
        model.compile(loss=self.loss, optimizer="adam", metrics=["acc"])
        model.fit(self.Xtrain, np.array(self.ytrain), batch_size=self.batch_size, epochs=self.num_epochs,verbose=0)

        return np.round(model.predict(self.Xtest,batch_size=self.batch_size,verbose=0))

    def make_prediciton(self)->np.ndarray:
        '''
        Backward-compatible (misspelled) name of make_prediction().

        Returns
        -------
        numpy.ndarray
            Rounded model outputs for the test data.
        '''
        return self.make_prediction()

    def create_model(self,parameters: dict)->keras.Sequential:
        '''
        Abstract method that in each subclass builds a model with the given hyperparameters.

        Parameters
        ----------
        parameters: dict
                One hyperparameter combination taken from param_comb.

        Returns
        -------
        keras.Sequential
                NN model with set hyperparameters.

        Raises
        ------
        NotImplementedError
            Always; a subclass has to provide the implementation.
        '''
        raise NotImplementedError('create_model() must be implemented by a subclass')
        

In [None]:
class CLSTM(CNeuralNetwork):
    '''
    Simple LSTM classifier built on top of the pre-made embedding layer.

    Attributes
    ----------
    Xtrain: pandas.Series
            Preprocessed training input samples of articles.
    Xtest: pandas.Series
            Preprocessed test input samples of articles.
    ytrain: pandas.Series
            Training target values used for training the model.
    embedding_layer: keras.layers.Embedding
            Pre-made embedding layer in preprocessing.
    input_length: int
            Maximal size of one article.
    preprocessing: string
            Current preprocessing, used in the experiment.
    log_path: string
            File into which the chosen hyperparameters are written.
    dataset_path: string
            Relative path to folder which stores the results.
    labels_count: int
            Number of neurons in output layer.
    loss: string
            Loss function used in model training.
    name: string
            Name of the neural network model ("LSTM").
    param_comb: sklearn.model_selection.ParameterGrid
            Grid over 'drop_out' and 'lstm_units'.

    Methods
    -------
    create_model(parameters): Builds the LSTM model with the given hyperparameters.
    Returns: keras.Sequential

    The plotting, training and prediction methods are inherited from CNeuralNetwork.
    '''
    def __init__(self,*args)->None:
        '''
        Subclass constructor of the LSTM model; takes the same arguments as the constructor of class CNeuralNetwork.

        Parameters
        ----------
        Xtrain: pandas.Series
                Preprocessed training input samples of articles.
        Xtest: pandas.Series
                Preprocessed test input samples of articles.
        ytrain: pandas.Series
                Training target values used for training the model.
        embedding_layer: keras.layers.Embedding
                Pre-made embedding layer in preprocessing.
        input_length: int
                Maximal size of one article.
        preprocessing: string
                Current preprocessing, used in the experiment.
        log_path: string
                File into which the chosen hyperparameters are written.
        dataset_path: string
                Relative path to folder which stores the results.
        Returns
        -------
        None
        '''
        super().__init__(*args)
        self.name = "LSTM"
        drop_out_rates = np.arange(0.3,0.6,0.1)
        lstm_unit_counts = 2 ** np.arange(7,9,1)
        self.param_comb = ParameterGrid({'drop_out': drop_out_rates, 'lstm_units': lstm_unit_counts})

    def create_model(self,parameters: ParameterGrid)-> keras.Sequential:
        #https://www.researchgate.net/publication/354589045_OPCNN-FAKE_Optimized_Convolutional_Neural_Network_for_Fake_News_Detection
        '''
        Assembles the LSTM network for one hyperparameter combination.

        Parameters
        ----------
        parameters: ParameterGrid
                Hyperparameter combination; the keys 'lstm_units' and 'drop_out' are read.

        Returns
        -------
        keras.Sequential
                Uncompiled model: input, embedding, LSTM, dropout, flatten, sigmoid dense output.
        '''
        # NOTE(review): the same self.embedding_layer object is added to every
        # model built here; if it is trainable its weights are presumably
        # shared between those models - confirm.
        layer_stack = [
            Input(shape=(self.input_length,)),
            self.embedding_layer,
            LSTM(int(parameters['lstm_units'])),
            Dropout(parameters['drop_out']),
            Flatten(),
            Dense(self.labels_count,activation='sigmoid'),
        ]
        modelLSTM = Sequential()
        for layer in layer_stack:
            modelLSTM.add(layer)
        return modelLSTM
        

In [None]:
class CCNN(CNeuralNetwork):
    '''
    This class holds a simple CNN model with an embedding layer.

    Attributes
    ----------
    Xtrain: pandas.Series
            Preprocessed training input samples of articles.
    Xtest: pandas.Series
            Preprocessed test input samples of articles.
    ytrain: pandas.Series
            Training target values used for training the model.
    embedding_layer: keras.layers.Embedding
            Pre-made embedding layer in preprocessing.
    input_length: int
            Maximal size of one article.
    preprocessing: string
            Current preprocessing, used in the experiment.
    log_path: string
            File into which the chosen hyperparameters are written.
    dataset_path: string
            Relative path to folder which stores the results.
    labels_count: int
            Number of neurons in output layer.
    loss: string
            Loss function used in model training.
    name: string
            Name of the neural network model ("CNN").
    param_comb: sklearn.model_selection.ParameterGrid
            Grid over 'filters', 'drop_out' and 'kernel'.

    Methods
    -------
    create_model(parameters): Builds the CNN model with the given hyperparameters.
    Returns: keras.Sequential

    The plotting, training and prediction methods are inherited from CNeuralNetwork.
    '''
    def __init__(self,*args) ->None:
        '''
        Subclass constructor of the convolutional NN model, requires the same arguments as the constructor of class CNeuralNetwork.

        Parameters
        ----------
        Xtrain: pandas.Series
                Preprocessed training input samples of articles.
        Xtest: pandas.Series
                Preprocessed test input samples of articles.
        ytrain: pandas.Series
                Training target values used for training the model.
        embedding_layer: keras.layers.Embedding
                Pre-made embedding layer in preprocessing.
        input_length: int
                Maximal size of one article.
        preprocessing: string
                Current preprocessing, used in the experiment.
        log_path: string
                File into which the chosen hyperparameters are written.
        dataset_path: string
                Relative path to folder which stores the results.

        Returns
        -------
        None
        '''
        super().__init__(*args)
        self.name = "CNN"
        self.param_comb = ParameterGrid({'filters' : 2 ** np.arange(5,8,1), 'drop_out' : np.arange(0.3,0.6,0.1), 'kernel': np.arange(8,14,2) })

    def create_model(self,parameters: ParameterGrid)-> keras.Sequential:
        '''
        Builds the CNN model with the given hyperparameters. Returns a built one.

        Parameters
        ----------
        parameters: ParameterGrid
                Hyperparameter combination; the keys 'drop_out', 'filters' and 'kernel' are read.

        Returns
        -------
        keras.Sequential
                Uncompiled model: input, embedding, dropout, 1D convolution, max pooling,
                flatten, sigmoid dense output.
        '''
        # Imported locally because the notebook's import cell provides neither
        # a `layers` module nor these two layer classes.
        from keras.layers import Conv1D, MaxPooling1D

        modelCNN = Sequential()
        modelCNN.add(Input(shape=(self.input_length,)))
        # NOTE(review): the same self.embedding_layer object is added to every
        # model built here; if it is trainable its weights are presumably
        # shared between those models - confirm.
        modelCNN.add(self.embedding_layer)
        modelCNN.add(Dropout(parameters['drop_out']))
        modelCNN.add(Conv1D(filters=int(parameters['filters']), kernel_size=int(parameters['kernel']), strides=1, padding="causal", activation="relu"))
        modelCNN.add(MaxPooling1D(5))
        modelCNN.add(Flatten())
        modelCNN.add(Dense(self.labels_count,  activation="sigmoid"))
        return modelCNN
        
    