# README
---

### How do we run tests?
In practice, we run tests and gather results locally because the colab runtime can disconnect from time to time, and it takes a significant amount of time to run some tests (Part 1.2 for example). We run tests locally and save results to corresponding `json` files under the `output` folder, which we shall analyze later.

### The file structure?
```Text
\--- sample_data
    \--- diabetes
        \--- diabetes_train.csv
        \--- diabetes_val.csv
        \--- diabetes_test.csv
    \--- fake_news
        \--- fake_news_train.csv
        \--- fake_news_val.csv
        \--- fake_news_test.csv
\--- output
    \--- part1-1.json
    \--- part1-1-test.json
    \--- part1-2.json
    \--- part1-3.json
    \--- part1-3-test.json
    \--- part1-4.json
    \--- part2-pre.json
    \--- part2-tfidf.json
    \--- part2-model.json
```

# **Library**

## Math
This library contains the mathematical helper functions used in the project, namely *sigmoid* and *cartesian*.

---

#### NOTICE
**Both of the below math functions are copied from the previous project, written by Zeying(Ing) Tian.**



In [None]:
import numpy as np


def sigmoid(x: np.ndarray) -> np.ndarray:
    """
    Numerically stable logistic sigmoid, ``1 / (1 + exp(-x))``.

    The exponential is only ever evaluated on ``-|x|``, so it cannot
    overflow for large-magnitude inputs (the naive form overflows, and
    emits a ``RuntimeWarning``, for very negative ``x``).
    :param x: Input value(s).
    :return: Element-wise sigmoid of ``x``.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # always within [0, 1]
    # For x >= 0 this is exactly the textbook expression; for x < 0 the
    # algebraically equivalent exp(x) / (1 + exp(x)) is used instead.
    return np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))


def _as_object_vector(values):
    """Flatten ``values`` into a 1-D object array without coercing element types."""
    if isinstance(values, np.ndarray):
        return values.astype(object).ravel()
    flat = np.asarray(values, dtype=object).ravel()
    for index, value in enumerate(flat):
        # Unwrap numpy scalars to native Python values, which is what a
        # numeric -> object cast does as well.
        if isinstance(value, np.generic):
            flat[index] = value.item()
    return flat


def cartesian(arrays, out=None):
    """
    Generate a cartesian product of input arrays.

    Parameters
    ----------
    arrays : list of array-like
        1-D arrays to form the cartesian product of. Elements are kept
        as they are, so one array may mix types, e.g. ``[None, 0.9]`` or
        ``[16, 'full']``.
    out : ndarray
        Array to place the cartesian product in. When omitted, a new
        array with ``dtype=object`` is allocated.

    Returns
    -------
    out : ndarray
        2-D array of shape (M, len(arrays)) containing cartesian products
        formed of input arrays. The first input varies slowest and the
        last one fastest. An empty ``arrays`` list yields shape (1, 0).

    Examples
    --------
    >>> cartesian(([1, 2, 3], [4, 5], [6, 7]))
    array([[1, 4, 6],
           [1, 4, 7],
           [1, 5, 6],
           [1, 5, 7],
           [2, 4, 6],
           [2, 4, 7],
           [2, 5, 6],
           [2, 5, 7],
           [3, 4, 6],
           [3, 4, 7],
           [3, 5, 6],
           [3, 5, 7]], dtype=object)

    """

    arrays = [_as_object_vector(x) for x in arrays]

    n = 1
    for array in arrays:
        n *= array.size

    if out is None:
        out = np.zeros([n, len(arrays)], dtype=object)
    if n == 0:
        # At least one input is empty, so the product has no rows to fill.
        return out

    # Column k repeats each of its values once per combination of the
    # columns to its right, and the whole pattern is tiled once per
    # combination of the columns to its left.
    repeats = n
    for column, array in enumerate(arrays):
        repeats //= array.size
        out[:, column] = np.tile(np.repeat(array, repeats), n // (array.size * repeats))
    return out


## Common Types
This part defines the interfaces and common types used in this project. These definitions facilitate code development and do not influence the final performance.


In [None]:
from typing import Dict, Any, List, Callable, Tuple, Optional
import numpy as np

# A pre-processing pipeline: a list of callables, each taking two arrays and
# an optional boolean and returning a pair of arrays.
PreprocessPipeline = List[Callable[[np.ndarray, np.ndarray, Optional[bool]], Tuple[np.ndarray, np.ndarray]]]

# Grid-search space: maps a string key to the list of candidate values for it.
AllPossibleModelParameters = Dict[str, List[Any]]

# Report dictionary keyed by metric name.
CrossValidationMean = Dict[str, Any]


class LearningModel:
    """
    Common interface for the learning models in this project.

    Every method of this base class is a placeholder that does nothing
    and returns ``None``; concrete models override them.
    """

    def fit(self, x: np.ndarray, y: np.ndarray, **kwargs):
        """Placeholder for training on features ``x`` and labels ``y``."""

    def predict(self, x: np.ndarray) -> np.ndarray:
        """Placeholder for predicting labels for features ``x``."""

    def get_params(self) -> Dict[str, Any]:
        """Placeholder for returning the model's parameters."""

    def set_params(self, new_params: Dict[str, Any]) -> bool:
        """Placeholder for updating the model's parameters from ``new_params``."""


## Data processing and analyzing
These utilities provide convenient methods to process data, as well as commonly used code for cross validation, grid search, etc. Note that the grid search, which is encapsulated in the `get_best_model_parameter` function, returns records of model performance along the fitting process, which eases our analysis. As we do not use 5-fold cross validation in this project, we merely apply fitted models to the provided validation set; this logic is defined in `cross_validate_with_val_data`. Some of the methods defined in this part are unused.

---

#### NOTICE
**The code below was written by Zeying(Ing) Tian and is copied from the previous project.**

In [None]:
!pip install simple_chalk

from typing import List, Tuple, Callable, Dict, Any
from sklearn.metrics import classification_report
import numpy as np
from simple_chalk import chalk
from tqdm import tqdm

np.set_printoptions(linewidth=200)

CrossValidationMean = Dict[str, Any]


def preprocess_data(x: np.ndarray,
                    y: np.ndarray,
                    plugins: List[Callable[[np.ndarray, np.ndarray, bool], Tuple[np.ndarray, np.ndarray]]],
                    verbose=True) -> Tuple[np.ndarray, np.ndarray]:
    """
    Pre-process data by running it through a series of plug-ins, in order.
    :param x: Features, or raw data. May be 1-D (e.g. one raw entry per
              instance) or 2-D (instances x features).
    :param y: Labels, or placeholder
    :param plugins: Callables applied one after another; each receives
                    ``(x, y, verbose)`` and returns the new ``(x, y)``.
    :param verbose: Decide whether to print logs.
    :return: The first value is training X, whereas the second value is the label y.
    """
    if verbose:
        number_of_instances = x.shape[0]
        # 1-D input has no second axis; report it as a single feature
        # instead of failing on x.shape[1].
        number_of_features = x.shape[1] if x.ndim > 1 else 1
        print(f'{chalk.bold("-" * 15 + "STARTING PRE-PROCESSING DATA" + "-" * 15)}\n'
              f'{chalk.bold("TOTAL ENTRIES:  ")} {number_of_instances}\n'
              f'{chalk.bold("TOTAL FEATURES: ")} {number_of_features}\n')

    for plugin in plugins:
        x, y = plugin(x, y, verbose)

    return x, y


def get_best_model_parameter(
        model_parameters: Dict[str, List[Any]],
        model: Callable[..., LearningModel],
        x: np.ndarray,
        y: np.ndarray,
        cross_validator: Callable[
            [np.ndarray, np.ndarray, int, LearningModel, np.ndarray, np.ndarray], CrossValidationMean],
        verbose=True,
        method='accuracy',
        val_x: np.ndarray = None,
        val_y: np.ndarray = None,
        n_fold: int = 5,
) -> Tuple[Dict[str, Any], List[Tuple[Dict[str, Any], Dict[str, Any]]]]:
    """
    Use a grid search over every combination in the parameter space and
    select the combination with the highest score reported by the
    validator for the chosen ``method``.
    :param model_parameters: A defined dict of possible model parameters.
    :param model: A model constructor; called with one combination as keyword arguments.
    :param x: X
    :param y: y
    :param cross_validator: A cross validation function, called as
                            ``cross_validator(x, y, n_fold, model, val_x, val_y)``.
    :param verbose: Decide whether to print logs.
    :param method: Key of the validator's report used for ranking; one of
                   'accuracy', 'macro f1' or 'weighted f1'.
    :param val_x: val_x
    :param val_y: val_y
    :param n_fold: Fold count handed to the validator.
    :return: Selected best combination, and a list of tuples containing a combination and its corresponding result
    :raises ValueError: If ``method`` is not one of the supported keys.
    """
    supported_methods = ('accuracy', 'macro f1', 'weighted f1')
    if method not in supported_methods:
        # Fail before any (potentially very long) fitting is started.
        raise ValueError(f'Unsupported method {method!r}; expected one of {supported_methods}.')

    if verbose:
        print(f'{chalk.bold("-" * 15 + "START FINDING BEST PARAMETERS" + "-" * 15)}\n')

    model_parameter_keys = list(model_parameters)
    all_combinations = cartesian([model_parameters[key] for key in model_parameter_keys])

    best_combination = dict()
    best_score = None
    results = []

    for combination in tqdm(all_combinations, leave=True):
        combination_input = dict(zip(model_parameter_keys, combination))
        m = model(**combination_input)
        result = cross_validator(x, y, n_fold, m, val_x, val_y)

        results.append((combination_input, result))

        # Strictly greater: on ties the earliest combination is kept. The
        # first evaluated combination is always a candidate, even when it
        # scores 0.
        score = result[method]
        if best_score is None or score > best_score:
            best_combination = combination_input
            best_score = score

    if verbose:
        print(f'{chalk.bold("-" * 15 + "BEST PARAMETERS FOUND" + "-" * 15)}\n'
              f'{chalk.greenBright(best_combination)}\n')

    return best_combination, results


def cross_validate(
        x: np.ndarray,
        y: np.ndarray,
        n_fold: int,
        model: LearningModel,
        val_x: np.ndarray = None,
        val_y: np.ndarray = None,
) -> CrossValidationMean:
    """
    Dispatch to the matching validation routine.
    :param x: X
    :param y: y
    :param n_fold: N fold; only used when no validation set is supplied.
    :param model: A model
    :param val_x: val_x
    :param val_y: val_y
    :return: The report of whichever routine was run.
    """
    validation_set_given = val_x is not None and val_y is not None
    if not validation_set_given:
        return cross_validate_with_n_fold(x, y, n_fold, model)
    return cross_validate_with_val_data(x, y, model, val_x, val_y)


def cross_validate_with_val_data(
        x: np.ndarray,
        y: np.ndarray,
        model: LearningModel,
        val_x: np.ndarray = None,
        val_y: np.ndarray = None,
) -> CrossValidationMean:
    """
    Fit the model once on (x, y) and score it on both the training data
    and the supplied validation data.
    :param x: X
    :param y: y
    :param model: A model whose ``fit`` returns a 5-tuple; the last four
                  items are the epochs run, the final gradient change,
                  the convergence flag and the accuracy record.
    :param val_x: val_x
    :param val_y: val_y
    :return: The report; un-prefixed metric keys refer to the validation data.
    """

    def _report(expected: np.ndarray, predicted: np.ndarray) -> Dict[str, Any]:
        return classification_report(expected.astype(int), predicted.astype(int),
                                     output_dict=True, zero_division=0)

    _, epochs_used, last_change, converged, history = model.fit(x, y, val_x=val_x, val_y=val_y)

    on_training = _report(y, model.predict(x))
    on_validation = _report(val_y, model.predict(val_x))

    summary = dict()
    summary["training macro f1"] = on_training["macro avg"]["f1-score"]
    summary["training weighted f1"] = on_training["weighted avg"]["f1-score"]
    summary["training accuracy"] = on_training["accuracy"]
    summary["number_of_epochs_to_converge"] = epochs_used
    summary["final_gradient_change"] = float(last_change)
    summary["is_converged"] = bool(converged)
    summary["accuracy_report"] = history
    summary["macro f1"] = on_validation["macro avg"]["f1-score"]
    summary["weighted f1"] = on_validation["weighted avg"]["f1-score"]
    summary["accuracy"] = on_validation["accuracy"]
    return summary


def cross_validate_with_n_fold(
        x: np.ndarray,
        y: np.ndarray,
        n_fold: int,
        model: LearningModel,
) -> CrossValidationMean:
    """
    Run an n-fold cross validation and average the scores over the folds.

    Folds are consecutive slices of ``len(x) // n_fold`` rows; rows left
    over after the last fold are only ever used for training.
    :param x: X
    :param y: y
    :param n_fold: N fold
    :param model: A model
    :return: The report.
    """
    dataset = np.append(x, y[:, np.newaxis], axis=1)
    row_count = dataset.shape[0]
    fold_size = row_count // n_fold

    def _score(rows: np.ndarray) -> Dict[str, Any]:
        # The last column holds the label, everything before it the features.
        predicted = model.predict(rows[:, :-1])
        return classification_report(rows[:, -1].astype(int), predicted.astype(int),
                                     output_dict=True, zero_division=0)

    training_reports = []
    validation_reports = []

    for fold in range(n_fold):
        lower = fold * fold_size
        upper = min(lower + fold_size, row_count)

        held_out = dataset[lower:upper]
        # Everything outside [lower, upper) is used for training.
        kept = np.concatenate((dataset[:lower], dataset[upper:]), axis=0)

        model.fit(kept[:, :-1], kept[:, -1].astype(int))

        training_reports.append(_score(kept))
        validation_reports.append(_score(held_out))

    def _mean(reports: List[Dict[str, Any]], *path: str) -> float:
        total = 0
        for report in reports:
            value = report
            for key in path:
                value = value[key]
            total += value
        return total / n_fold

    return {
        "training macro f1": _mean(training_reports, "macro avg", "f1-score"),
        "training weighted f1": _mean(training_reports, "weighted avg", "f1-score"),
        "training accuracy": _mean(training_reports, "accuracy"),

        "accuracy": _mean(validation_reports, "accuracy"),
        "macro f1": _mean(validation_reports, "macro avg", "f1-score"),
        "weighted f1": _mean(validation_reports, "weighted avg", "f1-score"),
    }


def calculate_inaccuracy(x: np.ndarray, y: np.ndarray, model: LearningModel) -> float:
    """
    Compute the fraction of instances the model predicts incorrectly.
    :param x: X
    :param y: y
    :param model: A model exposing ``predict``
    :return: Inaccuracy rate
    """
    predictions = model.predict(x)
    total = y.shape[0]
    wrong = np.count_nonzero(np.not_equal(predictions, y))
    return wrong / total




## File reading and Data visualization

Define a utility function to read CSV files, and import some common plotting libraries such as matplotlib.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import simple_chalk as chalk
from typing import Callable, List, Dict, Tuple


def read_csv(
        file_path: str,
        verbose: bool = True,
        name_of_columns: List[str] = None,
) -> pd.DataFrame:
    """
    Load a CSV file into a pandas data frame, optionally renaming its
    columns and printing a short preview.
    :param file_path: The path to the CSV file.
    :param verbose: Decide whether to print logs.
    :param name_of_columns: The name of the pandas columns, if any.
    :return: A pandas data frame.
    """
    frame = pd.read_csv(file_path)

    if name_of_columns:
        frame.columns = name_of_columns

    if not verbose:
        return frame

    log_lines = [
        f'{chalk.bold.greenBright("The data set has successfully been loaded.")}',
        f'{chalk.bold("PATH: ")} {file_path}',
        f'{chalk.bold("-" * 15 + "PRINTING DATA PREVIEW" + "-" * 15)}',
        f'{frame.head()}',
    ]
    print("\n".join(log_lines) + "\n")

    return frame


## Logistic Regression Model

The model defined below is the Logistic Regression model, which builds on the code provided. Essentially, we have added interfaces to allow for mini-batch training and momentum. Note that we keep track of the training and validation accuracy, as well as the loss, during fitting, so that we can analyze how the model performs.

In [None]:
import numpy as np

from typing import Dict, Tuple, List, Any
from simple_chalk import chalk



def calculate_accuracy(y_pred: np.ndarray, y_true: np.ndarray) -> float:
    """
    Compute the fraction of predictions that equal the true labels.
    :param y_pred: Predicted labels.
    :param y_true: True labels.
    :return: Number of matching positions divided by ``len(y_pred)``.
    """
    matches = np.count_nonzero(y_pred == y_true)
    total = len(y_pred)
    return float(matches / total)


class LogisticRegression(LearningModel):
    """
    Binary logistic regression trained by (mini-batch) gradient descent,
    with optional momentum.

    While fitting, the model records training accuracy, validation
    accuracy, gradient norm and loss at regular epoch intervals.
    """

    def __init__(
            self,
            add_bias: bool = True,
            learning_rate: float = .1,
            epsilon: float = 6.5e-3,
            verbose: bool = False,
            mini_batch: int = None,
            momentum: float = None,
            epoch: int = None,
            accuracy_record_num: int = 20
    ):
        """
        :param add_bias: Append a constant column of ones to the features.
        :param learning_rate: Step size of each weight update.
        :param epsilon: Fitting stops once the gradient norm is at or below this.
        :param verbose: Decide whether to print a summary after fitting.
        :param mini_batch: Batch size; ``None`` (or a size not smaller than
                           the data) means full-batch gradient descent.
        :param momentum: Momentum coefficient; falsy disables momentum.
        :param epoch: Maximum number of epochs; must be set before ``fit``.
        :param accuracy_record_num: Roughly how many progress records to
                                    collect over ``epoch`` epochs.
        """
        self.add_bias = add_bias
        self.learning_rate = learning_rate
        self.epsilon = epsilon  # to get the tolerance for the norm of gradients
        self.mini_batch = mini_batch
        self.momentum = momentum
        self.verbose = verbose
        self.epoch = epoch
        self.accuracy_record_num = accuracy_record_num

        # Model Parameters.
        self.weights = None
        self.history_gradients = None

    def fit(
            self,
            x: np.ndarray,
            y: np.ndarray,
            **kwargs
    ) -> Tuple[
        LearningModel,
        int,
        float,
        bool,
        List[Tuple[int, float, float, float, float]]
    ]:
        """
        Fit the weights by gradient descent.
        :param x: Training features.
        :param y: Training labels.
        :param kwargs: Optional ``val_x`` / ``val_y`` validation data, used
                       only for the recorded validation accuracy. When
                       either is missing, that field is recorded as NaN.
        :return: ``(self, epochs run, final gradient norm, converged flag,
                 records)`` where each record is ``(epoch, training
                 accuracy, validation accuracy, gradient norm, loss)``.
        :raises ValueError: If ``epoch`` has not been set.
        """
        if self.epoch is None:
            raise ValueError("`epoch` (the maximum number of epochs) must be set before calling fit().")

        # Prepare X
        if x.ndim == 1:
            x = x[:, None]
        if self.add_bias:
            x = np.column_stack([x, np.ones(x.shape[0])])

        # Set up parameters
        val_x, val_y = kwargs.get("val_x"), kwargs.get("val_y")
        has_validation_data = val_x is not None and val_y is not None
        self.weights = np.zeros(x.shape[1])
        self.history_gradients = list()
        raw_gradients, epoch_run = np.inf, 0
        accuracy_record = list()
        # At least 1, so that `epoch < accuracy_record_num` does not end in
        # a modulo-by-zero below.
        accuracy_check_point = max(1, int(self.epoch // self.accuracy_record_num))

        while epoch_run < self.epoch:
            # Get the segmented training
            # sets based on batch size.
            training_sets = self.separate_training_data(x, y)

            for batch_x, batch_y in training_sets:
                raw_gradients = self.gradient(batch_x, batch_y)
                self.update_weights(raw_gradients)

            epoch_run += 1

            if epoch_run % accuracy_check_point == 0:
                validation_accuracy = calculate_accuracy(self.predict(val_x), val_y) \
                    if has_validation_data \
                    else float("nan")
                accuracy_record.append(
                    (
                        epoch_run,
                        calculate_accuracy(self.predict(x), y),
                        validation_accuracy,
                        np.linalg.norm(raw_gradients),
                        self.loss(x, y)
                    )
                )

            # The norm is taken over the gradient of the last batch of the epoch.
            if np.linalg.norm(raw_gradients) <= self.epsilon:
                break

        if self.verbose:
            print(
                f'{chalk.bold("-" * 15 + "COMPLETED FITTING" + "-" * 15)}\n'
                f'EPOCHS: {chalk.green.bold(epoch_run)}\n'
                f'GRADIENT CHANGE: {chalk.yellowBright.bold(np.linalg.norm(raw_gradients))}\n'
                f'FINAL WEIGHTS: {chalk.blueBright(self.weights)}\n')

        return self, epoch_run, np.linalg.norm(raw_gradients), np.linalg.norm(
            raw_gradients) <= self.epsilon, accuracy_record

    def update_weights(self, raw_gradients: np.ndarray) -> None:
        """
        Take one descent step using the raw gradient, smoothed by momentum
        when momentum is enabled.
        :param raw_gradients: Gradient of the current batch.
        :return: None; ``self.weights`` is updated in place.
        """

        g = self.update_weights_momentum(len(self.history_gradients), raw_gradients) \
            if self.momentum \
            else raw_gradients

        # NOTE(review): one entry is appended per update, so this list grows
        # for the whole duration of fit(); only the latest entry is read back.
        self.history_gradients.append(g)

        self.weights -= self.learning_rate * g

    def update_weights_momentum(self, t: int, raw_gradients: np.ndarray) -> np.ndarray:
        """
        Blend the raw gradient with the previously applied one.
        :param t: Number of updates applied so far.
        :param raw_gradients: Gradient of the current batch.
        :return: ``momentum * previous + (1 - momentum) * raw``; the
                 previous term is absent for the very first update.
        """
        if t == 0:
            return (1 - self.momentum) * raw_gradients
        else:
            return self.momentum * self.history_gradients[t - 1] + (1 - self.momentum) * raw_gradients

    def separate_training_data(
            self,
            x: np.ndarray,
            y: np.ndarray,
    ) -> List[Tuple[np.ndarray, np.ndarray]]:
        """
        Shuffle the training data and cut it into batches of
        ``mini_batch`` rows. A short final batch is padded to full size
        with rows taken from the start of the shuffled data.
        :param self:
        :param x: X
        :param y: y
        :return: A list of segmented batches; a single ``(x, y)`` batch when
                 ``mini_batch`` is ``None`` or not smaller than the data.
        :raises ValueError: If ``mini_batch`` is smaller than 1.
        """
        if self.mini_batch is None or self.mini_batch >= x.shape[0]:
            return [(x, y)]
        if self.mini_batch < 1:
            # A non-positive size would never consume any rows.
            raise ValueError(f"`mini_batch` must be at least 1, got {self.mini_batch}.")

        shuffled = np.append(x if x.ndim > 1 else x[:, None], y if y.ndim > 1 else y[:, None], axis=1)
        np.random.shuffle(shuffled)

        result = list()
        for start in range(0, shuffled.shape[0], self.mini_batch):
            batch = shuffled[start:start + self.mini_batch]
            missing = self.mini_batch - batch.shape[0]
            if missing > 0:
                batch = np.append(shuffled[:missing], batch, axis=0)
            # The label travels as the last column of the combined array.
            result.append((batch[:, :-1], batch[:, -1]))

        return result

    def _prepare_features(self, x: np.ndarray) -> np.ndarray:
        """Make ``x`` 2-D and append the bias column unless it already matches the weights."""
        if x.ndim == 1:
            x = x[:, None]
        if self.add_bias and x.shape[1] != len(self.weights):
            x = np.column_stack([x, np.ones(x.shape[0])])
        return x

    def predict(self, x):
        """
        Make predictions based on the weights given.
        :param x: X
        :return: The predicted y: 1 where the sigmoid output exceeds 0.5, else 0.
        """
        x = self._prepare_features(x)
        return (sigmoid(np.dot(x, self.weights)) > 0.5).astype(int)

    def get_params(self):
        """
        :return: The instance's attribute dictionary itself (not a copy).
        """
        return self.__dict__

    def set_params(self, new_params: Dict[str, Any]):
        """
        Update the constructor parameters named in ``new_params``.
        Unknown keys are ignored.
        :param new_params: Mapping of parameter name to new value.
        :return: Always True.
        """
        tunable = ('add_bias', 'learning_rate', 'epsilon', 'verbose',
                   'mini_batch', 'momentum', 'epoch', 'accuracy_record_num')
        for k, v in new_params.items():
            if k in tunable:
                setattr(self, k, v)
        return True

    def gradient(self, x, y):
        """
        Calculate the gradient of logistic regression.
        :param x: Features, with the bias column already appended if used.
        :param y: Labels.
        :return: Gradient with respect to the weights.
        """
        number_of_instances = x.shape[0]
        yh = sigmoid(np.dot(x, self.weights))  # predictions  size N
        grad = np.dot(x.T, yh - y) / number_of_instances  # divide by N because cost is mean over N points
        return grad

    def loss(self, x: np.ndarray, y: np.ndarray) -> float:
        """
        Mean binary cross-entropy of the current weights on (x, y).
        :param x: Features, with or without the bias column.
        :param y: Labels.
        :return: The mean loss.
        """
        z = np.dot(self._prepare_features(x), self.weights)
        # logaddexp(0, t) == log(1 + exp(t)) but cannot overflow; the direct
        # form turns into inf (and 0 * inf = nan) for large |z|.
        return float(np.mean(y * np.logaddexp(0, -z) + (1 - y) * np.logaddexp(0, z)))


# **Part 1**

Import datasets

In [None]:
import pprint as pp
import json
# Load the three diabetes splits as numpy arrays. Later cells use the last
# column as the label and all other columns as features.
training_data = read_csv("sample_data/diabetes/diabetes_train.csv").to_numpy()
test_data = read_csv("sample_data/diabetes/diabetes_test.csv").to_numpy()
val_data = read_csv("sample_data/diabetes/diabetes_val.csv").to_numpy()

[1;92mThe data set has successfully been loaded.[0m
[1mPATH: [0m sample_data/diabetes/diabetes_train.csv
[1m---------------PRINTING DATA PREVIEW---------------[0m
   Pregnancies  Glucose  BloodPressure  ...  DiabetesPedigreeFunction  Age  Outcome
0            5      144             82  ...                     0.452   58        1
1            0      128             68  ...                     1.391   25        1
2            9      156             86  ...                     1.189   42        1
3            1      144             82  ...                     0.335   46        1
4            0      179             90  ...                     0.686   23        1

[5 rows x 9 columns]

[1;92mThe data set has successfully been loaded.[0m
[1mPATH: [0m sample_data/diabetes/diabetes_test.csv
[1m---------------PRINTING DATA PREVIEW---------------[0m
   Pregnancies  Glucose  BloodPressure  ...  DiabetesPedigreeFunction  Age  Outcome
0            5      117             92  ...         

## Part 1.1 - Finding a baseline model

In this part, we shall find a baseline model that works. Concretely, we select the learning rate to be `2e-4`. Finally, we also record the model's performance on the test dataset.

In [None]:

# Single-combination "grid": the baseline configuration.
# mini_batch is 600 and momentum is disabled.
params = {
    "learning_rate": [2e-4],
    "epoch": [3e6],
    "mini_batch": [600],
    "momentum": [None],
    "accuracy_record_num": [500]
}

best_param, results = get_best_model_parameter(
    params,
    LogisticRegression,
    training_data[:, :-1],
    training_data[:, -1],
    cross_validate,
    val_x=val_data[:, :-1],
    val_y=val_data[:, -1]
)
print(f'{chalk.bold("-" * 15 + "BEST PARAMETERS FOUND" + "-" * 15)}\n')
pp.pprint(best_param)
print(f'{chalk.bold("-" * 15 + "ALL COMBINATION DATA" + "-" * 15)}\n')
pp.pprint(results)

# Write results to a file; the context manager closes it even on error.
with open("./output/part1-1.json", 'w') as f:
    f.write(json.dumps(results, indent=4))

# ------------------------------------------------------
# These are used to record results on the test data set.
# ------------------------------------------------------
best_param, results = get_best_model_parameter(
    params,
    LogisticRegression,
    training_data[:, :-1],
    training_data[:, -1],
    cross_validate,
    val_x=test_data[:, :-1],
    val_y=test_data[:, -1]
)

# Write results to a file
with open("./output/part1-1-test.json", 'w') as f:
    f.write(json.dumps(results, indent=4))

    

[1m---------------START FINDING BEST PARAMETERS---------------[0m



100%|██████████| 1/1 [00:38<00:00, 38.97s/it]


[1m---------------BEST PARAMETERS FOUND---------------[0m
[92m{'learning_rate': 0.0002, 'epoch': 3000000.0, 'mini_batch': 600, 'momentum': None, 'accuracy_record_num': 500}[0m

[1m---------------BEST PARAMETERS FOUND---------------[0m

{'accuracy_record_num': 500,
 'epoch': 3000000.0,
 'learning_rate': 0.0002,
 'mini_batch': 600,
 'momentum': None}
[1m---------------ALL COMBINATION DATA---------------[0m

[({'accuracy_record_num': 500,
   'epoch': 3000000.0,
   'learning_rate': 0.0002,
   'mini_batch': 600,
   'momentum': None},
  {'accuracy': 0.75,
   'accuracy_report': [(6000,
                        0.6911764705882353,
                        0.65,
                        0.08713942715558523,
                        0.5586109578949447),
                       (12000,
                        0.6911764705882353,
                        0.67,
                        0.04462687552685464,
                        0.5538811039944188),
                       (18000,
                

### Graph plotting example

Code below is part of actual graph plotting code for Part1.1

In [None]:
# Read the recorded results back; the context manager closes the file.
with open('./output/part1-1.json',) as f:
    data = json.load(f)

# Each record is (epoch, training accuracy, validation accuracy,
# gradient norm, loss); split the records into one list per field.
records = data[0][1]["accuracy_report"]
epoch = [record[0] for record in records]
train_acc = [record[1] for record in records]
val_acc = [record[2] for record in records]
norm = [record[3] for record in records]
loss = [record[4] for record in records]

plt.plot(epoch, train_acc, label = "Training accuracy")
plt.plot(epoch, val_acc, label = "Vallidation accuracy")
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.title('accuracy vs epoch')
plt.legend()
plt.show()
plt.plot(epoch, norm)
plt.title('norm vs epoch')
plt.show()
plt.plot(epoch, loss)
plt.title('loss vs epoch')
plt.show()

## Part 1.2 - Mini-batch

Here, we explore how `mini-batch` affects model performance. We have selected a range of batch sizes, namely 16, 64, 128, 256. As we have found out, the mini-batch models do not converge, and thus we do not apply these models on the test dataset.

---

#### NOTICE

This part takes a long time to run.

In [None]:
# Same settings as the baseline, varying only the mini-batch size.
params = {
    "learning_rate": [2e-4],
    "epoch": [3e6],
    "mini_batch": [16,64,128,256],
    "momentum": [None],
    "accuracy_record_num": [500]
}

best_param, results = get_best_model_parameter(
    params,
    LogisticRegression,
    training_data[:, :-1],
    training_data[:, -1],
    cross_validate,
    val_x=val_data[:, :-1],
    val_y=val_data[:, -1]
)

print(f'{chalk.bold("-" * 15 + "BEST PARAMETERS FOUND" + "-" * 15)}\n')
pp.pprint(best_param)
print(f'{chalk.bold("-" * 15 + "ALL COMBINATION DATA" + "-" * 15)}\n')
pp.pprint(results)

# Write results to a file; the context manager closes it even on error.
with open("./output/part1-2.json", 'w') as f:
    f.write(json.dumps(results, indent=4))

### Graph plotting example

Code below is part of actual graph plotting code for Part1.2

In [None]:
# Read the recorded results back; the context manager closes the file.
with open('./output/part1-2.json',) as f2:
    data = json.load(f2)


def _report_columns(report):
    """Split records into five per-field lists: epoch, train acc, val acc, norm, loss."""
    return [[record[field] for record in report] for field in range(5)]


def _plot_accuracy(size, epochs, training, validation, ylim=None):
    """Plot training and validation accuracy over the epochs for one batch size."""
    plt.plot(epochs, training, label = f"size {size} Training accuracy")
    plt.plot(epochs, validation, label = f"size {size} Vallidation accuracy")
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.title(f'size {size} accuracy vs epoch')

    plt.legend()
    plt.show()


def _plot_comparison(title, series):
    """Overlay one curve per batch size; ``series`` holds (size, epochs, values) tuples."""
    for size, epochs, values in series:
        plt.plot(epochs, values, label = f"size {size}")
    plt.title(title)
    plt.legend()
    plt.show()


# Results are stored in the order of the mini_batch list: 16, 64, 128, 256.
epoch16, train_acc16, val_acc16, norm16, loss16 = _report_columns(data[0][1]["accuracy_report"])
epoch64, train_acc64, val_acc64, norm64, loss64 = _report_columns(data[1][1]["accuracy_report"])
epoch128, train_acc128, val_acc128, norm128, loss128 = _report_columns(data[2][1]["accuracy_report"])
epoch256, train_acc256, val_acc256, norm256, loss256 = _report_columns(data[3][1]["accuracy_report"])

# Only the size-16 figure uses a fixed y-range.
_plot_accuracy(16, epoch16, train_acc16, val_acc16, ylim=(0.55, 0.8))
_plot_accuracy(64, epoch64, train_acc64, val_acc64)
_plot_accuracy(128, epoch128, train_acc128, val_acc128)
_plot_accuracy(256, epoch256, train_acc256, val_acc256)

_plot_comparison('norm vs epoch', [
    (16, epoch16, norm16),
    (64, epoch64, norm64),
    (128, epoch128, norm128),
    (256, epoch256, norm256),
])
_plot_comparison('loss vs epoch', [
    (16, epoch16, loss16),
    (64, epoch64, loss64),
    (128, epoch128, loss128),
    (256, epoch256, loss256),
])
_plot_comparison('Vallidation Accuracy vs epoch', [
    (16, epoch16, val_acc16),
    (64, epoch64, val_acc64),
    (128, epoch128, val_acc128),
    (256, epoch256, val_acc256),
])

## Part 1.3 - Momentum

Here we explore how `momentum` affects model performance. We have set momentum to be 0.99, 0.96, 0.93, 0.9, 0.6, and 0.3. Finally, we also record how the model performs on the test dataset.

In [None]:
# Same settings as the baseline, varying only the momentum.
params = {
    "learning_rate": [2e-4],
    "epoch": [3e6],
    "mini_batch": [600],
    "momentum": [0.99, 0.96, 0.93, 0.9, 0.6, 0.3],
    "accuracy_record_num": [500]
}

best_param, results = get_best_model_parameter(
    params,
    LogisticRegression,
    training_data[:, :-1],
    training_data[:, -1],
    cross_validate,
    val_x=val_data[:, :-1],
    val_y=val_data[:, -1]
)

print(f'{chalk.bold("-" * 15 + "BEST PARAMETERS FOUND" + "-" * 15)}\n')
pp.pprint(best_param)
print(f'{chalk.bold("-" * 15 + "ALL COMBINATION DATA" + "-" * 15)}\n')
pp.pprint(results)

# Write results to a file; the context manager closes it even on error.
with open("./output/part1-3.json", 'w') as f:
    f.write(json.dumps(results, indent=4))

# ------------------------------------------------------
# These are used to record results on the test data set.
# ------------------------------------------------------
best_param, results = get_best_model_parameter(
    params,
    LogisticRegression,
    training_data[:, :-1],
    training_data[:, -1],
    cross_validate,
    val_x=test_data[:, :-1],
    val_y=test_data[:, -1]
)

# Write results to a file
with open("./output/part1-3-test.json", 'w') as f:
    f.write(json.dumps(results, indent=4))

[1m---------------START FINDING BEST PARAMETERS---------------[0m



100%|██████████| 6/6 [04:31<00:00, 45.21s/it]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                       (450000,
                        0.75,
                        0.74,
                        0.018622734979946955,
                        0.5023558713982966),
                       (456000,
                        0.75,
                        0.74,
                        0.01850734617404017,
                        0.501942275429805),
                       (462000,
                        0.75,
                        0.74,
                        0.01839283918577687,
                        0.5015337852726208),
                       (468000,
                        0.7647058823529411,
                        0.74,
                        0.018279205342968376,
                        0.5011303306012416),
                       (474000,
                        0.7647058823529411,
                        0.74,
                        0.018166436122426112,
                        0.50073184218685

### Graph plotting example

The code below is part of the actual graph-plotting code for Part 1.3.

In [None]:
# Load the saved Part 1.3 results. The context manager closes the handle,
# which the previous bare open() never did.
with open('./output/part1-3.json') as f:
    data = json.load(f)


def _plot_momentum_run(run, momentum_label):
    """
    Plot training and validation accuracy against epoch for one saved run.

    :param run: one entry of the loaded results; only run[1]["accuracy_report"]
        is read, and fields 0, 1 and 2 of every record are used as epoch,
        training accuracy and validation accuracy respectively
    :param momentum_label: momentum value as it should appear in the title
    :return: the (epochs, training accuracies, validation accuracies) lists
    """
    epochs, train_acc, val_acc = [], [], []
    for record in run[1]["accuracy_report"]:
        epochs.append(record[0])
        train_acc.append(record[1])
        val_acc.append(record[2])

    plt.plot(epochs, train_acc, label = "Training accuracy")
    plt.plot(epochs, val_acc, label = "Vallidation accuracy")
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.title(f'accuracy vs epoch for full batch with a {momentum_label} momentum')
    plt.legend()
    plt.show()

    return epochs, train_acc, val_acc


# One figure per momentum value. data[k] is taken in the order the momentum
# values were listed in the search grid -- NOTE(review): confirm that
# get_best_model_parameter preserves that order.
epoch99, train_acc99, val_acc99 = _plot_momentum_run(data[0], '0.99')
epoch96, train_acc96, val_acc96 = _plot_momentum_run(data[1], '0.96')
epoch93, train_acc93, val_acc93 = _plot_momentum_run(data[2], '0.93')
epoch90, train_acc90, val_acc90 = _plot_momentum_run(data[3], '0.90')
epoch60, train_acc60, val_acc60 = _plot_momentum_run(data[4], '0.60')
epoch30, train_acc30, val_acc30 = _plot_momentum_run(data[5], '0.30')

## Part 1.4 - Momentum with small, large and full batch

In this part, we repeat the previous steps for the minimum and maximum batch size we have covered in Part 1.2, which are 16 and 256. Since we have already recorded the performance of full-batch models with momentum in the previous step, we do not include them here.

Similarly, we have found that the mini-batch models do not converge no matter the choice of momentum. Hence, we do not apply these models on the test dataset.

---

#### NOTICE

Notice that this part takes a significant amount of time to run.

In [None]:
# Search grid for Part 1.4: two mini-batch sizes crossed with six momentum
# values; the remaining settings have a single candidate each.
params = {
    "learning_rate": [2e-4],
    "epoch": [3e6],
    "mini_batch": [16,256],
    "momentum": [0.99, 0.96, 0.93, 0.9, 0.6, 0.3],
    "accuracy_record_num": [500]
}

# The validation split is supplied as val_x / val_y.
best_param, results = get_best_model_parameter(
    params,
    LogisticRegression,
    training_data[:, :-1],
    training_data[:, -1],
    cross_validate,
    val_x=val_data[:, :-1],
    val_y=val_data[:, -1]
)

print(f'{chalk.bold("-" * 15 + "BEST PARAMETERS FOUND" + "-" * 15)}\n')
pp.pprint(best_param)
print(f'{chalk.bold("-" * 15 + "ALL COMBINATION DATA" + "-" * 15)}\n')
pp.pprint(results)

# Write results to a file. The context manager guarantees the handle is
# closed even if serialising `results` raises.
with open("./output/part1-4.json", 'w') as f:
    f.write(json.dumps(results, indent=4))

### Graph plotting example

The code below is part of the actual graph-plotting code for Part 1.4.

In [None]:
# Load the Part 1.4 results and the Part 1.2 results (used as the
# no-momentum baseline). Both handles are now closed by the context managers;
# previously neither file was ever closed.
with open('./output/part1-4.json') as f:
    data = json.load(f)
with open('./output/part1-2.json') as f2:
    data2 = json.load(f2)


def _validation_curve(run):
    """
    Extract the validation-accuracy curve of one saved run.

    :param run: one entry of a loaded results list; only
        run[1]["accuracy_report"] is read, and fields 0 and 2 of every record
        are used as epoch and validation accuracy respectively
    :return: the (epochs, validation accuracies) lists
    """
    epochs, val_acc = [], []
    for record in run[1]["accuracy_report"]:
        epochs.append(record[0])
        val_acc.append(record[2])
    return epochs, val_acc


def _plot_against_baseline(base_epochs, base_val_acc, epochs, val_acc, momentum_label):
    """
    Draw one figure comparing a momentum run with the no-momentum baseline.

    :param base_epochs: epochs of the baseline curve
    :param base_val_acc: validation accuracies of the baseline curve
    :param epochs: epochs of the momentum run
    :param val_acc: validation accuracies of the momentum run
    :param momentum_label: momentum value as it should appear in the legend
    """
    plt.plot(base_epochs, base_val_acc, label = "size 256 Vallidation accuracy with no momentum")
    plt.plot(epochs, val_acc, label = f"size 256 Vallidation accuracy with a {momentum_label} momentum")
    plt.ylim(0.55, 0.8)
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.title('accuracy vs epoch for the model with a minibatch size 256')
    plt.legend()
    plt.show()


# Entries 6-11 are read as the batch-size-256 runs, one per momentum value.
# NOTE(review): this presumes the grid enumerates mini_batch before momentum;
# confirm against get_best_model_parameter.
epoch256_99, val_acc256_99 = _validation_curve(data[6])
epoch256_96, val_acc256_96 = _validation_curve(data[7])
epoch256_93, val_acc256_93 = _validation_curve(data[8])
epoch256_90, val_acc256_90 = _validation_curve(data[9])
epoch256_60, val_acc256_60 = _validation_curve(data[10])
epoch256_30, val_acc256_30 = _validation_curve(data[11])
# Baseline without momentum, taken from entry 3 of the Part 1.2 results.
epoch256_no, val_acc256_no = _validation_curve(data2[3])

_plot_against_baseline(epoch256_no, val_acc256_no, epoch256_99, val_acc256_99, '0.99')
_plot_against_baseline(epoch256_no, val_acc256_no, epoch256_96, val_acc256_96, '0.96')
_plot_against_baseline(epoch256_no, val_acc256_no, epoch256_93, val_acc256_93, '0.93')
_plot_against_baseline(epoch256_no, val_acc256_no, epoch256_90, val_acc256_90, '0.90')
_plot_against_baseline(epoch256_no, val_acc256_no, epoch256_60, val_acc256_60, '0.60')
_plot_against_baseline(epoch256_no, val_acc256_no, epoch256_30, val_acc256_30, '0.30')

# **Part 2**

## Import sklearn model and nltk

In [None]:
import json
from typing import Dict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from tqdm import tqdm
import nltk

nltk.download('wordnet')
from nltk.stem import SnowballStemmer
import numpy as np
from simple_chalk import chalk

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Preprocess Data Functions

In [None]:
def preprocess_get_vectorizer_with_stemming(
        corpus: np.ndarray,
        params: Dict = None
) -> CountVectorizer:
    """
    Build an unfitted CountVectorizer whose features are Snowball-stemmed.

    A plain CountVectorizer configured with ``params`` supplies the base
    analyzer; every feature string that analyzer yields is then passed
    through the English Snowball stemmer. Multi-word n-grams are handed to
    the stemmer as a single string.

    :param corpus: training array. It is no longer read: the analyzer depends
        only on ``params``, so the earlier throwaway ``fit`` over the corpus
        was removed. The parameter is kept so existing callers still work.
    :param params: keyword arguments forwarded to CountVectorizer; ``None``
        (the default) means "use CountVectorizer's defaults". It must not
        contain an ``analyzer`` key, because one is supplied here.
    :return: a new, unfitted CountVectorizer using the stemming analyzer
    """
    # The declared default is None, which cannot be unpacked with `**`.
    params = {} if params is None else params

    stemming_helper = SnowballStemmer('english')
    # build_analyzer() only needs the constructor settings, not a fitted
    # vocabulary, so no fit is required to obtain it.
    base_analyzer = CountVectorizer(**params).build_analyzer()

    def stemmed_analyzer(document):
        # Lazily stem each feature produced by the base analyzer.
        return (stemming_helper.stem(feature) for feature in base_analyzer(document))

    return CountVectorizer(
        **params,
        analyzer=stemmed_analyzer,
    )

## Import Datasets

In [None]:
# Load the three fake-news splits as numpy arrays. The training split must be
# bound to `training_data` (it was misspelled `wtraining_data`), because that
# is the name every later Part 2 cell reads.
training_data = read_csv("sample_data/fake_news/fake_news_train.csv").to_numpy()
test_data = read_csv("sample_data/fake_news/fake_news_test.csv").to_numpy()
val_data = read_csv("sample_data/fake_news/fake_news_val.csv").to_numpy()

[1;92mThe data set has successfully been loaded.[0m
[1mPATH: [0m sample_data/fake_news/fake_news_train.csv
[1m---------------PRINTING DATA PREVIEW---------------[0m
                                                text  label
0  Indian fruit is so important to so many people...      0
1  FORT WORTH, Texas — Urú Inc. will hold a confe...      0
2  With three of the four new carriers, the Niger...      0
3  Let's start with the classic annual dividend r...      0
4  Following are some of the major events to have...      1

[1;92mThe data set has successfully been loaded.[0m
[1mPATH: [0m sample_data/fake_news/fake_news_test.csv
[1m---------------PRINTING DATA PREVIEW---------------[0m
                                                text  label
0  Is it possible the President’s pressure campai...      0
1  Pennsylvania is the place to go for farmers ma...      1
2  While he is frantically putting together a pro...      0
3  The one-man industry, Yasir Akhtar, is all set...      

## Logistic Regression Model from Sklearn

---
#### NOTICE

Note that the code below takes a long time to run, approximately 2 hours.

### Testing various feature extractors

We first experiment with various preprocessing options and determine which works the best.

In [None]:
from itertools import product

# Every combination of stemming x binary x lowercase x ngram_range, without
# Tfidf and with a fixed classifier. product() varies its last argument
# fastest, so the 16 configurations come out in the same order as the
# hand-written list this replaces.
params = [
    {
        "stemming": stemming,
        "use_tfidf": False,
        "vectorizer_params": {
            "binary": binary,
            "lowercase": lowercase,
            "ngram_range": ngram_range
        },
        "tfidf_params": {},
        "clf_params": {
            "max_iter": 1000,
            "solver": "sag"
        }
    }
    for stemming, binary, lowercase, ngram_range in product(
        (False, True),
        (False, True),
        (True, False),
        ((1, 1), (1, 2)),
    )
]

# Running results
results = []

for param_combination in tqdm(params):
    use_stemming = param_combination['stemming']
    use_tfidf = param_combination['use_tfidf']
    tfidf_params = param_combination['tfidf_params']
    vectorizer_params = param_combination['vectorizer_params']
    clf_params = param_combination['clf_params']

    if use_stemming:
        vectorizer = preprocess_get_vectorizer_with_stemming(training_data, vectorizer_params)
    else:
        vectorizer = CountVectorizer(**vectorizer_params)

    # Vectorizer first, classifier last; the Tfidf step sits between them
    # only when the configuration asks for it.
    steps = [("vect", vectorizer)]
    if use_tfidf:
        steps.append(("tfidf", TfidfTransformer(**tfidf_params)))
    steps.append(("clf", LogisticRegression(**clf_params)))
    text_clf = Pipeline(steps)

    # Column 0 is used as the text, column 1 as the label.
    text_clf.fit(training_data[:, 0], training_data[:, 1].astype(int))

    # Score the fitted pipeline on both held-out splits.
    result = {}
    result["params"] = param_combination
    result["val"] = classification_report(
        val_data[:, -1].astype(int),
        text_clf.predict(val_data[:, 0]).astype(int),
        output_dict=True,
        zero_division=0
    )
    result["test"] = classification_report(
        test_data[:, -1].astype(int),
        text_clf.predict(test_data[:, 0]).astype(int),
        output_dict=True,
        zero_division=0
    )

    results.append(result)

# The context manager closes the file even if serialisation raises.
with open("./output/part2-pre.json", 'w') as f:
    f.write(json.dumps(results, indent=4))

100%|██████████| 16/16 [2:09:21<00:00, 485.12s/it]


### The effects of Tfidf

In [None]:
from itertools import product

# Tfidf study: the vectorizer and classifier settings are fixed and only the
# two Tfidf switches vary. product() varies its last argument fastest, which
# reproduces the order of the hand-written list this replaces.
params = [
    {
        "stemming": False,
        "use_tfidf": True,
        "vectorizer_params": {
            "binary": True,
            "lowercase": False,
            "ngram_range": (1, 2)
        },
        "tfidf_params": {
            "smooth_idf": smooth_idf,
            "sublinear_tf": sublinear_tf
        },
        "clf_params": {
            "max_iter": 1000,
            "solver": "sag"
        }
    }
    for smooth_idf, sublinear_tf in product((True, False), (False, True))
]

# Running results
results = []

for param_combination in tqdm(params):
    use_stemming = param_combination['stemming']
    use_tfidf = param_combination['use_tfidf']
    tfidf_params = param_combination['tfidf_params']
    vectorizer_params = param_combination['vectorizer_params']
    clf_params = param_combination['clf_params']

    if use_stemming:
        vectorizer = preprocess_get_vectorizer_with_stemming(training_data, vectorizer_params)
    else:
        vectorizer = CountVectorizer(**vectorizer_params)

    # Vectorizer first, classifier last; the Tfidf step sits between them
    # only when the configuration asks for it.
    steps = [("vect", vectorizer)]
    if use_tfidf:
        steps.append(("tfidf", TfidfTransformer(**tfidf_params)))
    steps.append(("clf", LogisticRegression(**clf_params)))
    text_clf = Pipeline(steps)

    # Column 0 is used as the text, column 1 as the label.
    text_clf.fit(training_data[:, 0], training_data[:, 1].astype(int))

    # Score the fitted pipeline on both held-out splits.
    result = {}
    result["params"] = param_combination
    result["val"] = classification_report(
        val_data[:, -1].astype(int),
        text_clf.predict(val_data[:, 0]).astype(int),
        output_dict=True,
        zero_division=0
    )
    result["test"] = classification_report(
        test_data[:, -1].astype(int),
        text_clf.predict(test_data[:, 0]).astype(int),
        output_dict=True,
        zero_division=0
    )

    results.append(result)

# The context manager closes the file even if serialisation raises.
with open("./output/part2-tfidf.json", 'w') as f:
    f.write(json.dumps(results, indent=4))

100%|██████████| 4/4 [04:46<00:00, 71.56s/it]


### Tuning hyper parameters for Logistic Regression

In [None]:
# Classifier study: the feature extraction is fixed and only `class_weight`
# varies (None first, then "balanced", as in the hand-written list this
# replaces).
params = [
    {
        "stemming": False,
        "use_tfidf": False,
        "vectorizer_params": {
            "binary": True,
            "lowercase": False,
            "ngram_range": (1, 2)
        },
        "tfidf_params": {},
        "clf_params": {
            "max_iter": 1000,
            "solver": "sag",
            "penalty": "l2",
            "class_weight": class_weight
        }
    }
    for class_weight in (None, "balanced")
]

# Running results
results = []

for param_combination in tqdm(params):
    use_stemming = param_combination['stemming']
    use_tfidf = param_combination['use_tfidf']
    tfidf_params = param_combination['tfidf_params']
    vectorizer_params = param_combination['vectorizer_params']
    clf_params = param_combination['clf_params']

    if use_stemming:
        vectorizer = preprocess_get_vectorizer_with_stemming(training_data, vectorizer_params)
    else:
        vectorizer = CountVectorizer(**vectorizer_params)

    # Vectorizer first, classifier last; the Tfidf step sits between them
    # only when the configuration asks for it.
    steps = [("vect", vectorizer)]
    if use_tfidf:
        steps.append(("tfidf", TfidfTransformer(**tfidf_params)))
    steps.append(("clf", LogisticRegression(**clf_params)))
    text_clf = Pipeline(steps)

    # Column 0 is used as the text, column 1 as the label.
    text_clf.fit(training_data[:, 0], training_data[:, 1].astype(int))

    # Score the fitted pipeline on both held-out splits.
    result = {}
    result["params"] = param_combination
    result["val"] = classification_report(
        val_data[:, -1].astype(int),
        text_clf.predict(val_data[:, 0]).astype(int),
        output_dict=True,
        zero_division=0
    )
    result["test"] = classification_report(
        test_data[:, -1].astype(int),
        text_clf.predict(test_data[:, 0]).astype(int),
        output_dict=True,
        zero_division=0
    )

    results.append(result)

# The context manager closes the file even if serialisation raises.
with open("./output/part2-model.json", 'w') as f:
    f.write(json.dumps(results, indent=4))

100%|██████████| 2/2 [15:17<00:00, 458.53s/it]
