# Logistic Regression

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Scikit Learn

In [2]:
sklearn_logistic = LogisticRegression()

In [3]:
sklearn_logistic.fit([
  [0.1, 0.2],
  [0.3, 0.4],
  [0.5, 0.6],
], [1, 0, 1])

In [4]:
sklearn_logistic.coef_

array([[-8.94608035e-06,  7.34661849e-06]])

In [5]:
sklearn_logistic.intercept_

array([0.69314773])

## Solving for Predictions

$ f(x) = \frac{1}{1+e^{-x}} $

### Output

In [6]:
import joblib

trained_lr = joblib.load('./models/model_lr/best/lr.pkl')

trained_lr

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [7]:
trained_lr.coef_

array([[1.70518487, 3.24559592, 3.23285169]])

In [8]:
trained_lr.intercept_

array([-4.65501405])

In [9]:
trained_output = [
  [0.1, 0.2, 0.3],
  [0.3, 0.4, 0.5],
  [0.5, 0.6, 0.7],
]

In [10]:
trained_lr_output = trained_lr.predict_proba(trained_output)

trained_lr_output

array([[0.94611545, 0.05388455],
       [0.77360749, 0.22639251],
       [0.39940806, 0.60059194]])

### Solving

In [11]:
solving_linear_combination = trained_lr.coef_[0][0] * trained_output[0][0] \
  + trained_lr.coef_[0][1] * trained_output[0][1] \
  + trained_lr.coef_[0][2] * trained_output[0][2] \
  + trained_lr.intercept_

solving_linear_combination

array([-2.86552087])

In [12]:
positive_output = 1 / (1 + np.exp(-solving_linear_combination))

positive_output

array([0.05388455])

In [13]:
negative_output = 1 - positive_output

negative_output

array([0.94611545])

In [14]:
np.concatenate([negative_output, positive_output])

array([0.94611545, 0.05388455])

In [15]:
trained_lr_output[0]

array([0.94611545, 0.05388455])

## Testing Stuff

In [16]:
concatenated_learner_preds = joblib.load('./models/predictions')

In [17]:
concatenated_learner_preds

array([[[9.99999999e-01],
        [3.16572526e-05],
        [4.73412315e-01],
        ...,
        [9.86733107e-01],
        [1.67257632e-05],
        [9.99886411e-01]],

       [[9.90881920e-01],
        [2.38863844e-03],
        [1.58231616e-01],
        ...,
        [8.29080284e-01],
        [3.19831399e-03],
        [9.69492972e-01]],

       [[9.94101822e-01],
        [2.08841115e-02],
        [6.82193711e-02],
        ...,
        [7.42848635e-01],
        [2.01400183e-02],
        [9.63838995e-01]]])

In [18]:
concatenated_learner_preds.shape

(3, 22769, 1)

In [19]:
concatenated_learner_test_preds = joblib.load('./models/test-preds')

In [20]:
concatenated_learner_test_preds

array([[[9.98147245e-01, 1.85275489e-03],
        [9.96582061e-01, 3.41793926e-03],
        [9.99917052e-01, 8.29482744e-05],
        ...,
        [7.57227710e-01, 2.42772290e-01],
        [4.02596080e-02, 9.59740392e-01],
        [9.96524714e-01, 3.47528557e-03]],

       [[5.05567789e-01, 4.94432271e-01],
        [9.56122428e-02, 9.04387772e-01],
        [9.84410524e-01, 1.55894412e-02],
        ...,
        [6.18802786e-01, 3.81197274e-01],
        [1.44663244e-01, 8.55336726e-01],
        [9.95531023e-01, 4.46896069e-03]],

       [[2.75914729e-01, 7.24085271e-01],
        [2.86573887e-01, 7.13426113e-01],
        [9.70233858e-01, 2.97661163e-02],
        ...,
        [5.19174397e-01, 4.80825573e-01],
        [1.09437846e-01, 8.90562117e-01],
        [9.82982695e-01, 1.70173123e-02]]])

In [21]:
concatenated_learner_test_preds.shape

(3, 5692, 2)

## Dataset

In [22]:
import torch
import math

random_number_generator = np.random.default_rng(seed=0)
def shuffle_data_frame(data_frame):
    text = list(data_frame['text'])
    label = list(data_frame['label'])

    assert(len(text) == len(label))

    indices = list(range(len(label)))

    # Make a random number generator that will shuffle list of indices
    # It is seeded to be reproducible
    random_number_generator.shuffle(indices)

    shuffled_text = []
    shuffled_labels = []

    # Iterate through the list of indices and add the original data
    # from those shuffled indices
    for index in indices:
        shuffled_text.append(text[index])
        shuffled_labels.append(label[index])

    return pd.DataFrame({
        'text': shuffled_text,
        'label': shuffled_labels,
    })


def get_train_test_split(data_frame: pd.DataFrame, test_size: float):
    """
    Makes a stratified train test split.
    This aims to preserve the distribution between classes.
    """
    if not (1 > test_size > 0):
        print('ERROR: test_size must be between 0 and 1')
        return

    data_frame = shuffle_data_frame(data_frame)

    data_frame_length = len(data_frame)
    train_size = 1 - test_size

    nonhate_rows = data_frame[data_frame['label'] == 0]
    nonhate_row_length = len(nonhate_rows)

    nonhate_row_train_size = math.ceil(nonhate_row_length * train_size)

    nonhate_row_train = nonhate_rows[0:nonhate_row_train_size]
    nonhate_row_test = nonhate_rows[nonhate_row_train_size:nonhate_row_length]

    assert(len(nonhate_row_train) + len(nonhate_row_test) == nonhate_row_length)

    hate_rows = data_frame[data_frame['label'] == 1]
    hate_row_length = len(hate_rows)

    hate_row_train_size = math.ceil(hate_row_length * train_size)

    hate_row_train = hate_rows[0:hate_row_train_size]
    hate_row_test = hate_rows[hate_row_train_size:hate_row_length]

    assert(len(hate_row_train) + len(hate_row_test) == hate_row_length)

    combined_train = pd.concat([nonhate_row_train, hate_row_train])
    combined_test = pd.concat([nonhate_row_test, hate_row_test])

    assert(len(combined_train) + len(combined_test) == data_frame_length)

    shuffled_train = shuffle_data_frame(combined_train)
    shuffled_test = shuffle_data_frame(combined_test)

    assert(len(shuffled_train) + len(shuffled_test) == data_frame_length)

    return (
        shuffled_train['text'],
        shuffled_test['text'],
        shuffled_train['label'],
        shuffled_test['label'],
    )

def read_csv_file(filename: str) -> pd.DataFrame:
    try:
        data = pd.read_csv(filename, lineterminator='\n', usecols=range(2))
        print("CSV file read successfully!")
        return data
    except FileNotFoundError:
        print("ERROR: File not found")
        exit(1)

In [23]:
dataset = read_csv_file('datasets/datasetall.csv')

dataset

CSV file read successfully!


Unnamed: 0,text,label
0,Binay: Patuloy ang kahirapan dahil sa maling p...,0
1,SA GOBYERNONG TAPAT WELCOME SA BAGUO ANG LAHAT...,0
2,wait so ur telling me Let Leni Lead mo pero NY...,1
3,[USERNAME]wish this is just a nightmare that ...,0
4,doc willie ong and isko sabunutan po,0
...,...,...
28456,"Bisaya, Probinsyano/a, mostly Bisaya = katulong",1
28457,Amnesia. In my whole life wala pa ako nakasala...,1
28458,Kontrabida na ilang beses na tinalo at obvious...,1
28459,Yung antagonist laging kailangang sobrang sama...,1


In [24]:
TEST_SIZE = 0.2

X_train, X_test, y_train, y_test = get_train_test_split(dataset, TEST_SIZE)

In [25]:
X_train

0        Matthew Chang [USERNAME] Remind ko lang di ba ...
1        Yay! The interview served its purpose wellJess...
2                                             I say DASURV
3                                TayNew said Let Leni Lead
4        Gloc 9 is not endorsing Jejomar Binay as his p...
                               ...                        
22764    Nov. 11: on [USERNAME] saw tv ads of Jojo Bina...
22765    Mar Roxas your call for unity describes one th...
22766    Buti nalang nagdecide nakong hindi manood ng T...
22767    sang boto para sa pagbabago. Let Leni Lead phi...
22768               Nakakainit ng dugo yung tv ad ni Binay
Name: text, Length: 22769, dtype: object

In [26]:
y_train

0        1
1        0
2        0
3        0
4        0
        ..
22764    1
22765    1
22766    0
22767    0
22768    1
Name: label, Length: 22769, dtype: int64

## Custom LR

In [27]:
# Initialize starting parameters
# We want to optimize these to best fit the learner preds
COEFFICIENTS = [0, 0, 0]
INTERCEPT = 0

LEARNING_RATE = 0.0001

In [28]:
def get_linear_combination(X_transposed):
  return INTERCEPT \
    + COEFFICIENTS[0] * X_transposed[:, 0] \
    + COEFFICIENTS[1] * X_transposed[:, 1] \
    + COEFFICIENTS[2] * X_transposed[:, 2]

def calculate_gradients(X, y):
  X_transposed = X.T[0]

  numerator = np.exp(get_linear_combination(X_transposed))

  p = numerator / (1 + numerator)

  partial_derivative_intercept = np.sum(y - p)
  partial_derivative_0 = np.sum((y - p) * X_transposed[:, 0])
  partial_derivative_1 = np.sum((y - p) * X_transposed[:, 1])
  partial_derivative_2 = np.sum((y - p) * X_transposed[:, 2])

  return np.array([
    partial_derivative_intercept,
    partial_derivative_0,
    partial_derivative_1,
    partial_derivative_2,
  ])

def logistic_regression(X):
  X_transposed = X.T[0]

  linear_combination = get_linear_combination(X_transposed)

  return 1 / (1 + np.exp(-linear_combination))
 

In [29]:
best_accuracy = 0
accuracy_values = []

epoch = 0
epochs_without_improvement = 0
epoch_threshold = 1000

while epochs_without_improvement < epoch_threshold:
  epoch += 1

  grads = calculate_gradients(concatenated_learner_preds, y_train)
  diff = abs(grads).sum()
  INTERCEPT += LEARNING_RATE * grads[0]
  COEFFICIENTS += LEARNING_RATE * grads[1:]

  # Evaluate on test
  test_output = logistic_regression(concatenated_learner_test_preds)

  discrete_test_output = np.round(test_output)

  test_accuracy = accuracy_score(y_test, discrete_test_output)
  accuracy_values.append(test_accuracy)

  if not (epoch % 10):
    print(f"Epoch: {epoch} | Accuracy: {test_accuracy}")

  if test_accuracy > best_accuracy:
    best_accuracy = test_accuracy
  else:
    epochs_without_improvement += 1


Epoch: 10 | Accuracy: 0.14898102600140548
Epoch: 20 | Accuracy: 0.1500351370344343
Epoch: 30 | Accuracy: 0.151791988756149
Epoch: 40 | Accuracy: 0.15126493323963458


Epoch: 50 | Accuracy: 0.15126493323963458
Epoch: 60 | Accuracy: 0.1510892480674631
Epoch: 70 | Accuracy: 0.1505621925509487
Epoch: 80 | Accuracy: 0.15126493323963458
Epoch: 90 | Accuracy: 0.15073787772312017
Epoch: 100 | Accuracy: 0.1500351370344343
Epoch: 110 | Accuracy: 0.14968376669009137
Epoch: 120 | Accuracy: 0.14968376669009137
Epoch: 130 | Accuracy: 0.14898102600140548
Epoch: 140 | Accuracy: 0.14898102600140548
Epoch: 150 | Accuracy: 0.1493323963457484
Epoch: 160 | Accuracy: 0.14915671117357696
Epoch: 170 | Accuracy: 0.14862965565706254
Epoch: 180 | Accuracy: 0.14845397048489106
Epoch: 190 | Accuracy: 0.14862965565706254
Epoch: 200 | Accuracy: 0.14845397048489106
Epoch: 210 | Accuracy: 0.14792691496837668
Epoch: 220 | Accuracy: 0.14792691496837668
Epoch: 230 | Accuracy: 0.1477512297962052
Epoch: 240 | Accuracy: 0.1477512297962052
Epoch: 250 | Accuracy: 0.14757554462403374
Epoch: 260 | Accuracy: 0.14757554462403374
Epoch: 270 | Accuracy: 0.14757554462403374
Epoch: 280 | Accuracy:

In [30]:
epoch

1001

In [31]:
COEFFICIENTS

array([1.69853832, 3.26696485, 3.24849701])

In [32]:
INTERCEPT

-4.67271406863987

In [33]:
accuracy_values

[0.5040407589599438,
 0.1758608573436402,
 0.15987350667603653,
 0.15548137737174983,
 0.15126493323963458,
 0.1526704146170063,
 0.1505621925509487,
 0.1505621925509487,
 0.14968376669009137,
 0.14898102600140548,
 0.14862965565706254,
 0.14880534082923402,
 0.14845397048489106,
 0.1495080815179199,
 0.1495080815179199,
 0.1500351370344343,
 0.1493323963457484,
 0.1493323963457484,
 0.14985945186226282,
 0.1500351370344343,
 0.15021082220660575,
 0.15038650737877723,
 0.15038650737877723,
 0.15091356289529165,
 0.1510892480674631,
 0.15091356289529165,
 0.1516163035839775,
 0.1516163035839775,
 0.15144061841180603,
 0.151791988756149,
 0.1516163035839775,
 0.15196767392832045,
 0.15214335910049193,
 0.15214335910049193,
 0.15196767392832045,
 0.15214335910049193,
 0.15196767392832045,
 0.1516163035839775,
 0.1516163035839775,
 0.15126493323963458,
 0.15126493323963458,
 0.15126493323963458,
 0.15144061841180603,
 0.15126493323963458,
 0.15144061841180603,
 0.1516163035839775,
 0.15179