# Logistic Regression

### Import necessary packages

In [1]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
import pandas as pd

import mlflow
import logging 
import config 

from basic_functions import (
    get_preprocess_data,
    get_lemmatized_data,
    get_metrics
)


  from .autonotebook import tqdm as notebook_tqdm


### Setup

In [2]:
MODEL_NAME = "logistic_regression" 
TRACKING_URI = open("../.mlflow_uri").read().strip()
EXPERIMENT_NAME = config.EXPERIMENT_NAME

logging.basicConfig(format="%(asctime)s: %(message)s") # Configure logging format to show timestamp before every message

logger = logging.getLogger()
logger.setLevel(logging.INFO) # Only show logs that are INFO or more important (e.g., WARNING, ERROR) — but ignore DEBUG.

In [3]:
DATA_PATH = "../data/data_dropped_duplicates_small.csv"


In [4]:
# mlflow.set_tracking_uri(TRACKING_URI)
# mlflow.set_experiment(EXPERIMENT_NAME)
# mlflow.start_run()
# run = mlflow.active_run()
# mlflow.set_tag("model_name", MODEL_NAME)
# mlflow.set_tag("mlflow.runName", "logistic_regression")
# mlflow.log_params(params)

### Get and process data

In [5]:
df = get_preprocess_data(DATA_PATH)

In [6]:
def binary_classification(x):
    if x == 'none':
        return 'none'
    else:
        return 'fallacy'

In [7]:
df_binary = df.copy()
df_binary['two_class_target'] = df_binary['logical_fallacies'].apply(binary_classification)

In [8]:
df_multi_class = df.copy()
df_multi_class = df_multi_class[df_multi_class["logical_fallacies"] != 'none']

In [9]:
# nltk.download('wordnet') commented because downloaded once.

### Lemmatize text

In [10]:
df_binary = get_lemmatized_data(df_binary)
df_multi_class = get_lemmatized_data(df_multi_class)

### Train-test split

In [11]:
y_binary = df_binary[["two_class_target", "logical_fallacies"]]
X_binary = df_binary["text"]

X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_binary, y_binary, test_size=0.30, random_state=42, stratify=y_binary)

In [12]:
fa_train = y_train_b["logical_fallacies"]
y_train_b = y_train_b["two_class_target"]

In [13]:
fa_test = y_test_b["logical_fallacies"]
y_test_b = y_test_b["two_class_target"]

In [14]:
y_multi = df_multi_class["logical_fallacies"]
X_multi = df_multi_class["text"]

X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_multi, y_multi, test_size=0.30, random_state=42, stratify=y_multi)

### Logistic Regression

#### thoughts for the moment

- pipeline one: binary first, classification next 
- pipeline 2: classification from the get-go
- add stemming? Not now, since we have the lemmatization
- add bag of words? not now
- For a multi_class problem, 
    - multi_class = “multinomial” 
- class weigh = balanced

In [15]:
# Use TF-IDF Vecorizer to transform text into numerical data
tfidf_vectorizer = TfidfVectorizer()
X_vectorized_b = tfidf_vectorizer.fit_transform(X_train_b)
X_vectorized_test_b = tfidf_vectorizer.transform(X_test_b)

In [16]:
# for the multi calss
tfidf_vectorizer_m = TfidfVectorizer()
X_vectorized_m = tfidf_vectorizer_m.fit_transform(X_train_m)
X_vectorized_test_m = tfidf_vectorizer_m.transform(X_test_m)

### Initialize the model

#### Binary classification

In [17]:
bin_model = LogisticRegression(
    penalty='l2',  # most of the solvers only work with l2
    class_weight= "balanced", 
    random_state=42, 
    verbose=0, 
    n_jobs=-1, 
    )

In [18]:
# Grid search for binary classification
param_grid_bin = {
'C': [0.1, 1, 10, 100],
'max_iter': [100, 500, 1000],
'solver': ['lbfgs', 'newton-cg', 'sag', 'saga', 'liblinear']
}

In [19]:
grid_search_b = GridSearchCV(bin_model, param_grid_bin, cv=5)
grid_search_b.fit(X_vectorized_b, y_train_b)

best_params_b = grid_search_b.best_params_
print(best_params_b)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 10, 'max_iter': 100, 'solver': 'lbfgs'}


In [20]:

# Predict on train and test data
best_model_b = grid_search_b.best_estimator_
y_train_pred_b = best_model_b.predict(X_vectorized_b)
y_test_pred_b = best_model_b.predict(X_vectorized_test_b)

# mlflow.log_params(best_params_b)

#### Multi-class classification

In [21]:
multi_model = LogisticRegression(
    penalty='l2',  # most of the solvers only work with l2
    class_weight= "balanced", 
    random_state=42, 
    multi_class='multinomial', 
    verbose=0, 
    n_jobs=-1, 
    )

In [22]:

# Grid search
param_grid_multi = {
'C': [0.1, 1, 10, 100],
'max_iter': [100, 500, 1000],
'solver': ['lbfgs', 'newton-cg', 'sag', 'saga']
}

In [23]:
grid_search_m = GridSearchCV(multi_model, param_grid_multi, cv=5)
grid_search_m.fit(X_vectorized_m, y_train_m)

best_params_m = grid_search_m.best_params_
print(best_params_m)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 10, 'max_iter': 500, 'solver': 'saga'}


In [24]:

# Predict on train and test data
best_model_m = grid_search_m.best_estimator_
y_train_pred_m = best_model_m.predict(X_vectorized_m)

# mlflow.log_params(best_params_b)

In [25]:
# # Save model to pickle file:
# with open('../models/svm/svm_model.pkl', 'wb') as f:
#     pickle.dump(best_model, f)

#### Two-step approach

In [26]:
df_pred =  pd.DataFrame({"text": X_test_b, "logical_fallacies": fa_test , "two_class_target": y_test_b, "binary_prediction": y_test_pred_b})
df_pred.head()

Unnamed: 0,text,logical_fallacies,two_class_target,binary_prediction
15245,: the product regulatory authority(sahpra) say...,none,none,none
2656,this restaurant is terrible; i had a bad exper...,faulty_generalization,fallacy,fallacy
18793,it’s a rigged election.,appeal_to_emotion,fallacy,fallacy
13982,": my appeal to the international community, th...",none,none,none
11859,"i am not sure hrt could be considered a ""simpl...",none,none,none


In [27]:
df_pred = df_pred[df_pred["binary_prediction"] != "none"]

In [28]:
X_df_pred = tfidf_vectorizer_m.transform(df_pred["text"])
X_df_pred

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 16280 stored elements and shape (764, 6920)>

In [29]:
y_test_pred_m = best_model_m.predict(X_df_pred)


### Evaluation


In [30]:
# def log_metrics(cr, split):
#     for key, value in cr.items():
#         if (key == "accuracy"):
#                 # print(f"{split}_{key}", round(value,2))
#                 mlflow.log_metric(f"{split}_{key}", value)
#         else:
#             for metric in value:
#                 mlflow.log_metric(f"{split}_{key}_{metric}", value.get(metric))
#                 # print(f"{split}_{key}_{metric}", round(value.get(metri

In [31]:
logger.info('get test metrics for the two step approach')
classification_report_test = get_metrics(df_pred["logical_fallacies"], y_test_pred_m)
# log_metrics(classification_report_test, "test")

INFO:root:get test metrics for the two step approach
INFO:basic_functions:classification_report
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:basic_functions:confusion_matrix


                       precision    recall  f1-score   support

           ad_hominem       0.52      0.64      0.57       120
  appeal_to_authority       0.42      0.51      0.46        61
    appeal_to_emotion       0.50      0.62      0.55       175
        false_dilemma       0.57      0.78      0.66        95
faulty_generalization       0.43      0.61      0.51       137
                 none       0.00      0.00      0.00       176

             accuracy                           0.49       764
            macro avg       0.41      0.53      0.46       764
         weighted avg       0.38      0.49      0.43       764

[[ 77   4  28   3   8   0]
 [  5  31  11   3  11   0]
 [ 24  13 108   7  23   0]
 [  4   1   4  74  12   0]
 [ 12   7  24  10  84   0]
 [ 27  17  43  32  57   0]]


In [32]:
logger.info('get train metrics for binary classification')
classification_report_b_train = get_metrics(y_train_b, y_train_pred_b)
# log_metrics(classification_report_b_train, "train")

INFO:root:get train metrics for binary classification
INFO:basic_functions:classification_report


              precision    recall  f1-score   support

     fallacy       0.99      0.98      0.98      1830
        none       0.97      0.99      0.98      1670

    accuracy                           0.98      3500
   macro avg       0.98      0.98      0.98      3500
weighted avg       0.98      0.98      0.98      3500


INFO:basic_functions:confusion_matrix



[[1787   43]
 [  12 1658]]


In [33]:
logger.info('get train metrics for multi classification')
classification_report_m_train = get_metrics(y_train_m, y_train_pred_m)
# log_metrics(classification_report_m_train, "train")

INFO:root:get train metrics for multi classification
INFO:basic_functions:classification_report
INFO:basic_functions:confusion_matrix


                       precision    recall  f1-score   support

           ad_hominem       1.00      1.00      1.00       331
  appeal_to_authority       1.00      1.00      1.00       227
    appeal_to_emotion       1.00      1.00      1.00       504
        false_dilemma       1.00      1.00      1.00       319
faulty_generalization       1.00      1.00      1.00       448

             accuracy                           1.00      1829
            macro avg       1.00      1.00      1.00      1829
         weighted avg       1.00      1.00      1.00      1829

[[331   0   0   0   0]
 [  0 227   0   0   0]
 [  0   0 504   0   0]
 [  0   0   0 319   0]
 [  0   0   0   0 448]]


In [34]:
# mlflow.end_run()