# Logistic Regression

### Import necessary packages

In [8]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
import pandas as pd

import mlflow
import logging 
import config 

from basic_functions import (
    get_preprocess_data,
    get_lemmatized_data,
    get_metrics
)


### Setup

In [9]:
MODEL_NAME = "logistic_regression" 
TRACKING_URI = open("../.mlflow_uri").read().strip()
EXPERIMENT_NAME = config.EXPERIMENT_NAME

logging.basicConfig(format="%(asctime)s: %(message)s") # Configure logging format to show timestamp before every message

logger = logging.getLogger()
logger.setLevel(logging.INFO) # Only show logs that are INFO or more important (e.g., WARNING, ERROR) — but ignore DEBUG.

In [10]:
DATA_PATH = "../data/data_dropped_duplicates_small.csv"


In [4]:
# mlflow.set_tracking_uri(TRACKING_URI)
# mlflow.set_experiment(EXPERIMENT_NAME)
# mlflow.start_run()
# run = mlflow.active_run()
# mlflow.set_tag("model_name", MODEL_NAME)
# mlflow.set_tag("mlflow.runName", "logistic_regression")
# mlflow.log_params(params)

## Two-Step-Approach

### Get and process data

In [15]:
df = get_preprocess_data(DATA_PATH)

In [16]:
def binary_classification(x):
    if x == 'none':
        return 'none'
    else:
        return 'fallacy'

In [17]:
df_binary = df.copy()
df_binary['two_class_target'] = df_binary['logical_fallacies'].apply(binary_classification)

In [18]:
df_multi_class = df.copy()
df_multi_class = df_multi_class[df_multi_class["logical_fallacies"] != 'none']

In [19]:
# nltk.download('wordnet') commented because downloaded once.

### Lemmatize text

In [20]:
df_binary = get_lemmatized_data(df_binary)
df_multi_class = get_lemmatized_data(df_multi_class)

### Train-test split

In [21]:
y_binary = df_binary[["two_class_target", "logical_fallacies"]]
X_binary = df_binary["text"]

X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_binary, y_binary, test_size=0.30, random_state=42, stratify=y_binary)

In [22]:
fa_train = y_train_b["logical_fallacies"]
y_train_b = y_train_b["two_class_target"]

In [23]:
fa_test = y_test_b["logical_fallacies"]
y_test_b = y_test_b["two_class_target"]

In [24]:
y_multi = df_multi_class["logical_fallacies"]
X_multi = df_multi_class["text"]

X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_multi, y_multi, test_size=0.30, random_state=42, stratify=y_multi)

### Logistic Regression

#### thoughts for the moment

- pipeline one: binary first, classification next 
- pipeline 2: classification from the get-go
- add stemming? Not now, since we have the lemmatization
- add bag of words? not now
- For a multi_class problem, 
    - multi_class = “multinomial” 
- class weigh = balanced

In [26]:
# Use TF-IDF Vecorizer to transform text into numerical data
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # Bigrams (or even trigrams)
    max_features=5000,   # Limit feature space to the most important words
    min_df=5,            # Consider words that appear in at least 5 documents
    max_df=0.9           # Ignore words that appear in more than 90% of documents
)
X_vectorized_b = tfidf_vectorizer.fit_transform(X_train_b)
X_vectorized_test_b = tfidf_vectorizer.transform(X_test_b)

In [27]:
# for the multi calss
tfidf_vectorizer_m = TfidfVectorizer(
    ngram_range=(1, 2),  # Bigrams (or even trigrams)
    max_features=5000,   # Limit feature space to the most important words
    min_df=5,            # Consider words that appear in at least 5 documents
    max_df=0.9           # Ignore words that appear in more than 90% of documents

)
X_vectorized_m = tfidf_vectorizer_m.fit_transform(X_train_m)
X_vectorized_test_m = tfidf_vectorizer_m.transform(X_test_m)

### Initialize the model

#### Binary classification

In [28]:
bin_model = LogisticRegression(
    penalty='l2',  # most of the solvers only work with l2
    random_state=42, 
    verbose=0, 
    n_jobs=-1, 
    )

In [29]:
# Grid search for binary classification
param_grid_bin = {
'C': [0.1, 1, 10, 100],
'max_iter': [100, 500, 1000],
'solver': ['lbfgs', 'newton-cg', 'sag', 'saga', 'liblinear']
}

In [30]:
grid_search_b = GridSearchCV(bin_model, param_grid_bin, cv=5)
grid_search_b.fit(X_vectorized_b, y_train_b)

best_params_b = grid_search_b.best_params_
print(best_params_b)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 1, 'max_iter': 100, 'solver': 'saga'}




In [31]:

# Predict on train and test data
best_model_b = grid_search_b.best_estimator_
y_train_pred_b = best_model_b.predict(X_vectorized_b)
y_test_pred_b = best_model_b.predict(X_vectorized_test_b)

# mlflow.log_params(best_params_b)

#### Multi-class classification

In [32]:
multi_model = LogisticRegression(
    penalty='l2',  # most of the solvers only work with l2
    class_weight= "balanced", 
    random_state=42, 
    multi_class='multinomial', 
    verbose=0, 
    n_jobs=-1, 
    )

In [33]:

# Grid search
param_grid_multi = {
'C': [0.1, 1, 10, 100],
'max_iter': [100, 500, 1000],
'solver': ['lbfgs', 'newton-cg', 'sag', 'saga']
}

In [34]:
grid_search_m = GridSearchCV(multi_model, param_grid_multi, cv=5)
grid_search_m.fit(X_vectorized_m, y_train_m)

best_params_m = grid_search_m.best_params_
print(best_params_m)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 10, 'max_iter': 100, 'solver': 'saga'}


In [35]:

# Predict on train and test data
best_model_m = grid_search_m.best_estimator_
y_train_pred_m = best_model_m.predict(X_vectorized_m)

# mlflow.log_params(best_params_b)

In [36]:
# # Save model to pickle file:
# with open('../models/svm/svm_model.pkl', 'wb') as f:
#     pickle.dump(best_model, f)

#### Two-step approach

In [37]:
df_pred =  pd.DataFrame({"text": X_test_b, "logical_fallacies": fa_test , "two_class_target": y_test_b, "binary_prediction": y_test_pred_b})
df_pred.head()

Unnamed: 0,text,logical_fallacies,two_class_target,binary_prediction
17769,nuclear power is too dangerous,none,none,fallacy
11994,how disturbing and sad this story truly is i k...,none,none,none
8523,i see the situation a more complex than that i...,none,none,none
9975,how doe the country just let so many dangerous...,faulty_generalization,fallacy,fallacy
19573,he twisted arm and got six democrat to vote th...,appeal_to_emotion,fallacy,fallacy


In [38]:
df_pred = df_pred[df_pred["binary_prediction"] != "none"]

In [39]:
X_df_pred = tfidf_vectorizer_m.transform(df_pred["text"])
X_df_pred

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 18048 stored elements and shape (763, 2426)>

In [40]:
y_test_pred_m = best_model_m.predict(X_df_pred)


### Evaluation


In [41]:
# def log_metrics(cr, split):
#     for key, value in cr.items():
#         if (key == "accuracy"):
#                 # print(f"{split}_{key}", round(value,2))
#                 mlflow.log_metric(f"{split}_{key}", value)
#         else:
#             for metric in value:
#                 mlflow.log_metric(f"{split}_{key}_{metric}", value.get(metric))
#                 # print(f"{split}_{key}_{metric}", round(value.get(metri

In [42]:
logger.info('get test metrics for the two step approach')
classification_report_test = get_metrics(df_pred["logical_fallacies"], y_test_pred_m)
# log_metrics(classification_report_test, "test")

INFO:root:get test metrics for the two step approach
INFO:basic_functions:classification_report
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:basic_functions:confusion_matrix


                       precision    recall  f1-score   support

           ad_hominem       0.50      0.63      0.56       128
  appeal_to_authority       0.38      0.54      0.44        56
    appeal_to_emotion       0.53      0.68      0.59       181
        false_dilemma       0.66      0.78      0.72        93
faulty_generalization       0.39      0.55      0.46       125
                 none       0.00      0.00      0.00       180

             accuracy                           0.49       763
            macro avg       0.41      0.53      0.46       763
         weighted avg       0.38      0.49      0.43       763

[[ 81  10  20   3  14   0]
 [  5  30  11   1   9   0]
 [ 20  10 123   3  25   0]
 [  4   1   8  73   7   0]
 [ 16   8  23   9  69   0]
 [ 35  21  48  22  54   0]]


In [43]:
logger.info('get train metrics for binary classification')
classification_report_b_train = get_metrics(y_train_b, y_train_pred_b)
# log_metrics(classification_report_b_train, "train")

INFO:root:get train metrics for binary classification
INFO:basic_functions:classification_report
INFO:basic_functions:confusion_matrix


              precision    recall  f1-score   support

     fallacy       0.88      0.87      0.87      1785
        none       0.86      0.88      0.87      1715

    accuracy                           0.87      3500
   macro avg       0.87      0.87      0.87      3500
weighted avg       0.87      0.87      0.87      3500

[[1548  237]
 [ 210 1505]]


In [44]:
logger.info('get train metrics for multi classification')
classification_report_m_train = get_metrics(y_train_m, y_train_pred_m)
# log_metrics(classification_report_m_train, "train")

INFO:root:get train metrics for multi classification
INFO:basic_functions:classification_report
INFO:basic_functions:confusion_matrix


                       precision    recall  f1-score   support

           ad_hominem       0.98      0.99      0.99       326
  appeal_to_authority       1.00      1.00      1.00       229
    appeal_to_emotion       1.00      0.99      0.99       486
        false_dilemma       1.00      1.00      1.00       299
faulty_generalization       1.00      1.00      1.00       445

             accuracy                           1.00      1785
            macro avg       1.00      1.00      1.00      1785
         weighted avg       1.00      1.00      1.00      1785

[[324   0   2   0   0]
 [  0 229   0   0   0]
 [  5   0 480   0   1]
 [  0   0   0 299   0]
 [  0   0   0   0 445]]


In [34]:
# mlflow.end_run()

## One step approach


In [11]:
df = get_preprocess_data(DATA_PATH)
df = get_lemmatized_data(df)

X = df['text']
y = df['logical_fallacies']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y)

tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # Bigrams (or even trigrams)
    max_features=1000,   # Limit feature space to the most important words
    min_df=5,            # Consider words that appear in at least 5 documents
    max_df=0.9)       # Ignore words that appear in more than 90% of documents)
    
X_vectorized = tfidf_vectorizer.fit_transform(X_train)
X_vectorized_test = tfidf_vectorizer.transform(X_test)

one_model = LogisticRegression(
    penalty='l2',  # most of the solvers only work with l2
    random_state=42,
    class_weight='balanced', 
    verbose=0, 
    n_jobs=-1, 
    )

param_grid_one = {
    # 'penalty' :['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10],
    'max_iter': [100, 500, 1000],
    'solver': ['lbfgs', 'newton-cg', 'sag', 'saga', 'liblinear']
}

cv = StratifiedKFold(5)

grid_search_one = GridSearchCV(one_model, param_grid_one, cv=cv)
grid_search_one.fit(X_vectorized, y_train)

best_params_one = grid_search_one.best_params_
print(best_params_one)

best_model_one = grid_search_one.best_estimator_
y_train_pred_one = best_model_one.predict(X_vectorized)
y_test_pred_one = best_model_one.predict(X_vectorized_test)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 1, 'max_iter': 100, 'solver': 'liblinear'}




In [12]:
logger.info('get train metrics for one-step approach')
classification_report_one_train = get_metrics(y_train, y_train_pred_one)

INFO:root:get train metrics for one-step approach
INFO:basic_functions:classification_report
INFO:basic_functions:confusion_matrix


                       precision    recall  f1-score   support

           ad_hominem       0.63      0.67      0.65       325
  appeal_to_authority       0.67      0.74      0.71       229
    appeal_to_emotion       0.65      0.72      0.69       487
        false_dilemma       0.77      0.79      0.78       299
faulty_generalization       0.70      0.57      0.63       445
                 none       0.83      0.82      0.83      1715

             accuracy                           0.75      3500
            macro avg       0.71      0.72      0.71      3500
         weighted avg       0.76      0.75      0.75      3500

[[ 219    9   37    8   10   42]
 [  10  170   14    4    2   29]
 [  30   18  352   11   25   51]
 [   6    1    8  235    5   44]
 [  16   10   37   12  252  118]
 [  67   45   90   37   65 1411]]


In [14]:
logger.info('get test metrics for one-step approach')
classification_report_one_test = get_metrics(y_test, y_test_pred_one)

INFO:root:get test metrics for one-step approach
INFO:basic_functions:classification_report
INFO:basic_functions:confusion_matrix


                       precision    recall  f1-score   support

           ad_hominem       0.54      0.56      0.55       140
  appeal_to_authority       0.42      0.53      0.47        98
    appeal_to_emotion       0.52      0.58      0.55       208
        false_dilemma       0.60      0.65      0.62       128
faulty_generalization       0.49      0.38      0.43       191
                 none       0.76      0.74      0.75       735

             accuracy                           0.63      1500
            macro avg       0.56      0.57      0.56      1500
         weighted avg       0.64      0.63      0.63      1500

[[ 78  13  18   6   6  19]
 [  5  52  10   0   4  27]
 [ 18  14 121   4  20  31]
 [  3   2   8  83   3  29]
 [ 13   9  23  11  73  62]
 [ 28  34  51  34  44 544]]
