# Logistic Regression

### Import necessary packages

In [1]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
import pandas as pd

import mlflow
import logging 
import config 

from basic_functions import (
    get_preprocess_data,
    get_lemmatized_data,
    get_metrics
)


  from .autonotebook import tqdm as notebook_tqdm


### Setup

In [2]:
# Run-level constants: model label, MLflow tracking server and experiment name.
MODEL_NAME = "logistic_regression"
# Read the tracking URI inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open("../.mlflow_uri") as uri_file:
    TRACKING_URI = uri_file.read().strip()
EXPERIMENT_NAME = config.EXPERIMENT_NAME

# Configure logging format to show a timestamp before every message.
# NOTE(review): the recorded cell outputs show the default "INFO:root:..." format,
# so this call may be a no-op because the root logger already had handlers when it
# ran — consider passing force=True if the timestamp format is required.
logging.basicConfig(format="%(asctime)s: %(message)s")

# Root logger: only show records that are INFO or more important (WARNING, ERROR); DEBUG is ignored.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [3]:
# CSV file handed to get_preprocess_data() further down.
# Alternative dataset, kept for switching by hand:
# DATA_PATH = "../data/data_dropped_duplicates_small.csv"
DATA_PATH = "../data/data_tiny.csv"

In [4]:
# Point MLflow at the tracking server / experiment and open the run that stays
# active for the rest of the notebook (it is closed by mlflow.end_run() at the end).
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.start_run()
run = mlflow.active_run()
# Both tags are derived from MODEL_NAME so the run name cannot drift from the model tag.
mlflow.set_tag("model_name", MODEL_NAME)
mlflow.set_tag("mlflow.runName", MODEL_NAME)
# mlflow.log_params(params)

### Get and process data

In [5]:
# Load the dataset at DATA_PATH through the project helper; the preprocessing itself
# lives in basic_functions and is not visible in this notebook.
df = get_preprocess_data(DATA_PATH)

In [6]:
def binary_classification(x):
    """Collapse a fallacy label into the two-class target.

    Returns 'none' when ``x`` equals 'none'; every other value maps to 'fallacy'.
    """
    return 'none' if x == 'none' else 'fallacy'

In [7]:
# Copy of the full dataset with an extra column holding the 'none' / 'fallacy' target.
df_binary = df.assign(
    two_class_target=df['logical_fallacies'].apply(binary_classification)
)

In [8]:
# Multi-class dataset: copy the full frame, then keep only rows whose label is an actual fallacy.
df_multi_class = df.copy().loc[lambda frame: frame["logical_fallacies"] != 'none']

In [9]:
# nltk.download('wordnet')  # One-time download; left commented out because the corpus is already installed.

### Lemmatize text

In [10]:
# Run both datasets through the shared lemmatization helper from basic_functions.
# NOTE(review): each name is rebound to the helper's output, so re-running this cell
# feeds already-lemmatized data back in — presumably harmless, but confirm.
df_binary = get_lemmatized_data(df_binary)
df_multi_class = get_lemmatized_data(df_multi_class)

### Train-test split

In [11]:
# Features are the text column; the target frame carries both label columns so the
# detailed fallacy label travels through the split alongside the binary target.
X_binary = df_binary["text"]
y_binary = df_binary[["two_class_target", "logical_fallacies"]]

# 70/30 split, stratified on the two label columns together.
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_binary,
    y_binary,
    test_size=0.30,
    stratify=y_binary,
    random_state=42,
)

In [12]:
# Separate the detailed fallacy label from the two-class training target.
# NOTE(review): y_train_b is rebound from the two-column frame to a single column,
# so this cell can only be run once per split.
fa_train, y_train_b = y_train_b["logical_fallacies"], y_train_b["two_class_target"]

In [13]:
# Same separation for the test split: detailed fallacy label vs. two-class target.
# NOTE(review): y_test_b is rebound from the two-column frame to a single column,
# so this cell can only be run once per split.
fa_test, y_test_b = y_test_b["logical_fallacies"], y_test_b["two_class_target"]

In [14]:
# Multi-class task: text in, fallacy type out ('none' rows were removed earlier).
X_multi = df_multi_class["text"]
y_multi = df_multi_class["logical_fallacies"]

# 70/30 split, stratified on the fallacy label.
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_multi,
    y_multi,
    test_size=0.30,
    stratify=y_multi,
    random_state=42,
)

### Logistic Regression

#### thoughts for the moment

- pipeline one: binary first, classification next 
- pipeline 2: classification from the get-go
- add stemming? Not now, since we have the lemmatization
- add bag of words? not now
- For a multi-class problem:
    - `multi_class = "multinomial"`
- `class_weight = "balanced"`

In [15]:
# Use a TF-IDF vectorizer to turn the text into numerical features (binary task).
# The vectorizer is fitted on the training text only; the test text is transformed
# with that same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
X_vectorized_b = tfidf_vectorizer.fit_transform(X_train_b)
X_vectorized_test_b = tfidf_vectorizer.transform(X_test_b)

In [16]:
# Separate TF-IDF vectorizer for the multi-class task, fitted on the multi-class
# training text only and reused to transform the multi-class test text.
tfidf_vectorizer_m = TfidfVectorizer()
X_vectorized_m = tfidf_vectorizer_m.fit_transform(X_train_m)
X_vectorized_test_m = tfidf_vectorizer_m.transform(X_test_m)

### Initialize the model

#### Binary classification

In [17]:
# Base estimator for the binary ('none' vs 'fallacy') task; C, max_iter and solver
# are supplied by the grid search in the following cells.
bin_model = LogisticRegression(
    penalty="l2",  # most of the solvers only work with l2
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
    verbose=0,
)

In [18]:
# Hyper-parameter grid explored for the binary classifier.
param_grid_bin = dict(
    C=[0.1, 1, 10, 100],
    max_iter=[100, 500, 1000],
    solver=['lbfgs', 'newton-cg', 'sag', 'saga', 'liblinear'],
)

In [19]:
# 5-fold cross-validated search over param_grid_bin on the binary training data.
grid_search_b = GridSearchCV(
    estimator=bin_model,
    param_grid=param_grid_bin,
    cv=5,
).fit(X_vectorized_b, y_train_b)

# Keep the winning parameter combination around and show it.
best_params_b = grid_search_b.best_params_
print(best_params_b)



{'C': 1, 'max_iter': 100, 'solver': 'saga'}


In [20]:

# Take the estimator the grid search selected and predict the two-class label
# for both the training and the test features.
best_model_b = grid_search_b.best_estimator_
y_train_pred_b = best_model_b.predict(X_vectorized_b)
y_test_pred_b = best_model_b.predict(X_vectorized_test_b)

# Parameter logging to MLflow is currently disabled:
# mlflow.log_params(best_params_b)

#### Multi-class classification

In [21]:
# Base estimator for the multi-class (fallacy type) task; C, max_iter and solver
# are supplied by the grid search in the following cells.
# The explicit multi_class='multinomial' argument was dropped: it is deprecated since
# scikit-learn 1.5, and the default already fits a multinomial model for
# multi-class targets with the lbfgs / newton-cg / sag / saga solvers used in the grid.
multi_model = LogisticRegression(
    penalty='l2',  # most of the solvers only work with l2
    class_weight="balanced",
    random_state=42,
    verbose=0,
    n_jobs=-1,
)

In [22]:

# Hyper-parameter grid explored for the multi-class classifier.
param_grid_multi = dict(
    C=[0.1, 1, 10, 100],
    max_iter=[100, 500, 1000],
    solver=['lbfgs', 'newton-cg', 'sag', 'saga'],
)

In [23]:
# 5-fold cross-validated search over param_grid_multi on the multi-class training data.
grid_search_m = GridSearchCV(
    estimator=multi_model,
    param_grid=param_grid_multi,
    cv=5,
).fit(X_vectorized_m, y_train_m)

# Keep the winning parameter combination around and show it.
best_params_m = grid_search_m.best_params_
print(best_params_m)



{'C': 1, 'max_iter': 100, 'solver': 'saga'}




In [24]:

# Take the estimator the grid search selected and predict on the training features.
# Only train predictions are made here; this model's test-side predictions are
# produced in the two-step section below.
best_model_m = grid_search_m.best_estimator_
y_train_pred_m = best_model_m.predict(X_vectorized_m)

# Parameter logging to MLflow is currently disabled.
# NOTE(review): the disabled call references best_params_b, although this cell is
# about the multi-class model — presumably best_params_m was intended.
# mlflow.log_params(best_params_b)

In [25]:
# # Save model to pickle file:
# with open('../models/svm/svm_model.pkl', 'wb') as f:
#     pickle.dump(best_model, f)

#### Two-step approach

In [26]:
# Collect the test text, both ground-truth labels and the stage-one (binary)
# prediction side by side as input for the second stage.
df_pred = pd.DataFrame(
    {
        "text": X_test_b,
        "logical_fallacies": fa_test,
        "two_class_target": y_test_b,
        "binary_prediction": y_test_pred_b,
    }
)
df_pred.head()

Unnamed: 0,text,logical_fallacies,two_class_target,binary_prediction
683,"the analysis , published by the breakthrough n...",faulty_generalization,fallacy,fallacy
15869,: trump ha moved on to statue and lobsters. he...,faulty_generalization,fallacy,fallacy
13313,: is there anyone with knowledge concerning th...,none,none,none
1144,all thing being equal – “ and it ’ s only got ...,faulty_generalization,fallacy,fallacy
19611,"now, finally, if you can't pay the bill you're...",appeal_to_emotion,fallacy,fallacy


In [27]:
# Keep only the test rows that the binary stage did NOT predict as "none";
# only these rows are passed on to the multi-class model.
df_pred = df_pred[df_pred["binary_prediction"] != "none"]

In [28]:
# Vectorize the remaining rows with the multi-class TF-IDF vectorizer, i.e. the one
# fitted on the data the multi-class model was trained on.
X_df_pred = tfidf_vectorizer_m.transform(df_pred["text"])
X_df_pred

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 210 stored elements and shape (17, 584)>

In [29]:
# Second stage: predict the fallacy type for the rows that passed the binary stage.
y_test_pred_m = best_model_m.predict(X_df_pred)


### Evaluation


In [None]:
# def log_metrics(cr, split):
#     for key, value in cr.items():
#         if (key == "accuracy"):
#                 # print(f"{split}_{key}", round(value,2))
#                 mlflow.log_metric(f"{split}_{key}", value)
#         else:
#             for metric in value:
#                 mlflow.log_metric(f"{split}_{key}_{metric}", value.get(metric))
#                 # print(f"{split}_{key}_{metric}", round(value.get(metri

In [31]:
# Compare the true fallacy label with the second-stage prediction.
# NOTE(review): df_pred only holds rows the binary stage labelled as a fallacy
# (it was filtered in an earlier cell), so test rows predicted "none" in stage one
# are not part of this report — confirm that this is the intended evaluation set.
# Rows whose true label is 'none' can also never be predicted correctly here,
# because the multi-class model was trained on data without the 'none' class.
logger.info('get test metrics for the two step approach')
classification_report_test = get_metrics(df_pred["logical_fallacies"], y_test_pred_m)
# log_metrics(classification_report_test, "test")

INFO:root:get test metrics for the two step approach
INFO:basic_functions:classification_report
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:basic_functions:confusion_matrix


                       precision    recall  f1-score   support

           ad_hominem       0.00      0.00      0.00         2
  appeal_to_authority       0.00      0.00      0.00         1
    appeal_to_emotion       0.43      0.50      0.46         6
        false_dilemma       0.17      1.00      0.29         1
faulty_generalization       0.50      0.33      0.40         3
                 none       0.00      0.00      0.00         4

             accuracy                           0.29        17
            macro avg       0.18      0.31      0.19        17
         weighted avg       0.25      0.29      0.25        17

[[0 0 1 1 0 0]
 [0 0 1 0 0 0]
 [0 0 3 2 1 0]
 [0 0 0 1 0 0]
 [0 0 1 1 1 0]
 [1 1 1 1 0 0]]


In [None]:
# Train-split metrics for the binary classifier.
# The original call used y_test / y_test_pred, which are never defined in this
# notebook; the binary training target and its predictions are used instead, in line
# with the log message and the result variable name.
logger.info('get train metrics for binary classification')
classification_report_b_train = get_metrics(y_train_b, y_train_pred_b)
# log_metrics only exists as commented-out code in this notebook, so the MLflow
# logging call stays disabled here, as it is for the test metrics.
# log_metrics(classification_report_b_train, "train")

In [None]:
# Close the MLflow run that was opened with mlflow.start_run() in the setup section.
mlflow.end_run()

### Naive Bayes

In [None]:
# Pipeline for TF-IFD and Naive Bayes
# pipeline_bayes = Pipeline([
#     ('tfidf', TfidfVectorizer()),
#     ('nb', MultinomialNB()),
# ])

# # Train the model
# pipeline_bayes.fit(X_train, y_train)

# # Predict on train and test data
# y_train_pred_bayes = pipeline_bayes.predict(X_train)
# y_test_pred_bayes = pipeline_bayes.predict(X_test)

# __compute_and_log_metrics(y_train, y_train_pred_bayes, "train")
# __compute_and_log_metrics(y_test, y_test_pred_bayes, "test")