In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Read the cleaned interview dataset and print its schema summary
# (column names, non-null counts, dtypes).
df = pd.read_csv(filepath_or_buffer='sample_data/cleaned_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3448 entries, 0 to 3447
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   president              3448 non-null   object
 1   question_order         3448 non-null   int64 
 2   interview_question     3448 non-null   object
 3   interview_answer       3448 non-null   object
 4   question               3448 non-null   object
 5   inaudible              3448 non-null   bool  
 6   multiple_questions     3448 non-null   bool  
 7   affirmative_questions  3448 non-null   bool  
 8   index                  3448 non-null   int64 
 9   clarity_label          3448 non-null   object
 10  evasion_label          3448 non-null   object
dtypes: bool(3), int64(2), object(6)
memory usage: 225.7+ KB


In [None]:
# Name of the label column every classifier below is trained to predict.
TARGET_COLUMN = 'evasion_label'

# Fill missing text BEFORE casting to str. In the opposite order,
# astype(str) turns NaN into the literal string 'nan', which leaves
# fillna('') with nothing to fill and lets 'nan' leak into the model input.
df['question'] = df['question'].fillna('').astype(str)
df['interview_answer'] = df['interview_answer'].fillna('').astype(str)

# NOTE(review): a missing label would become the class 'nan' here; df.info()
# above reports no nulls in this column, so no rows are dropped.
df[TARGET_COLUMN] = df[TARGET_COLUMN].astype(str)

# Show how many rows fall into each target class.
print(f"Classes distribution {TARGET_COLUMN}:")
print(df[TARGET_COLUMN].value_counts())

Classes distribution evasion_label:
evasion_label
Explicit               1052
Dodging                 706
Implicit                488
General                 386
Deflection              381
Declining to answer     145
Claims ignorance        119
Clarification            92
Partial/half-answer      79
Name: count, dtype: int64


In [None]:
# Model input: the question and the answer joined by a literal " [SEP] " marker.
df['text_input'] = df['question'].add(" [SEP] ").add(df['interview_answer'])

In [None]:
# Hold out 20% of the rows for testing, stratified on the target so both
# splits keep the same class proportions; the seed makes the split repeatable.
text_series = df['text_input']
label_series = df[TARGET_COLUMN]
X_train, X_test, y_train, y_test = train_test_split(
    text_series,
    label_series,
    test_size=0.2,
    random_state=42,
    stratify=label_series,
)

# TF-IDF features limited to at most 5000 terms. The vocabulary and IDF
# weights are learned from the training split only; the test split is merely
# transformed with them.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Logistic regression with class weights balanced against class frequency,
# fitted on the TF-IDF training matrix.
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train_vec, y_train)

# Predict the held-out rows and report accuracy plus per-class metrics.
preds = model.predict(X_test_vec)
test_accuracy = accuracy_score(y_test, preds)
print(
    f"Accuracy: {test_accuracy:.4f}",
    "\nDetailed report (Precision, Recall, F1):",
    classification_report(y_test, preds),
    sep="\n",
)

# Raw confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_test, preds)
print("\nConfusion Matrix:", cm, sep="\n")

Accuracy: 0.2696

Detailed report (Precision, Recall, F1):
                     precision    recall  f1-score   support

   Claims ignorance       0.23      0.38      0.29        24
      Clarification       0.29      0.78      0.42        18
Declining to answer       0.19      0.34      0.25        29
         Deflection       0.19      0.29      0.23        76
            Dodging       0.31      0.23      0.26       141
           Explicit       0.50      0.24      0.32       211
            General       0.26      0.27      0.26        77
           Implicit       0.22      0.26      0.24        98
Partial/half-answer       0.08      0.19      0.12        16

           accuracy                           0.27       690
          macro avg       0.25      0.33      0.26       690
       weighted avg       0.32      0.27      0.27       690


Confusion Matrix:
[[ 9  1  2  3  4  2  0  2  1]
 [ 0 14  0  0  3  0  1  0  0]
 [ 1  5 10  4  2  2  2  1  2]
 [ 6  2  3 22 13  3  8 18  1]
 [ 6  

In [None]:
from sklearn import svm

# Linear-kernel support vector classifier with balanced class weights,
# trained on the same TF-IDF features as the previous model.
svm_clf = svm.SVC(class_weight='balanced', kernel='linear', random_state=42)
svm_clf.fit(X_train_vec, y_train)
preds = svm_clf.predict(X_test_vec)

# Same evaluation as above: accuracy, per-class report, confusion matrix.
report_lines = [
    f"Accuracy: {accuracy_score(y_test, preds):.4f}",
    "\nDetailed report (Precision, Recall, F1):",
    classification_report(y_test, preds),
]
print(*report_lines, sep="\n")

cm = confusion_matrix(y_test, preds)
print("\nConfusion Matrix:")
print(cm)

Accuracy: 0.2899

Detailed report (Precision, Recall, F1):
                     precision    recall  f1-score   support

   Claims ignorance       0.28      0.29      0.29        24
      Clarification       0.48      0.67      0.56        18
Declining to answer       0.32      0.34      0.33        29
         Deflection       0.19      0.34      0.24        76
            Dodging       0.30      0.26      0.28       141
           Explicit       0.52      0.25      0.34       211
            General       0.24      0.27      0.25        77
           Implicit       0.23      0.32      0.26        98
Partial/half-answer       0.13      0.19      0.15        16

           accuracy                           0.29       690
          macro avg       0.30      0.33      0.30       690
       weighted avg       0.34      0.29      0.30       690


Confusion Matrix:
[[ 7  1  1  4  6  2  0  2  1]
 [ 0 12  0  0  5  0  1  0  0]
 [ 0  0 10  4  6  3  2  2  2]
 [ 5  0  1 26 12  3 11 18  0]
 [ 6  

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes baseline. The sparse TF-IDF matrices are converted to
# dense arrays with .toarray() before being passed to fit/predict.
gnb = GaussianNB()
gnb.fit(X_train_vec.toarray(), y_train)
preds = gnb.predict(X_test_vec.toarray())

# Report accuracy and per-class precision/recall/F1 on the test split.
gnb_accuracy = accuracy_score(y_test, preds)
print(f"Accuracy: {gnb_accuracy:.4f}")
print("\nDetailed report (Precision, Recall, F1):")
print(classification_report(y_test, preds))

# Confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_test, preds)
print("\nConfusion Matrix:", cm, sep="\n")

Accuracy: 0.2217

Detailed report (Precision, Recall, F1):
                     precision    recall  f1-score   support

   Claims ignorance       0.11      0.12      0.12        24
      Clarification       0.32      0.33      0.32        18
Declining to answer       0.14      0.10      0.12        29
         Deflection       0.20      0.32      0.24        76
            Dodging       0.25      0.17      0.20       141
           Explicit       0.33      0.25      0.28       211
            General       0.16      0.22      0.18        77
           Implicit       0.21      0.23      0.22        98
Partial/half-answer       0.03      0.06      0.04        16

           accuracy                           0.22       690
          macro avg       0.19      0.20      0.19       690
       weighted avg       0.24      0.22      0.23       690


Confusion Matrix:
[[ 3  0  0  6  3  2  4  4  2]
 [ 2  6  0  3  2  0  2  1  2]
 [ 1  1  3  4  5  9  1  3  2]
 [ 3  0  1 24 11 20  6 10  1]
 [ 3  

In [None]:
# Collect the distinct (evasion_label, clarity_label) pairs that occur in the
# data and turn them into a lookup from fine-grained to coarse label.
# If an evasion label were paired with several clarity labels, the last pair
# encountered would win.
mapping_df = df.loc[:, ['evasion_label', 'clarity_label']].drop_duplicates()
evasion_to_clarity_map = {
    evasion: clarity
    for evasion, clarity in mapping_df.itertuples(index=False, name=None)
}

evasion_to_clarity_map

{'Explicit': 'Clear Reply',
 'General': 'Ambivalent',
 'Partial/half-answer': 'Ambivalent',
 'Dodging': 'Ambivalent',
 'Implicit': 'Ambivalent',
 'Deflection': 'Ambivalent',
 'Declining to answer': 'Clear Non-Reply',
 'Claims ignorance': 'Clear Non-Reply',
 'Clarification': 'Clear Non-Reply'}

In [None]:
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# NOTE(review): train_evaluate is not defined in the visible part of this
# notebook; it must already exist in the kernel for this cell to run. Make
# sure its defining cell sits above this one so Restart & Run All works.
# The later generate_tabel(...) calls use these estimators for prediction,
# so train_evaluate presumably fits them in place -- TODO confirm.

svm_clf = svm.SVC(kernel='linear', class_weight='balanced')
train_evaluate("SVM", svm_clf, X_train_vec, y_train, X_test_vec, y_test)

log_clf = LogisticRegression(class_weight='balanced', max_iter=1000)
train_evaluate("Logistic Regression", log_clf, X_train_vec, y_train, X_test_vec, y_test)

nb_clf = MultinomialNB()
# The trailing semicolon stops the notebook from echoing the call's return
# value (a tuple of full prediction arrays) as the cell output.
train_evaluate("Naive Bayes", nb_clf, X_train_vec, y_train, X_test_vec, y_test);

--- Traing SVM on Evasion Labels ---
[SVM] Evasion F1-Macro: 0.3015
[SVM] Clarity F1-Macro: 0.4886
[SVM] Clarity Accuracy: 0.5986
--------------------------------------------------
--- Traing Logistic Regression on Evasion Labels ---
[Logistic Regression] Evasion F1-Macro: 0.2645
[Logistic Regression] Clarity F1-Macro: 0.4714
[Logistic Regression] Clarity Accuracy: 0.5652
--------------------------------------------------
--- Traing Naive Bayes on Evasion Labels ---
[Naive Bayes] Evasion F1-Macro: 0.0659
[Naive Bayes] Clarity F1-Macro: 0.1791
[Naive Bayes] Clarity Accuracy: 0.3246
--------------------------------------------------


(array(['Explicit', 'Explicit', 'Explicit', 'Explicit', 'Explicit',
        'Explicit', 'Explicit', 'Explicit', 'Explicit', 'Explicit',
        'Explicit', 'Explicit', 'Explicit', 'Explicit', 'Explicit',
        'Explicit', 'Explicit', 'Explicit', 'Explicit', 'Explicit',
        'Explicit', 'Explicit', 'Explicit', 'Explicit', 'Explicit',
        'Explicit', 'Explicit', 'Explicit', 'Explicit', 'Explicit',
        'Explicit', 'Explicit', 'Explicit', 'Explicit', 'Explicit',
        'Explicit', 'Explicit', 'Explicit', 'Explicit', 'Explicit',
        'Explicit', 'Explicit', 'Explicit', 'Explicit', 'Explicit',
        'Explicit', 'Explicit', 'Explicit', 'Explicit', 'Explicit',
        'Explicit', 'Explicit', 'Explicit', 'Explicit', 'Explicit',
        'Explicit', 'Explicit', 'Explicit', 'Explicit', 'Explicit',
        'Explicit', 'Explicit', 'Dodging', 'Explicit', 'Explicit',
        'Dodging', 'Explicit', 'Explicit', 'Explicit', 'Explicit',
        'Explicit', 'Explicit', 'Explicit', 'Expli

In [None]:
from sklearn.metrics import accuracy_score, precision_score, f1_score

# Rebuild the evasion -> clarity lookup from the distinct label pairs found
# in the data (a later duplicate key would overwrite an earlier one).
label_columns = ['evasion_label', 'clarity_label']
mapping_df = df[label_columns].drop_duplicates()
evasion_to_clarity_map = dict(mapping_df.itertuples(index=False, name=None))

def generate_tabel(nume_model, model, X_test, y_test_evasion, label_map=None):
    """Print a metrics table for a fitted model at both label granularities.

    The model predicts evasion labels for ``X_test``. Predictions and ground
    truth are then translated to clarity labels through ``label_map`` and
    accuracy, weighted precision and weighted F1 are printed for the clarity
    level first and the evasion level second.

    Args:
        nume_model: Display name printed in the table title.
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test_evasion: Iterable of true evasion labels for ``X_test``.
        label_map: Mapping from evasion label to clarity label. Defaults to
            the notebook-level ``evasion_to_clarity_map``.

    Returns:
        None. The table is written to stdout.

    Raises:
        KeyError: If a true or predicted evasion label is missing from a
            dict ``label_map``.
    """
    if label_map is None:
        # Fall back to the notebook global so existing four-argument calls
        # keep working unchanged.
        label_map = evasion_to_clarity_map

    def _weighted_scores(y_true, y_pred):
        # Accuracy, weighted precision and weighted F1 for one label level.
        # zero_division=0 is passed to BOTH precision and F1 so a class that
        # is never predicted scores 0 without an UndefinedMetricWarning.
        return (
            accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred, average='weighted', zero_division=0),
            f1_score(y_true, y_pred, average='weighted', zero_division=0),
        )

    preds_evasion = model.predict(X_test)
    preds_clarity = [label_map[label] for label in preds_evasion]
    y_test_clarity = [label_map[label] for label in y_test_evasion]

    sections = (
        ("Clarity", _weighted_scores(y_test_clarity, preds_clarity)),
        ("Evasion", _weighted_scores(y_test_evasion, preds_evasion)),
    )

    rule = "-" * 60
    print(f"\nResults for ({nume_model}):")
    print(rule)
    print(f"{'Metric':<25} | {'Value'}")
    print(rule)

    row_number = 0
    for level, scores in sections:
        for metric_name, value in zip(("Accuracy", "Precision", "F1 Score"), scores):
            row_number += 1
            row_label = f"{row_number}. {metric_name} {level}"
            # Same 25-wide column and " | " separator as the header, so the
            # pipes line up (the rows used to sit one column to the left).
            print(f"{row_label:<25} | {value:.4f}")
        print(rule)


# Print the metrics table for each of the three classifiers, in this order.
models_to_report = (
    ("SVM", svm_clf),
    ("Logistic Regression", log_clf),
    ("Naive Bayes", nb_clf),
)
for model_name, fitted_model in models_to_report:
    generate_tabel(model_name, fitted_model, X_test_vec, y_test)


Results for (SVM):
------------------------------------------------------------
Metric                    | Value
------------------------------------------------------------
1. Accuracy Clarity      | 0.5986
2. Precision Clarity     | 0.5820
3. F1 Score Clarity      | 0.5708
------------------------------------------------------------
4. Accuracy Evasion      | 0.2899
5. Precision Evasion     | 0.3395
6. F1 Score Evasion      | 0.2959
------------------------------------------------------------

Results for (Logistic Regression):
------------------------------------------------------------
Metric                    | Value
------------------------------------------------------------
1. Accuracy Clarity      | 0.5652
2. Precision Clarity     | 0.5747
3. F1 Score Clarity      | 0.5502
------------------------------------------------------------
4. Accuracy Evasion      | 0.2696
5. Precision Evasion     | 0.3211
6. F1 Score Evasion      | 0.2740
-----------------------------------------