# Experimentation with Non-DL SVM

In [1]:
import ast
import json
import logging
import os

import joblib
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from modules.connlu_converter import convert_to_connlu
from modules.data_loader import load_initial_data
from modules.text_normalization import normalize_text
from modules.text_segmentation import tokenize_text, handle_unusual_sentences
from modules.utils import setup_logging

In [2]:
# Configure logging for the notebook and obtain this module's logger
setup_logging()
logger = logging.getLogger(__name__)

# Anchor all paths at this file's directory when `__file__` is defined;
# otherwise (e.g. in an interactive kernel) fall back to the working directory
base_path = (
    os.path.dirname(os.path.abspath(__file__))
    if '__file__' in globals()
    else os.getcwd()
)
documents_path = os.path.join(base_path, "../training_data_16_October_release/EN/raw-documents")
annotations_file = os.path.join(base_path, "../training_data_16_October_release/EN/subtask-2-annotations.txt")
output_dir = os.path.join(base_path, "../CoNLL")

# Ensure the CoNLL output directory exists (no error if it is already there)
os.makedirs(output_dir, exist_ok=True)

# Step 1: load the raw documents together with their annotations
logger.info("Loading initial data...")
df = load_initial_data(documents_path, annotations_file)
logger.info(f"Loaded {len(df)} documents")

# Step 2: tokenize the document text
logger.info("Tokenizing text...")
df = tokenize_text(df)

# Step 3: post-process unusual sentences
logger.info("Handling unusual sentences...")
df = handle_unusual_sentences(df)

# Step 4: normalize the tokens
logger.info("Normalizing text...")
df = normalize_text(df)

# Step 5 (optional): CoNLL-U export. Enable only when that format is needed:
#   logger.info("Converting to CoNLL-U format...")
#   convert_to_connlu(df, output_dir, 'tokens')
logger.info("Preprocessing completed successfully")

2024-12-13 21:44:40,662 - __main__ - INFO - Loading initial data...
2024-12-13 21:44:43,030 - __main__ - INFO - Loaded 198 documents
2024-12-13 21:44:43,031 - __main__ - INFO - Tokenizing text...
2024-12-13 21:44:44,104 - __main__ - INFO - Handling unusual sentences...
2024-12-13 21:44:44,106 - __main__ - INFO - Normalizing text...
2024-12-13 21:44:44,107 - modules.text_normalization - INFO - Using device: cpu


2024-12-13 21:44:44 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| lemma     | combined_nocharlm |



2024-12-13 21:44:44,125 - stanza - INFO - Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| lemma     | combined_nocharlm |



2024-12-13 21:44:44 INFO: Using device: cpu


2024-12-13 21:44:44,127 - stanza - INFO - Using device: cpu


2024-12-13 21:44:44 INFO: Loading: tokenize


2024-12-13 21:44:44,129 - stanza - INFO - Loading: tokenize


2024-12-13 21:44:44 INFO: Loading: lemma


2024-12-13 21:44:44,131 - stanza - INFO - Loading: lemma


  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-12-13 21:44:46 INFO: Done loading processors!


2024-12-13 21:44:46,202 - stanza - INFO - Done loading processors!
2024-12-13 21:44:46,234 - modules.text_normalization - INFO - Starting processing of 198 rows in 4 batches


Normalizing text: 100%|██████████| 4/4 [00:16<00:00,  4.11s/it]

2024-12-13 21:45:02,689 - __main__ - INFO - Preprocessing completed successfully





In [3]:
# Preview the first rows of the preprocessed frame to sanity-check its columns
df.head()

Unnamed: 0,filename,content,topic,narrative_subnarrative_pairs,tokens,tokens_normalized
0,EN_UA_103861.txt,The World Needs Peacemaker Trump Again \n\n by...,UA,"[{'narrative': 'Other', 'subnarrative': 'Other'}]","[[The, World, Needs, Peacemaker, Trump, Again,...","[world, need, peacemaker, trump, jeff, crouere..."
1,EN_UA_103667.txt,Desperation and Diplomacy: North Korea's Tech ...,UA,"[{'narrative': 'Other', 'subnarrative': 'Other'}]","[[Desperation, and, Diplomacy, :, North, Korea...","[desperation, diplomacy, north, korea, tech, h..."
2,EN_UA_021270.txt,"Ukraine's Fate Will Be Decided In Coming Year,...",UA,"[{'narrative': 'Speculating war outcomes', 'su...","[[Ukraine, 's, Fate, Will, Be, Decided, In, Co...","[ukraine, fate, decide, come, year, top, zelen..."
3,EN_UA_103403.txt,Russia Stages Major Airstrike on Ukraine; One ...,UA,"[{'narrative': 'Other', 'subnarrative': 'Other'}]","[[Russia, Stages, Major, Airstrike, on, Ukrain...","[russia, stage, major, airstrike, ukraine, one..."
4,EN_CC_100145.txt,Strategy needed to preserve water resources in...,CC,"[{'narrative': 'Other', 'subnarrative': 'Other'}]","[[Strategy, needed, to, preserve, water, resou...","[strategy, need, preserve, water, resource, pa..."


In [4]:
def create_label_mapping(all_narratives):
    """
    Build a deterministic index for every distinct narrative entry.

    Each narrative dictionary is identified by its ``str()`` representation.
    The distinct strings are sorted and numbered consecutively from 0, so the
    same set of narratives always yields the same mapping.

    Args:
        all_narratives: List of lists of narrative dictionaries

    Returns:
        dict: Mapping from narrative string to numeric index
    """
    # Flatten the per-document lists and de-duplicate by string form
    distinct_keys = {
        str(entry)
        for document_narratives in all_narratives
        for entry in document_narratives
    }

    # Sorted order fixes the index assignment independently of input order
    narrative_to_idx = dict(zip(sorted(distinct_keys), range(len(distinct_keys))))

    logger.info(f"Created mapping for {len(narrative_to_idx)} unique narratives")
    return narrative_to_idx

def get_first_narrative_label(narrative_list, label_mapping):
    """
    Look up the numeric label of the first narrative in a list.

    Args:
        narrative_list: List of narrative dictionaries (may be empty or None)
        label_mapping: Dictionary mapping narrative strings to indices

    Returns:
        int: Index of the first narrative's ``str()`` form in ``label_mapping``,
        or None when ``narrative_list`` is empty/falsy

    Raises:
        KeyError: If the first narrative is not present in ``label_mapping``
    """
    # Guard clause: nothing to look up for a missing or empty list
    if not narrative_list or len(narrative_list) <= 0:
        return None

    first_key = str(narrative_list[0])
    return label_mapping[first_key]

def _parse_narrative_pairs(value):
    """Return narrative pairs as a Python object, parsing string-serialized values."""
    if isinstance(value, str):
        # literal_eval only accepts Python literals (lists, dicts, strings, ...),
        # so a crafted annotation string cannot execute code the way eval() could.
        return ast.literal_eval(value)
    return value


def prepare_data(df, label_mapping=None):
    """
    Prepare texts and single-label targets for classifier training.

    Only the FIRST narrative/subnarrative pair of each document is used as its
    label; any further pairs are ignored.

    Args:
        df: DataFrame containing tokens_normalized and narrative_subnarrative_pairs
        label_mapping: Optional pre-existing label mapping to use. When None, a
            mapping is built from the narratives found in ``df``.

    Returns:
        tuple: (texts, labels, label_mapping) where ``texts`` holds one string
        per row (token lists are space-joined, other values are passed through),
        ``labels`` holds the mapped index of each row's first narrative, and
        ``label_mapping`` is the mapping that was used. An empty frame yields
        empty ``texts`` and ``labels``.

    Raises:
        ValueError: If a row has an empty narrative list, its first narrative is
            not in ``label_mapping``, or a serialized narrative string is not a
            valid Python literal.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        # Join token lists into whitespace-separated strings; keep other values as-is
        texts = [
            ' '.join(tokens) if isinstance(tokens, list) else tokens
            for tokens in df['tokens_normalized'].tolist()
        ]

        # Narrative pairs may arrive as real lists or as their string repr
        narratives = [
            _parse_narrative_pairs(value)
            for value in df['narrative_subnarrative_pairs'].tolist()
        ]

        # Create or use label mapping
        if label_mapping is None:
            label_mapping = create_label_mapping(narratives)

        # Convert the first narrative of every row to its numeric label
        labels = []
        for narrative_list in narratives:
            if not narrative_list:
                raise ValueError("Empty narrative list found")
            label_str = str(narrative_list[0])
            if label_str not in label_mapping:
                raise ValueError(f"Unknown narrative: {label_str}")
            labels.append(label_mapping[label_str])

        logger.info(f"Number of unique labels in mapping: {len(label_mapping)}")
        # Sample logging is skipped for an empty frame instead of raising IndexError
        if texts:
            logger.info(f"Sample text: {texts[0][:100]}")
            logger.info(f"Sample label: {labels[0]}")

        return texts, labels, label_mapping

    except Exception as e:
        logger.error(f"Error in prepare_data: {str(e)}")
        # Best-effort diagnostics: never let this lookup mask the original error
        try:
            sample = df['narrative_subnarrative_pairs'].iloc[0]
        except Exception:
            sample = "<unavailable>"
        logger.error(f"Sample narrative_subnarrative_pairs: {sample}")
        raise

def compute_metrics(pred):
    """
    Derive accuracy and micro-averaged precision/recall/F1 from a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (scores whose last axis is reduced with ``argmax``)

    Returns:
        dict: Keys 'accuracy', 'f1', 'precision' and 'recall'
    """
    y_true = pred.label_ids
    # Highest-scoring entry along the last axis is taken as the predicted class
    y_hat = pred.predictions.argmax(-1)

    # Result tuple is (precision, recall, f1, support); support is not reported
    prf = precision_recall_fscore_support(y_true, y_hat, average="micro")
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [11]:
def train_svm(df, base_path, project_name="svm-training"):
    """
    Train a TF-IDF + linear SVM classifier on each document's first narrative.

    ``prepare_data`` keeps only the first narrative/subnarrative pair per
    document, so this is a single-label multiclass problem even though the
    model is wrapped in ``OneVsRestClassifier``.

    Three artifacts are written to ``base_path``:
    ``svm_model.joblib``, ``tfidf_vectorizer.joblib`` and
    ``label_mapping.json`` (the narrative-string -> index mapping that
    ``predict_svm`` reads back from the model's directory).

    Args:
        df: DataFrame containing the training data.
        base_path: Base path for saving model outputs.
        project_name: Name for the project (for logging, optional). Currently unused.

    Returns:
        dict: Classification report computed on the TRAINING data itself (there
        is no held-out split here), so the scores are optimistic.

    Raises:
        Exception: Any error is logged and re-raised unchanged.
    """
    try:
        # prepare_data builds the label mapping itself when none is supplied,
        # so the narratives only need to be parsed once
        train_texts, train_labels, label_mapping = prepare_data(df)

        # Vectorize text using TF-IDF
        vectorizer = TfidfVectorizer(max_features=5000)
        X = vectorizer.fit_transform(train_texts)
        y = train_labels

        # Train SVM using OneVsRestClassifier
        model = OneVsRestClassifier(LinearSVC())
        model.fit(X, y)

        # Make sure the target directory exists ("" means the working directory)
        if base_path:
            os.makedirs(base_path, exist_ok=True)

        # Save vectorizer, model and label mapping side by side
        model_path = os.path.join(base_path, "svm_model.joblib")
        vectorizer_path = os.path.join(base_path, "tfidf_vectorizer.joblib")
        label_mapping_path = os.path.join(base_path, "label_mapping.json")
        joblib.dump(model, model_path)
        joblib.dump(vectorizer, vectorizer_path)
        # Without this file predict_svm cannot translate indices back to labels
        with open(label_mapping_path, "w", encoding="utf-8") as f:
            json.dump(label_mapping, f, indent=2)

        logger.info(f"Model saved to {model_path}")
        logger.info(f"Vectorizer saved to {vectorizer_path}")
        logger.info(f"Label mapping saved to {label_mapping_path}")

        # Evaluate model on the data it was trained on
        y_pred = model.predict(X)
        report = classification_report(y, y_pred, output_dict=True)
        logger.info("Training classification report:")
        logger.info(json.dumps(report, indent=2))

        return report

    except Exception as e:
        logger.error(f"Error in SVM training: {str(e)}")
        raise


In [9]:
def predict_svm(text, model_path, vectorizer_path):
    """
    Make predictions using the trained SVM model.

    The label mapping is read from ``label_mapping.json`` located in the same
    directory as ``model_path``.

    Args:
        text: Input text to classify.
        model_path: Path to the saved SVM model.
        vectorizer_path: Path to the saved TF-IDF vectorizer.

    Returns:
        list: Predicted label strings (keys of the label mapping). A model
        fitted on 1-D class indices yields exactly one label; a model fitted
        on a multilabel indicator matrix yields one label per active column.

    Raises:
        Exception: Any error (missing files, unknown label index, ...) is
        logged and re-raised unchanged.
    """
    try:
        # NOTE(security): joblib.load unpickles arbitrary objects, so only load
        # model/vectorizer files that come from a trusted source.
        model = joblib.load(model_path)
        vectorizer = joblib.load(vectorizer_path)

        # Load label mapping
        mapping_path = os.path.join(os.path.dirname(model_path), "label_mapping.json")
        with open(mapping_path, 'r', encoding="utf-8") as f:
            label_mapping = json.load(f)

        # Invert the mapping: numeric index -> original label string
        idx_to_label = {int(v): k for k, v in label_mapping.items()}

        # Transform input text using vectorizer
        X = vectorizer.transform([text])

        # Predict; densify first if the estimator returned a sparse matrix
        raw = model.predict(X)
        if hasattr(raw, "toarray"):
            raw = raw.toarray()
        first = raw[0]

        try:
            # Multilabel case: `first` is an indicator row with one flag per label index
            flags = list(first)
        except TypeError:
            # Multiclass case: `first` is a single class index, not a sequence
            predicted_labels = [idx_to_label[int(first)]]
        else:
            predicted_labels = [idx_to_label[idx] for idx, flag in enumerate(flags) if flag == 1]

        logger.info(f"Predicted labels: {predicted_labels}")
        return predicted_labels

    except Exception as e:
        logger.error(f"Error in SVM prediction: {str(e)}")
        raise

In [12]:
# Train the TF-IDF + SVM model on the preprocessed frame; artifacts go to base_path
logger.info("Starting SVM training...")
training_results = train_svm(df, base_path)
logger.info(f"SVM training completed. Results: {training_results}")

2024-12-13 21:53:25,493 - __main__ - INFO - Starting BERT training...
2024-12-13 21:54:31,328 - __main__ - INFO - Created mapping for 69 unique narratives
2024-12-13 21:55:07,905 - __main__ - INFO - Number of unique labels in mapping: 69
2024-12-13 21:55:07,905 - __main__ - INFO - Sample text: world need peacemaker trump jeff crouere liberty daily world total chaos month biden presidency sout
2024-12-13 21:55:07,911 - __main__ - INFO - Sample label: 49
2024-12-13 22:03:33,372 - __main__ - INFO - Model saved to c:\Users\leonb\OneDrive\Dokumente\Studium\Master\Sem1\NLP and InfExt\practical\nlp_Backpropagandists_2024\code\svm_model.joblib
2024-12-13 22:03:33,845 - __main__ - INFO - Vectorizer saved to c:\Users\leonb\OneDrive\Dokumente\Studium\Master\Sem1\NLP and InfExt\practical\nlp_Backpropagandists_2024\code\tfidf_vectorizer.joblib
2024-12-13 22:20:38,500 - __main__ - INFO - Training classification report:
2024-12-13 22:20:38,503 - __main__ - INFO - {
  "0": {
    "precision": 1.0,
    

In [None]:
[49, 49, 66, 49, 49, 49, 4, 49, 35, 49, 49, 15, 38, 67, 55, 3, 49, 49, 33, 49, 49, 49, 47, 49, 66, 31, 12, 49, 49, 12, 49, 38, 49, 49, 49, 49, 66, 49, 24, 49, 67, 30, 49, 23, 49, 49, 34, 5, 49, 49, 49, 49, 49, 49, 36, 22, 49, 22, 35, 47, 49, 38, 2, 49, 49, 49, 61, 31, 5, 49, 38, 49, 50, 54, 49, 5, 0, 31, 31, 49, 8, 0, 49, 21, 49, 68, 49, 21, 35, 67, 49, 49, 49, 49, 49, 49, 45, 27, 49, 49, 5, 4, 49, 49, 49, 47, 49, 25, 49, 49, 49, 49, 49, 4, 68, 24, 49, 30, 3, 17, 49, 49, 33, 49, 49, 49, 49, 49, 49, 4, 30, 45, 21, 38, 16, 49, 5, 49, 30, 49, 49, 61, 45, 49, 50, 4, 3, 49, 22, 49, 49, 50, 49, 49, 57, 20, 12, 33, 49, 5, 49, 49, 49, 49, 49, 49, 49, 36, 2, 3, 17, 49, 12, 38, 49, 2, 36, 21, 43, 4, 49, 67, 49, 15, 63, 49, 65, 41, 49, 21, 47, 21, 49, 63, 49, 2, 5, 49]
[49 49 66 49 49 49  4 49 35 49 49 15 38 67 55  3 49 49 33 49 49 49 47 49
 66 31 12 49 49 12 49 38 49 49 49 49 66 49 24 49 67 30 49 23 49 49 34  5
 49 49 49 49 49 49 36 22 49 22 35 47 49 38  2 49 49 49 61 31  5 49 38 49
 50 54 49  5  0 31 31 49  8  0 49 21 49 68 49 21 35 67 49 49 49 49 49 49
 45 27 49 49  5  4 49 49 49 47 49 25 49 49 49 49 49  4 68 24 49 30  3 17
 49 49 33 49 49 49 49 49 49  4 30 45 21 38 16 49  5 49 30 49 49 61 45 49
 50  4  3 49 22 49 49 50 49 49 57 20 12 33 49  5 49 49 49 49 49 49 49 36
  2  3 17 49 12 38 49  2 36 21 43  4 49 67 49 15 63 49 65 41 49 21 47 21
 49 63 49  2  5 49]