In [None]:
import csv
import spacy
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from collections import Counter

# Load the small English spaCy pipeline once, at module level, so that every
# helper that parses text shares the same `nlp` object instead of reloading it.
nlp = spacy.load("en_core_web_sm")

# Example agreement and disagreement lexicons (all entries lowercase).
# Entries may be multi-word phrases (e.g. 'in favor of', 'certainly not');
# a single-token membership test can never match those, so any matcher has to
# compare them against token sequences.
# Each entry appears exactly once (a duplicated 'yes' was removed).
agreement_lexicon = ['agree', 'agreeing', 'support', 'supporting', 'in favor of', 'yes', 'certainly', 'definitely', 'sure']
disagreement_lexicon = ['disagree', 'disagreeing', 'oppose', 'opposing', 'against', 'no', 'not', 'never', 'certainly not', 'definitely not']

# Helper functions

# 1. N-gram generation
def generate_ngrams(text, n=1):
    """Return the n-grams of ``text`` as a list of token tuples.

    The text is lowercased and split with NLTK's word tokenizer before the
    n-grams are built.

    Args:
        text: Raw input string.
        n: Number of tokens per n-gram (1 = unigrams, 2 = bigrams, ...).

    Returns:
        A list of tuples, each holding ``n`` consecutive tokens.
    """
    lowered = text.lower()
    words = nltk.word_tokenize(lowered)
    grams = ngrams(words, n)
    return list(grams)

# 2. Extract Modal Verbs using spaCy
def extract_modal_verbs(text):
    """Return the modal verbs in ``text`` joined by single spaces.

    The text is parsed with the module-level spaCy pipeline ``nlp``; a token
    counts as a modal verb when its fine-grained tag is ``MD``. The result is
    an empty string when no such token is found.
    """
    return ' '.join(tok.text for tok in nlp(text) if tok.tag_ == 'MD')

# 3. Detect Negations
def detect_negation(text):
    """Return the negation tokens in ``text`` joined by single spaces.

    The text is parsed with the module-level spaCy pipeline ``nlp``; a token
    is kept when its dependency label is ``neg``. The result is an empty
    string when the parse contains no such token.
    """
    parsed = nlp(text)
    return ' '.join(tok.text for tok in parsed if tok.dep_ == 'neg')

# 4. Argument lexicon match (agreement and disagreement)
def check_argument_lexicon(text, agreement_terms=None, disagreement_terms=None):
    """Count agreement and disagreement lexicon hits in ``text``.

    The text is lowercased, split on whitespace, and each token has its
    leading/trailing punctuation stripped, so ``"Yes,"`` matches ``'yes'``.
    Lexicon entries may span several words (``'in favor of'``); they are
    matched against consecutive tokens. At each position the longest matching
    entry wins and its tokens are consumed, so ``"certainly not"`` counts once
    as disagreement instead of one hit for ``'certainly'`` plus one for
    ``'not'``.

    Args:
        text: Raw input string.
        agreement_terms: Iterable of agreement words/phrases. Defaults to the
            module-level ``agreement_lexicon``.
        disagreement_terms: Iterable of disagreement words/phrases. Defaults
            to the module-level ``disagreement_lexicon``.

    Returns:
        A tuple ``(agreement_count, disagreement_count)`` of ints.
    """
    import string  # local import: only this function needs the punctuation table

    if agreement_terms is None:
        agreement_terms = agreement_lexicon
    if disagreement_terms is None:
        disagreement_terms = disagreement_lexicon

    # Represent every entry as a tuple of words; sets also collapse duplicate
    # entries so a repeated lexicon term cannot be counted twice.
    agree_phrases = {tuple(term.lower().split()) for term in agreement_terms}
    disagree_phrases = {tuple(term.lower().split()) for term in disagreement_terms}
    # A blank entry would turn into the empty tuple; it must never match.
    agree_phrases.discard(())
    disagree_phrases.discard(())
    longest = max(map(len, agree_phrases | disagree_phrases), default=0)

    stripped = (word.strip(string.punctuation) for word in text.lower().split())
    tokens = [word for word in stripped if word]  # drop pure-punctuation tokens

    agreement_count = 0
    disagreement_count = 0
    position = 0
    while position < len(tokens):
        advance = 1
        # Try the longest candidate first so phrases take priority over the
        # single words they contain.
        for size in range(min(longest, len(tokens) - position), 0, -1):
            candidate = tuple(tokens[position:position + size])
            is_agreement = candidate in agree_phrases
            is_disagreement = candidate in disagree_phrases
            if is_agreement or is_disagreement:
                agreement_count += is_agreement
                disagreement_count += is_disagreement
                advance = size
                break
        position += advance
    return agreement_count, disagreement_count

# 5. Generate feature vector
def extract_features(text, label):
    """Build the feature row for one labelled text.

    Args:
        text: Raw input string.
        label: Label associated with ``text``; passed through unchanged.

    Returns:
        A list ``[text, label, unigrams, bigrams, trigrams, agreement_count,
        disagreement_count, modals, negations]``. Each n-gram column is a
        single string in which the tokens of one n-gram are joined by ``_``
        and the n-grams themselves are separated by spaces.
    """
    # Unigram, bigram and trigram columns, produced in that order.
    ngram_columns = [
        ' '.join('_'.join(gram) for gram in generate_ngrams(text, n=size))
        for size in (1, 2, 3)
    ]

    # Lexicon hit counts for agreement / disagreement vocabulary.
    agreement_count, disagreement_count = check_argument_lexicon(text)

    # spaCy-derived features: modal verbs and negation tokens.
    modals = extract_modal_verbs(text)
    negations = detect_negation(text)

    return [
        text,
        label,
        *ngram_columns,
        agreement_count,
        disagreement_count,
        modals,
        negations,
    ]

# Function to process compiled_output.csv and extract features
def process_and_extract_features(input_csv, output_csv):
    """Read a (text, label) CSV and write one feature row per record.

    The first row of ``input_csv`` is treated as a header and discarded.
    Blank lines are skipped. The whole input is read and validated before
    ``output_csv`` is opened, so a malformed input never leaves a partially
    written output and the two paths may refer to the same file.

    Args:
        input_csv: Path to a UTF-8 CSV whose data rows have exactly two
            columns: text and label.
        output_csv: Path of the UTF-8 CSV to create (overwritten if present).

    Raises:
        ValueError: If a non-blank data row does not have exactly two columns.
    """
    rows = []
    # newline='' lets the csv module handle line endings itself, which keeps
    # newlines embedded in quoted fields intact.
    with open(input_csv, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)
        next(reader, None)  # discard the header; an empty file yields no rows
        for row in reader:
            if not row:
                continue  # csv.reader returns [] for a blank line
            if len(row) != 2:
                raise ValueError(
                    '{}: line {}: expected 2 columns (text, label), got {}'.format(
                        input_csv, reader.line_num, len(row)
                    )
                )
            rows.append(row)

    with open(output_csv, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['Text', 'Label', 'Unigrams', 'Bigrams', 'Trigrams', 'Agreement_Count', 'Disagreement_Count', 'Modal_Verbs', 'Negations'])

        for text, label in rows:
            writer.writerow(extract_features(text, label))

# Example usage
# Source CSV of (text, label) rows, and the CSV that receives the feature rows.
input_csv, output_csv = 'compiled_output.csv', 'extracted_features.csv'

process_and_extract_features(input_csv=input_csv, output_csv=output_csv)


UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 1583: character maps to <undefined>