In [15]:
import pandas as pd

# Load the data
df = pd.read_csv("compiled_output.csv")

# Display the first few rows to check the structure
print(df.head())


                                                Text             Label
0  complicated 3D character models are widely use...  background_claim
1  The range of breathtaking realistic 3D models ...  background_claim
2         a production cannot afford major revisions  background_claim
3  providing a flexible and efficient solution to...         own_claim
4  Skeleton Subspace Deformation (SSD) is the pre...  background_claim


In [26]:
# Check for missing or empty text
df['Text'].isnull().sum()  # Check for NaNs
df[df['Text'].str.strip() == '']  # Check for empty or all-whitespace texts
# Remove rows where text is empty or null
df = df[df['Text'].str.strip() != '']
df = df.dropna(subset=['Text'])
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')  # Remove stop words


In [27]:
# Test tokenization of a sample text
sample_text = df['Text'].iloc[0]
print("Original Text:", sample_text)
print("Tokenized Text:", sample_text.split())


Original Text: complicated 3D character models are widely used in fields of entertainment, virtual reality, medicine etc
Tokenized Text: ['complicated', '3D', 'character', 'models', 'are', 'widely', 'used', 'in', 'fields', 'of', 'entertainment,', 'virtual', 'reality,', 'medicine', 'etc']


In [28]:
# Test extracting n-grams from a small set of text
sample_texts = df['Text'].head(5)  # First 5 rows for quick testing
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')
X_test = vectorizer.fit_transform(sample_texts)
print("Vocabulary:", vectorizer.get_feature_names_out())


Vocabulary: ['3d' '3d character' '3d character models' '3d models' '3d models limited'
 'afford' 'afford major' 'afford major revisions' 'animation'
 'animation remains' 'animation remains open' 'approach'
 'approach character' 'approach character skinning' 'artists'
 'artists resolution' 'artists resolution devices' 'breathtaking'
 'breathtaking realistic' 'breathtaking realistic 3d' 'character'
 'character models' 'character models widely' 'character skinning'
 'character skinning present' 'complicated' 'complicated 3d'
 'complicated 3d character' 'creativity' 'creativity artists'
 'creativity artists resolution' 'deformation' 'deformation ssd'
 'deformation ssd predominant' 'devices' 'efficient' 'efficient solution'
 'efficient solution animation' 'entertainment' 'entertainment virtual'
 'entertainment virtual reality' 'fields' 'fields entertainment'
 'fields entertainment virtual' 'flexible' 'flexible efficient'
 'flexible efficient solution' 'limited' 'limited creativity'
 'limite

In [29]:
import spacy
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize spaCy model
nlp = spacy.load("en_core_web_sm")

# Argument lexicons
agreement_lexicon = ['agree', 'yes', 'definitely', 'sure', 'absolutely', 'of course']
disagreement_lexicon = ['disagree', 'no', 'never', 'not', 'don’t', 'won’t']

# Hedge words
hedge_words = ['perhaps', 'maybe', 'possibly', 'could', 'might', 'probably']

# Modal verbs list
modal_verbs = ['can', 'could', 'may', 'might', 'shall', 'should', 'will', 'would']

# Function to extract n-grams
def extract_ngrams(text, ngram_range=(1, 3)):
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    ngrams = vectorizer.fit_transform([text])
    return vectorizer.get_feature_names_out()

# Extract Argument Lexicons (Agreement and Disagreement)
def extract_argument_lexicon_features(text):
    agreement_count = sum([word in text.lower() for word in agreement_lexicon])
    disagreement_count = sum([word in text.lower() for word in disagreement_lexicon])
    return agreement_count, disagreement_count

# Extract Hedge Features
def extract_hedge_features(text):
    hedge_count = sum([word in text.lower() for word in hedge_words])
    return hedge_count

# Extract Modal Verbs
def extract_modal_verbs(text):
    doc = nlp(text)
    modal_count = sum([token.lemma_ in modal_verbs for token in doc])
    return modal_count

# Detect Negation
def detect_negation(text):
    negation_patterns = [r'\b(not|no|never|don\'t|won\'t|isn\'t|aren\'t|can\'t)\b']
    negation_count = sum([bool(re.search(pattern, text.lower())) for pattern in negation_patterns])
    return negation_count

# Function to extract all features for each text
def extract_features(df):
    all_ngrams = []
    other_features = []
    
    for text in df['Text']:
        # Extract n-grams
        ngrams = extract_ngrams(text)
        ngram_features = list(ngrams)  # Get the ngram features
        
        # Extract argument lexicons (agreement, disagreement)
        agreement_count, disagreement_count = extract_argument_lexicon_features(text)

        # Extract hedge features
        hedge_count = extract_hedge_features(text)

        # Extract modal verbs
        modal_count = extract_modal_verbs(text)

        # Extract negation features
        negation_count = detect_negation(text)

        # Combine all the features into one list
        other_features.append([agreement_count, disagreement_count, hedge_count, modal_count, negation_count])

        # Store the ngram features as a separate part of the feature matrix
        all_ngrams.append(ngram_features)
    
    return all_ngrams, other_features


In [30]:
# Function to extract n-grams
def extract_ngrams(text, ngram_range=(1, 3)):
    if text.strip():  # Check if text is non-empty and not just whitespace
        vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english')
        ngrams = vectorizer.fit_transform([text])
        return vectorizer.get_feature_names_out()
    else:
        return []  # Return an empty list for empty text

# Extract n-grams and other features
def extract_features(df):
    all_ngrams = []
    other_features = []
    
    for text in df['Text']:
        # Extract n-grams
        ngrams = extract_ngrams(text)
        ngram_features = list(ngrams)  # Get the ngram features
        
        # Extract argument lexicons (agreement, disagreement)
        agreement_count, disagreement_count = extract_argument_lexicon_features(text)

        # Extract hedge features
        hedge_count = extract_hedge_features(text)

        # Extract modal verbs
        modal_count = extract_modal_verbs(text)

        # Extract negation features
        negation_count = detect_negation(text)

        # Combine all the features into one list
        other_features.append([agreement_count, disagreement_count, hedge_count, modal_count, negation_count])

        # Store the ngram features as a separate part of the feature matrix
        all_ngrams.append(ngram_features)
    
    return all_ngrams, other_features


In [31]:
# Function to extract n-grams
def extract_ngrams(text, ngram_range=(1, 3)):
    if text.strip():  # Check if text is non-empty and not just whitespace
        vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english')
        ngrams = vectorizer.fit_transform([text])
        return vectorizer.get_feature_names_out()
    else:
        return []  # Return an empty list for empty text

# Extract n-grams and other features
def extract_features(df):
    all_ngrams = []
    other_features = []
    
    for text in df['Text']:
        # Extract n-grams
        ngrams = extract_ngrams(text)
        ngram_features = list(ngrams)  # Get the ngram features
        
        # Extract argument lexicons (agreement, disagreement)
        agreement_count, disagreement_count = extract_argument_lexicon_features(text)

        # Extract hedge features
        hedge_count = extract_hedge_features(text)

        # Extract modal verbs
        modal_count = extract_modal_verbs(text)

        # Extract negation features
        negation_count = detect_negation(text)

        # Combine all the features into one list
        other_features.append([agreement_count, disagreement_count, hedge_count, modal_count, negation_count])

        # Store the ngram features as a separate part of the feature matrix
        all_ngrams.append(ngram_features)
    
    return all_ngrams, other_features


In [22]:
import pandas as pd
import spacy
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize spaCy model
nlp = spacy.load("en_core_web_sm")

# Argument lexicons
agreement_lexicon = ['agree', 'yes', 'definitely', 'sure', 'absolutely', 'of course']
disagreement_lexicon = ['disagree', 'no', 'never', 'not', 'don’t', 'won’t']

# Hedge words
hedge_words = ['perhaps', 'maybe', 'possibly', 'could', 'might', 'probably']

# Modal verbs list
modal_verbs = ['can', 'could', 'may', 'might', 'shall', 'should', 'will', 'would']

# Function to extract n-grams
def extract_ngrams(text, ngram_range=(1, 3)):
    if text.strip():  # Check if text is non-empty and not just whitespace
        vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english')
        ngrams = vectorizer.fit_transform([text])
        return vectorizer.get_feature_names_out()
    else:
        return []  # Return an empty list for empty text

# Extract Argument Lexicons (Agreement and Disagreement)
def extract_argument_lexicon_features(text):
    agreement_count = sum([word in text.lower() for word in agreement_lexicon])
    disagreement_count = sum([word in text.lower() for word in disagreement_lexicon])
    return agreement_count, disagreement_count

# Extract Hedge Features
def extract_hedge_features(text):
    hedge_count = sum([word in text.lower() for word in hedge_words])
    return hedge_count

# Extract Modal Verbs
def extract_modal_verbs(text):
    doc = nlp(text)
    modal_count = sum([token.lemma_ in modal_verbs for token in doc])
    return modal_count

# Detect Negation
def detect_negation(text):
    negation_patterns = [r'\b(not|no|never|don\'t|won\'t|isn\'t|aren\'t|can\'t)\b']
    negation_count = sum([bool(re.search(pattern, text.lower())) for pattern in negation_patterns])
    return negation_count

# Function to extract all features for each text
def extract_features(df):
    all_ngrams = []
    other_features = []
    
    for text in df['Text']:
        # Extract n-grams
        ngrams = extract_ngrams(text)
        ngram_features = list(ngrams)  # Get the ngram features
        
        # Extract argument lexicons (agreement, disagreement)
        agreement_count, disagreement_count = extract_argument_lexicon_features(text)

        # Extract hedge features
        hedge_count = extract_hedge_features(text)

        # Extract modal verbs
        modal_count = extract_modal_verbs(text)

        # Extract negation features
        negation_count = detect_negation(text)

        # Combine all the features into one list
        other_features.append([agreement_count, disagreement_count, hedge_count, modal_count, negation_count])

        # Store the ngram features as a separate part of the feature matrix
        all_ngrams.append(ngram_features)
    
    return all_ngrams, other_features

# Now, proceed with the previous steps to create feature matrix and train Logistic Regression


In [32]:

# Extract features and labels
#df = df[df['Text'].notna() & (df['Text'].str.strip() != '')]
ngram_features, additional_features = extract_features(df)

# Combine n-grams and additional features into a single feature matrix
# N-grams will be a list of list of n-grams, so we need to flatten this into a single feature vector for each entry
from sklearn.feature_extraction.text import CountVectorizer

# Combine n-grams into one string for each row to pass into vectorizer
ngram_texts = [' '.join(ngram) for ngram in ngram_features]

# Create the CountVectorizer for n-grams
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')
X_ngrams = vectorizer.fit_transform(ngram_texts)

# Convert additional features to a numpy array
X_additional = np.array(additional_features)

# Combine n-grams and additional features (horizontal stack)
from scipy.sparse import hstack
X = hstack([X_ngrams, X_additional])

# Prepare the target labels (Y)
y = df['Label'].values

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy * 100:.2f}%")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [13]:
print(df['Text'].isnull().sum())  # Check for any null values
print(df['Text'].apply(len).describe()) 

0
count    13592.000000
mean        59.892216
std         45.405939
min          1.000000
25%         22.000000
50%         53.000000
75%         86.000000
max        359.000000
Name: Text, dtype: float64


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import spacy
from scipy.sparse import hstack

# Load the Spacy model for modal verb detection
nlp = spacy.load("en_core_web_sm")

# Define function for extracting n-grams (unigrams, bigrams, trigrams)
def extract_ngrams(text, ngram_range=(1, 3)):
    # Ensure the text is not empty or consisting of just spaces
    if text.strip():
        # Initialize vectorizer without stopwords
        vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words=None)
        ngrams = vectorizer.fit_transform([text])
        
        # If ngrams contain valid features, return them, otherwise return empty
        if ngrams.shape[1] > 0:
            return vectorizer.get_feature_names_out()
        else:
            return []  # Return empty list if no n-grams are found
    else:
        return []  # Return empty list for empty or invalid text

# Example lexicons for argument extraction (these could be refined further)
agreement_lexicon = ['agree', 'agreed', 'agreement', 'yes', 'support']
disagreement_lexicon = ['disagree', 'disagreed', 'disagreement', 'no', 'oppose']

# Function to extract argument lexicons (agreement and disagreement)
def extract_argument_lexicons(text):
    agreement_count = sum([word in text.lower() for word in agreement_lexicon])
    disagreement_count = sum([word in text.lower() for word in disagreement_lexicon])
    return [agreement_count, disagreement_count]

# Function to detect modal verbs (e.g., "can", "could", "will", etc.)
def extract_modal_verbs(text):
    doc = nlp(text)
    modal_verbs = ['can', 'could', 'will', 'would', 'shall', 'should', 'may', 'might', 'must', 'ought']
    modal_count = sum([token.lemma_ in modal_verbs for token in doc])
    return [modal_count]

# Function to detect negation words (e.g., "not", "never", etc.)
def extract_negation(text):
    negation_words = ['not', 'never', 'no', 'none', 'nothing', 'neither', 'nor']
    negation_count = sum([word in text.lower() for word in negation_words])
    return [negation_count]

# Function to extract all features
def extract_features(df):
    ngram_features = []
    other_features = []

    # Loop through each text entry in the dataframe
    for text in df['Text']:
        # Extract n-grams (unigrams, bigrams, trigrams)
        ngrams = extract_ngrams(text)
        
        # Extract additional features
        argument_lexicons = extract_argument_lexicons(text)
        modal_verbs = extract_modal_verbs(text)
        negation = extract_negation(text)
        
        # Combine all features into a single list for this text
        ngram_features.append(ngrams)
        other_features.append(argument_lexicons + modal_verbs + negation)
    
    return ngram_features, other_features

# Read the data from the CSV file
df = pd.read_csv('compiled_output.csv')  # Replace with your file path

# Check the first few rows of the dataframe
print(df.head())

# Remove rows with NaN or empty Text after stripping extra spaces
df = df[df['Text'].notna() & (df['Text'].str.strip() != '')]
df['Text'] = df['Text'].str.strip()  # Remove leading/trailing spaces

# Remove rows with empty or very short texts (length < 3 words)
df['Text_Length'] = df['Text'].apply(lambda x: len(x.split()))
df = df[df['Text_Length'] > 2]

# Extract features and labels
ngram_features, additional_features = extract_features(df)

# Combine n-grams into one string for each row to pass into vectorizer
ngram_texts = [' '.join(ngram) for ngram in ngram_features]

# Create the CountVectorizer for n-grams
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words=None)
X_ngrams = vectorizer.fit_transform(ngram_texts)

# Convert additional features to a numpy array
X_additional = np.array(additional_features)

# Combine n-grams and additional features (horizontal stack)
X = hstack([X_ngrams, X_additional])

# Prepare the target labels (Y)
y = df['Label'].values

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy * 100:.2f}%")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


                                                Text             Label
0  complicated 3D character models are widely use...  background_claim
1  The range of breathtaking realistic 3D models ...  background_claim
2         a production cannot afford major revisions  background_claim
3  providing a flexible and efficient solution to...         own_claim
4  Skeleton Subspace Deformation (SSD) is the pre...  background_claim


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import spacy
from scipy.sparse import hstack

# Load the Spacy model for modal verb detection
nlp = spacy.load("en_core_web_sm")

# Define function for extracting n-grams (unigrams, bigrams, trigrams)
def extract_ngrams(text, ngram_range=(1, 3)):
    if text.strip():  # Ensure the text is not empty or just whitespace
        vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english')
        ngrams = vectorizer.fit_transform([text])
        
        # If ngrams contain valid features, return them, otherwise return empty
        if ngrams.shape[1] > 0:
            return vectorizer.get_feature_names_out()
        else:
            return []  # Return empty list if no n-grams are found
    else:
        return []  # Return empty list for empty or invalid text

# Example lexicons for argument extraction (these could be refined further)
agreement_lexicon = ['agree', 'agreed', 'agreement', 'yes', 'support']
disagreement_lexicon = ['disagree', 'disagreed', 'disagreement', 'no', 'oppose']

# Function to extract argument lexicons (agreement and disagreement)
def extract_argument_lexicons(text):
    agreement_count = sum([word in text.lower() for word in agreement_lexicon])
    disagreement_count = sum([word in text.lower() for word in disagreement_lexicon])
    return [agreement_count, disagreement_count]

# Function to detect modal verbs (e.g., "can", "could", "will", etc.)
def extract_modal_verbs(text):
    doc = nlp(text)
    modal_verbs = ['can', 'could', 'will', 'would', 'shall', 'should', 'may', 'might', 'must', 'ought']
    modal_count = sum([token.lemma_ in modal_verbs for token in doc])
    return [modal_count]

# Function to detect negation words (e.g., "not", "never", etc.)
def extract_negation(text):
    negation_words = ['not', 'never', 'no', 'none', 'nothing', 'neither', 'nor']
    negation_count = sum([word in text.lower() for word in negation_words])
    return [negation_count]

# Function to extract all features
def extract_features(df):
    ngram_features = []
    other_features = []

    # Loop through each text entry in the dataframe
    for text in df['Text']:
        # Extract n-grams (unigrams, bigrams, trigrams)
        ngrams = extract_ngrams(text)
        
        # Extract additional features
        argument_lexicons = extract_argument_lexicons(text)
        modal_verbs = extract_modal_verbs(text)
        negation = extract_negation(text)
        
        # Combine all features into a single list for this text
        ngram_features.append(ngrams)
        other_features.append(argument_lexicons + modal_verbs + negation)
    
    return ngram_features, other_features

# Read the data from the CSV file
df = pd.read_csv('compiled_output.csv')  # Replace with your file path

# Check the first few rows of the dataframe
print(df.head())

# Remove rows with NaN or empty Text after stripping extra spaces
df = df[df['Text'].notna() & (df['Text'].str.strip() != '')]
df['Text'] = df['Text'].str.strip()  # Remove leading/trailing spaces

# Remove rows with empty or very short texts (length < 3 words)
df['Text_Length'] = df['Text'].apply(lambda x: len(x.split()))
df = df[df['Text_Length'] > 2]

# Extract features and labels
ngram_features, additional_features = extract_features(df)

# Combine n-grams into one string for each row to pass into vectorizer
ngram_texts = [' '.join(ngram) for ngram in ngram_features]

# Create the CountVectorizer for n-grams
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')  # Added stop_words='english' here
X_ngrams = vectorizer.fit_transform(ngram_texts)

# Convert additional features to a numpy array
X_additional = np.array(additional_features)

# Combine n-grams and additional features (horizontal stack)
X = hstack([X_ngrams, X_additional])

# Prepare the target labels (Y)
y = df['Label'].values

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy * 100:.2f}%")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


                                                Text             Label
0  complicated 3D character models are widely use...  background_claim
1  The range of breathtaking realistic 3D models ...  background_claim
2         a production cannot afford major revisions  background_claim
3  providing a flexible and efficient solution to...         own_claim
4  Skeleton Subspace Deformation (SSD) is the pre...  background_claim


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import spacy
from scipy.sparse import hstack

# Load the Spacy model for modal verb detection
nlp = spacy.load("en_core_web_sm")

# Define function for extracting n-grams (unigrams, bigrams, trigrams)
def extract_ngrams(text, ngram_range=(1, 3)):
    if text.strip():  # Ensure the text is not empty or just whitespace
        vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english')
        ngrams = vectorizer.fit_transform([text])
        
        # If ngrams contain valid features, return them, otherwise return empty
        if ngrams.shape[1] > 0:
            return vectorizer.get_feature_names_out()
        else:
            return []  # Return empty list if no n-grams are found
    else:
        return []  # Return empty list for empty or invalid text

# Example lexicons for argument extraction (these could be refined further)
agreement_lexicon = ['agree', 'agreed', 'agreement', 'yes', 'support']
disagreement_lexicon = ['disagree', 'disagreed', 'disagreement', 'no', 'oppose']

# Function to extract argument lexicons (agreement and disagreement)
def extract_argument_lexicons(text):
    agreement_count = sum([word in text.lower() for word in agreement_lexicon])
    disagreement_count = sum([word in text.lower() for word in disagreement_lexicon])
    return [agreement_count, disagreement_count]

# Function to detect modal verbs (e.g., "can", "could", "will", etc.)
def extract_modal_verbs(text):
    doc = nlp(text)
    modal_verbs = ['can', 'could', 'will', 'would', 'shall', 'should', 'may', 'might', 'must', 'ought']
    modal_count = sum([token.lemma_ in modal_verbs for token in doc])
    return [modal_count]

# Function to detect negation words (e.g., "not", "never", etc.)
def extract_negation(text):
    negation_words = ['not', 'never', 'no', 'none', 'nothing', 'neither', 'nor']
    negation_count = sum([word in text.lower() for word in negation_words])
    return [negation_count]

# Function to extract all features
def extract_features(df):
    ngram_features = []
    other_features = []

    # Loop through each text entry in the dataframe
    for text in df['Text']:
        # Extract n-grams (unigrams, bigrams, trigrams)
        ngrams = extract_ngrams(text)
        
        # Extract additional features
        argument_lexicons = extract_argument_lexicons(text)
        modal_verbs = extract_modal_verbs(text)
        negation = extract_negation(text)
        
        # Combine all features into a single list for this text
        ngram_features.append(ngrams)
        other_features.append(argument_lexicons + modal_verbs + negation)
    
    return ngram_features, other_features

# Read the data from the CSV file
df = pd.read_csv('compiled_output.csv')  # Replace with your file path

# Check the first few rows of the dataframe
print(df.head())

# Remove rows with NaN or empty Text after stripping extra spaces
df = df[df['Text'].notna() & (df['Text'].str.strip() != '')]
df['Text'] = df['Text'].str.strip()  # Remove leading/trailing spaces

# Remove rows with empty or very short texts (length < 3 words)
df['Text_Length'] = df['Text'].apply(lambda x: len(x.split()))
df = df[df['Text_Length'] > 2]

# Extract features and labels
ngram_features, additional_features = extract_features(df)

# Combine n-grams into one string for each row to pass into vectorizer
ngram_texts = [' '.join(ngram) for ngram in ngram_features]

# Create the CountVectorizer for n-grams
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')  # Added stop_words='english' here
X_ngrams = vectorizer.fit_transform(ngram_texts)

# Convert additional features to a numpy array
X_additional = np.array(additional_features)

# Combine n-grams and additional features (horizontal stack)
X = hstack([X_ngrams, X_additional])

# Prepare the target labels (Y)
y = df['Label'].values

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy * 100:.2f}%")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


                                                Text             Label
0  complicated 3D character models are widely use...  background_claim
1  The range of breathtaking realistic 3D models ...  background_claim
2         a production cannot afford major revisions  background_claim
3  providing a flexible and efficient solution to...         own_claim
4  Skeleton Subspace Deformation (SSD) is the pre...  background_claim


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import spacy
from scipy.sparse import hstack

# Load the Spacy model for modal verb detection
nlp = spacy.load("en_core_web_sm")

# Define function for extracting n-grams (unigrams, bigrams, trigrams)
def extract_ngrams(text, ngram_range=(1, 3)):
    # Check if the text is valid (non-empty after removing spaces)
    if text.strip():  
        vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english')
        ngrams = vectorizer.fit_transform([text])
        
        # If no valid n-grams are found, return empty list
        if ngrams.shape[1] > 0:
            return vectorizer.get_feature_names_out()
        else:
            return ['<empty_ngram>']  # Return a placeholder for empty ngrams
    else:
        return ['<empty_ngram>']  # Return a placeholder for empty ngrams

# Example lexicons for argument extraction (these could be refined further)
agreement_lexicon = ['agree', 'agreed', 'agreement', 'yes', 'support']
disagreement_lexicon = ['disagree', 'disagreed', 'disagreement', 'no', 'oppose']

# Function to extract argument lexicons (agreement and disagreement)
def extract_argument_lexicons(text):
    agreement_count = sum([word in text.lower() for word in agreement_lexicon])
    disagreement_count = sum([word in text.lower() for word in disagreement_lexicon])
    return [agreement_count, disagreement_count]

# Function to detect modal verbs (e.g., "can", "could", "will", etc.)
def extract_modal_verbs(text):
    doc = nlp(text)
    modal_verbs = ['can', 'could', 'will', 'would', 'shall', 'should', 'may', 'might', 'must', 'ought']
    modal_count = sum([token.lemma_ in modal_verbs for token in doc])
    return [modal_count]

# Function to detect negation words (e.g., "not", "never", etc.)
def extract_negation(text):
    negation_words = ['not', 'never', 'no', 'none', 'nothing', 'neither', 'nor']
    negation_count = sum([word in text.lower() for word in negation_words])
    return [negation_count]

# Function to extract all features
def extract_features(df):
    ngram_features = []
    other_features = []

    # Loop through each text entry in the dataframe
    for text in df['Text']:
        # Extract n-grams (unigrams, bigrams, trigrams)
        ngrams = extract_ngrams(text)
        
        # Extract additional features
        argument_lexicons = extract_argument_lexicons(text)
        modal_verbs = extract_modal_verbs(text)
        negation = extract_negation(text)
        
        # Combine all features into a single list for this text
        ngram_features.append(ngrams)
        other_features.append(argument_lexicons + modal_verbs + negation)
    
    return ngram_features, other_features

# Read the data from the CSV file
df = pd.read_csv('compiled_output.csv')  # Replace with your file path

# Check the first few rows of the dataframe
print(df.head())
import pandas as pd

# Read the data
df = pd.read_csv('compiled_output.csv')  # Replace with your file path

# Check the first few rows of the dataframe
print(df.head())

# Remove rows where the 'Text' column contains only numbers or lists (e.g., [2])
df = df[~df['Text'].str.match(r'^\[.*\]$')]  # Regex to match rows like '[2]', '[text]', etc.

# Alternatively, if you want to remove rows where 'Text' only contains digits or lists of digits
df = df[~df['Text'].str.match(r'^\[\d+\]$')]  # Regex to match rows like '[2]', '[3]', etc.

# Verify the result
print(df.head())

# Now you can continue with your feature extraction or other operations


# Remove rows with NaN or empty Text after stripping extra spaces
df = df[df['Text'].notna() & (df['Text'].str.strip() != '')]
df['Text'] = df['Text'].str.strip()  # Remove leading/trailing spaces

# Remove rows with empty or very short texts (length < 3 words)
df['Text_Length'] = df['Text'].apply(lambda x: len(x.split()))
df = df[df['Text_Length'] > 2]

# Extract features and labels
ngram_features, additional_features = extract_features(df)

# Combine n-grams into one string for each row to pass into vectorizer
ngram_texts = [' '.join(ngram) for ngram in ngram_features]

# Create the CountVectorizer for n-grams
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')  # Added stop_words='english' here
X_ngrams = vectorizer.fit_transform(ngram_texts)

# Convert additional features to a numpy array
X_additional = np.array(additional_features)

# Combine n-grams and additional features (horizontal stack)
X = hstack([X_ngrams, X_additional])

# Prepare the target labels (Y)
y = df['Label'].values

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy * 100:.2f}%")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\PC\AppData\Roaming\Python\Python312\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\PC\AppData\Roaming\Python\Python312\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\PC\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start(

                                                Text             Label
0  complicated 3D character models are widely use...  background_claim
1  The range of breathtaking realistic 3D models ...  background_claim
2         a production cannot afford major revisions  background_claim
3  providing a flexible and efficient solution to...         own_claim
4  Skeleton Subspace Deformation (SSD) is the pre...  background_claim
                                                Text             Label
0  complicated 3D character models are widely use...  background_claim
1  The range of breathtaking realistic 3D models ...  background_claim
2         a production cannot afford major revisions  background_claim
3  providing a flexible and efficient solution to...         own_claim
4  Skeleton Subspace Deformation (SSD) is the pre...  background_claim
                                                Text             Label
0  complicated 3D character models are widely use...  background_claim
1  The

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [2]:
def extract_ngrams(text, ngram_range=(1, 3)):
    # Only proceed if the text is not empty after stripping
    if text.strip():
        # Initialize vectorizer with stop_words removed
        vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english')
        ngrams = vectorizer.fit_transform([text])
        
        # Check if any valid ngrams were extracted
        if ngrams.shape[1] > 0:
            return vectorizer.get_feature_names_out()
        else:
            return []  # Return empty if no valid ngrams were found
    return []  # Return empty if the text is invalid or empty

# Function to extract features from the dataframe
def extract_features(df):
    ngram_features = []
    additional_features = []

    # Loop through each text entry in the dataframe
    for text in df['Text']:
        # Extract n-grams (unigrams, bigrams, trigrams)
        ngrams = extract_ngrams(text)
        ngram_features.append(ngrams)

        # Here you can add your other feature extraction methods
        # For now, just add a placeholder for additional features
        additional_features.append([])  # Replace with actual feature extraction logic

    return ngram_features, additional_features

# Extract features and labels
ngram_features, additional_features = extract_features(df)

# Combine n-grams into one string for each row to pass into vectorizer
ngram_texts = [' '.join(ngram) for ngram in ngram_features]

# Create the CountVectorizer for n-grams
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')
X_ngrams = vectorizer.fit_transform(ngram_texts)

# Convert additional features to a numpy array
import numpy as np
X_additional = np.array(additional_features)

# Combine n-grams and additional features (horizontal stack)
from scipy.sparse import hstack
X = hstack([X_ngrams, X_additional])

# Prepare the target labels (Y)
y = df['Label'].values

# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Logistic Regression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy * 100:.2f}%")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Read the data
df = pd.read_csv('compiled_output.csv')  # Replace with your file path

# Step 1: Remove rows where 'Text' is empty, whitespace, or non-informative
df = df[df['Text'].str.strip().notna()]  # Remove rows with empty strings
df = df[df['Text'].str.strip() != '']  # Remove rows with only whitespace

# Print the first few rows to check
print("Data after cleaning:")
print(df.head())

# Step 2: Check for rows with non-informative content like `[2]`
# This will help you filter out any non-textual or irrelevant rows (e.g., '[2]')
df = df[~df['Text'].str.contains(r'\[.*\]')]  # Remove rows with square brackets (e.g., '[2]')
print("\nData after removing non-informative content:")
print(df.head())

# Step 3: Filter out rows where the text is too short to contain meaningful n-grams
df['Text_Length'] = df['Text'].apply(lambda x: len(x.split()))  # Length in words
df = df[df['Text_Length'] > 2]  # Only keep rows with more than 2 words

# Check if there are still any rows left after filtering
print(f"\nRemaining rows after length filter: {len(df)}")
print(df.head())

# Step 4: Function to extract n-grams safely
def extract_ngrams(text, ngram_range=(1, 3)):
    # Only proceed if the text is not empty after stripping
    if text.strip():
        # Initialize vectorizer with stop_words removed
        vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english')
        ngrams = vectorizer.fit_transform([text])
        
        # Check if any valid ngrams were extracted
        if ngrams.shape[1] > 0:
            return vectorizer.get_feature_names_out()
        else:
            return []  # Return empty if no valid ngrams were found
    return []  # Return empty if the text is invalid or empty

# Function to extract features from the dataframe
def extract_features(df):
    ngram_features = []
    additional_features = []

    # Loop through each text entry in the dataframe
    for text in df['Text']:
        # Extract n-grams (unigrams, bigrams, trigrams)
        ngrams = extract_ngrams(text)
        ngram_features.append(ngrams)

        # Here you can add your other feature extraction methods
        # For now, just add a placeholder for additional features
        additional_features.append([])  # Replace with actual feature extraction logic

    return ngram_features, additional_features

# Extract features and labels
ngram_features, additional_features = extract_features(df)

# Combine n-grams into one string for each row to pass into vectorizer
ngram_texts = [' '.join(ngram) for ngram in ngram_features]

# Create the CountVectorizer for n-grams
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')
X_ngrams = vectorizer.fit_transform(ngram_texts)

# Convert additional features to a numpy array
X_additional = np.array(additional_features)

# Combine n-grams and additional features (horizontal stack)
from scipy.sparse import hstack
X = hstack([X_ngrams, X_additional])

# Prepare the target labels (Y)
y = df['Label'].values

# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Logistic Regression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy * 100:.2f}%")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Data after cleaning:
                                                Text             Label
0  complicated 3D character models are widely use...  background_claim
1  The range of breathtaking realistic 3D models ...  background_claim
2         a production cannot afford major revisions  background_claim
3  providing a flexible and efficient solution to...         own_claim
4  Skeleton Subspace Deformation (SSD) is the pre...  background_claim

Data after removing non-informative content:
                                                Text             Label
0  complicated 3D character models are widely use...  background_claim
1  The range of breathtaking realistic 3D models ...  background_claim
2         a production cannot afford major revisions  background_claim
3  providing a flexible and efficient solution to...         own_claim
4  Skeleton Subspace Deformation (SSD) is the pre...  background_claim

Remaining rows after length filter: 11288
                                      

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import spacy

# Load SpaCy's English model for modal verb and negation detection
nlp = spacy.load("en_core_web_sm")

# Define a list of agreement and disagreement words
agreement_lexicon = ['agree', 'agreement', 'consistent', 'consistent with', 'support', 'supports', 'endorses']
disagreement_lexicon = ['disagree', 'disagreement', 'opposes', 'opposed', 'against', 'contradicts']

# Define a list of hedge words
hedge_lexicon = ['perhaps', 'maybe', 'likely', 'uncertain', 'possibly', 'could', 'should']

# Define a list of negation words
negation_lexicon = ['not', 'never', 'no', 'none', 'nothing', 'neither']

# Read the data
df = pd.read_csv('compiled_output.csv')  # Replace with your file path

# Step 1: Remove rows with empty or irrelevant content
df = df[df['Text'].str.strip().notna()]  # Remove rows with empty strings
df = df[df['Text'].str.strip() != '']  # Remove rows with only whitespace
df = df[~df['Text'].str.contains(r'\[.*\]')]  # Remove rows with non-textual content like '[2]'

# Step 2: Check if there are still any rows left after filtering
print(f"Remaining rows after filtering: {len(df)}")
print(df.head())

# Step 3: Feature Extraction
def extract_features(df):
    agreement_features = []
    disagreement_features = []
    hedge_features = []
    negation_features = []
    modal_verbs_features = []

    for text in df['Text']:
        # Tokenize the text using spaCy for modal verbs and negation
        doc = nlp(text)

        # Feature 1: Check for agreement and disagreement lexicons
        agreement_score = sum(1 for word in text.lower().split() if word in agreement_lexicon)
        disagreement_score = sum(1 for word in text.lower().split() if word in disagreement_lexicon)

        # Feature 2: Check for hedge words
        hedge_score = sum(1 for word in text.lower().split() if word in hedge_lexicon)

        # Feature 3: Check for negation words
        negation_score = sum(1 for word in text.lower().split() if word in negation_lexicon)

        # Feature 4: Check for modal verbs using spaCy's part-of-speech tagging
        modal_verbs_score = sum(1 for token in doc if token.pos_ == 'VERB' and token.morph.get('Mood') == ['Ind'] and token.dep_ == 'aux')

        # Append the features to respective lists
        agreement_features.append(agreement_score)
        disagreement_features.append(disagreement_score)
        hedge_features.append(hedge_score)
        negation_features.append(negation_score)
        modal_verbs_features.append(modal_verbs_score)

    # Combine all features into a single matrix
    features = np.array([agreement_features, disagreement_features, hedge_features, negation_features, modal_verbs_features]).T
    return features

# Step 4: Extract the features
X = extract_features(df)

# Prepare the target labels (Y)
y = df['Label'].values

# Step 5: Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 6: Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 7: Predict the labels on the test set
y_pred = model.predict(X_test)

# Step 8: Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy * 100:.2f}%")

# Step 9: Print the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Remaining rows after filtering: 13454
                                                Text             Label
0  complicated 3D character models are widely use...  background_claim
1  The range of breathtaking realistic 3D models ...  background_claim
2         a production cannot afford major revisions  background_claim
3  providing a flexible and efficient solution to...         own_claim
4  Skeleton Subspace Deformation (SSD) is the pre...  background_claim
Logistic Regression Accuracy: 42.93%

Classification Report:
                  precision    recall  f1-score   support

background_claim       0.00      0.00      0.00      1039
            data       0.00      0.00      0.00      1265
       own_claim       0.43      1.00      0.60      1733

        accuracy                           0.43      4037
       macro avg       0.14      0.33      0.20      4037
    weighted avg       0.18      0.43      0.26      4037



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
