# Logistic Regression With Argument Level Features

### First I am going to do data analysis also called EDA

In [15]:
import pandas as pd

# Load the data
df = pd.read_csv("compiled_output.csv")

# Display the first few rows to check the structure
print(df.head())


                                                Text             Label
0  complicated 3D character models are widely use...  background_claim
1  The range of breathtaking realistic 3D models ...  background_claim
2         a production cannot afford major revisions  background_claim
3  providing a flexible and efficient solution to...         own_claim
4  Skeleton Subspace Deformation (SSD) is the pre...  background_claim


###  Preprocessing 

In [26]:
# Check for missing or empty text
df['Text'].isnull().sum()  # Check for NaNs
df[df['Text'].str.strip() == '']  # Check for empty or all-whitespace texts
# Remove rows where text is empty or null
df = df[df['Text'].str.strip() != '']
df = df.dropna(subset=['Text'])
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')  # Remove stop words


In [27]:
# Test tokenization of a sample text
sample_text = df['Text'].iloc[0]
print("Original Text:", sample_text)
print("Tokenized Text:", sample_text.split())


Original Text: complicated 3D character models are widely used in fields of entertainment, virtual reality, medicine etc
Tokenized Text: ['complicated', '3D', 'character', 'models', 'are', 'widely', 'used', 'in', 'fields', 'of', 'entertainment,', 'virtual', 'reality,', 'medicine', 'etc']


In [28]:
# Test extracting n-grams from a small set of text
sample_texts = df['Text'].head(5)  # First 5 rows for quick testing
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')
X_test = vectorizer.fit_transform(sample_texts)
print("Vocabulary:", vectorizer.get_feature_names_out())


Vocabulary: ['3d' '3d character' '3d character models' '3d models' '3d models limited'
 'afford' 'afford major' 'afford major revisions' 'animation'
 'animation remains' 'animation remains open' 'approach'
 'approach character' 'approach character skinning' 'artists'
 'artists resolution' 'artists resolution devices' 'breathtaking'
 'breathtaking realistic' 'breathtaking realistic 3d' 'character'
 'character models' 'character models widely' 'character skinning'
 'character skinning present' 'complicated' 'complicated 3d'
 'complicated 3d character' 'creativity' 'creativity artists'
 'creativity artists resolution' 'deformation' 'deformation ssd'
 'deformation ssd predominant' 'devices' 'efficient' 'efficient solution'
 'efficient solution animation' 'entertainment' 'entertainment virtual'
 'entertainment virtual reality' 'fields' 'fields entertainment'
 'fields entertainment virtual' 'flexible' 'flexible efficient'
 'flexible efficient solution' 'limited' 'limited creativity'
 'limite

### Extracting the required features

In [None]:
import spacy
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize spaCy model
nlp = spacy.load("en_core_web_sm")

# Argument lexicons
agreement_lexicon = ['agree', 'yes', 'definitely', 'sure', 'absolutely', 'of course', , 'agreement', 'consistent', 'consistent with', 'support', 'supports', 'endorses']
disagreement_lexicon = ['disagree', 'no', 'never', 'not', 'don’t', 'won’t', 'disagreement', 'opposes', 'opposed', 'against', 'contradicts]


# Hedge words
hedge_words = ['perhaps', 'maybe', 'possibly', 'could', 'might', 'probably', 'likely', 'uncertain', 'should']

# Modal verbs list
modal_verbs = ['can', 'could', 'may', 'might', 'shall', 'should', 'will', 'would']

# Define a list of negation words
negation_lexicon = ['not', 'never', 'no', 'none', 'nothing', 'neither']
# Function to extract n-grams
def extract_ngrams(text, ngram_range=(1, 3)):
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    ngrams = vectorizer.fit_transform([text])
    return vectorizer.get_feature_names_out()

# Extract Argument Lexicons (Agreement and Disagreement)
def extract_argument_lexicon_features(text):
    agreement_count = sum([word in text.lower() for word in agreement_lexicon])
    disagreement_count = sum([word in text.lower() for word in disagreement_lexicon])
    return agreement_count, disagreement_count

# Extract Hedge Features
def extract_hedge_features(text):
    hedge_count = sum([word in text.lower() for word in hedge_words])
    return hedge_count

# Extract Modal Verbs
def extract_modal_verbs(text):
    doc = nlp(text)
    modal_count = sum([token.lemma_ in modal_verbs for token in doc])
    return modal_count

# Detect Negation
def detect_negation(text):
    negation_patterns = [r'\b(not|no|never|don\'t|won\'t|isn\'t|aren\'t|can\'t)\b']
    negation_count = sum([bool(re.search(pattern, text.lower())) for pattern in negation_patterns])
    return negation_count

# Function to extract all features for each text
def extract_features(df):
    all_ngrams = []
    other_features = []
    
    for text in df['Text']:
        # Extract n-grams
        ngrams = extract_ngrams(text)
        ngram_features = list(ngrams)  # Get the ngram features
        
        # Extract argument lexicons (agreement, disagreement)
        agreement_count, disagreement_count = extract_argument_lexicon_features(text)

        # Extract hedge features
        hedge_count = extract_hedge_features(text)

        # Extract modal verbs
        modal_count = extract_modal_verbs(text)

        # Extract negation features
        negation_count = detect_negation(text)

        # Combine all the features into one list
        other_features.append([agreement_count, disagreement_count, hedge_count, modal_count, negation_count])

        # Store the ngram features as a separate part of the feature matrix
        all_ngrams.append(ngram_features)
    
    return all_ngrams, other_features


In [13]:
print(df['Text'].isnull().sum())  # Check for any null values
print(df['Text'].apply(len).describe()) 

0
count    13592.000000
mean        59.892216
std         45.405939
min          1.000000
25%         22.000000
50%         53.000000
75%         86.000000
max        359.000000
Name: Text, dtype: float64


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import spacy


# Extract the features
X = extract_features(df)

# Preparing the target labels (Y)
y = df['Label'].values

# Spliting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Training a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predicting the labels on the test set
y_pred = model.predict(X_test)

# Evaluating the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy * 100:.2f}%")

# Printing the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Remaining rows after filtering: 13454
                                                Text             Label
0  complicated 3D character models are widely use...  background_claim
1  The range of breathtaking realistic 3D models ...  background_claim
2         a production cannot afford major revisions  background_claim
3  providing a flexible and efficient solution to...         own_claim
4  Skeleton Subspace Deformation (SSD) is the pre...  background_claim
Logistic Regression Accuracy: 42.93%

Classification Report:
                  precision    recall  f1-score   support

background_claim       0.00      0.00      0.00      1039
            data       0.00      0.00      0.00      1265
       own_claim       0.43      1.00      0.60      1733

        accuracy                           0.43      4037
       macro avg       0.14      0.33      0.20      4037
    weighted avg       0.18      0.43      0.26      4037



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
