In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Load the extracted features from CSV
data = pd.read_csv('extracted_features.csv')  # Replace with the path to your CSV file

# Step 2: Preprocess the data
# Separate the features (X) and the labels (y)
X = data[['Unigrams', 'Bigrams', 'Trigrams', 'Modal_Verbs', 'Negations']]  # Feature columns
y = data['Label']  # Label column

# Combine all text-based features into a single feature column
X_combined = X.apply(lambda row: ' '.join(row), axis=1)  # Combine unigrams, bigrams, etc.

# Step 3: Vectorize the combined feature column using CountVectorizer (or TF-IDF Vectorizer)
vectorizer = CountVectorizer()
X_vec = vectorizer.fit_transform(X_combined)

# Step 4: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# Step 5: Train the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 6: Make predictions on the test set
y_pred = model.predict(X_test)

# Step 7: Evaluate the model
print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')
print('\nClassification Report:')
print(classification_report(y_test, y_pred))
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))


TypeError: sequence item 3: expected str instance, float found