1. Data Preparation

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load data from files
with open('data/positive-reviews.txt', 'r', encoding='utf-8') as pos_file:
    positive_reviews = pos_file.readlines()

with open('data/negative-reviews.txt', 'r', encoding='utf-8') as neg_file:
    negative_reviews = neg_file.readlines()

# Split data into training (80%) and testing (20%) sets
train_pos, test_pos = train_test_split(positive_reviews, test_size=0.2, random_state=42)
train_neg, test_neg = train_test_split(negative_reviews, test_size=0.2, random_state=42)

# Combine positive and negative reviews for training and testing
def create_dataset(positive, negative):
    data = pd.DataFrame({
        'review': positive + negative,
        'label': [1] * len(positive) + [0] * len(negative)  # 1 for positive, 0 for negative
    })
    return data

train_data = create_dataset(train_pos, train_neg)
test_data = create_dataset(test_pos, test_neg)

# Save data to CSV files for later use (optional)
train_data.to_csv('data/train_data.csv', index=False)
test_data.to_csv('data/test_data.csv', index=False)

print("Training and testing datasets created successfully.")


Training and testing datasets created successfully.


2. Data Processing

In [2]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download required resources
nltk.download('stopwords')
nltk.download('wordnet')

# Set stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_review(review):
    words = re.findall(r'\w+', review.lower())  # Tokenize and lowercase
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return words

def preprocess_data(data):
    return [preprocess_review(review) for review in data]



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maste\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maste\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Step 3: Feature Extraction

In [3]:
import numpy as np
import re
import pandas as pd
from math import log
import chardet

# Detect encoding for the positive words file
with open('data/positive-words.txt', 'rb') as f:
    pos_encoding = chardet.detect(f.read())['encoding']

with open('data/negative-words.txt', 'rb') as f:
    neg_encoding = chardet.detect(f.read())['encoding']

# Load positive and negative words using the detected encoding
positive_words = set(open('data/positive-words.txt', encoding=pos_encoding).read().splitlines())
negative_words = set(open('data/negative-words.txt', encoding=neg_encoding).read().splitlines())

# Function to extract features from reviews
def extract_features(review):
    words = re.findall(r'\w+', review.lower())
    pos_count = sum(word in positive_words for word in words)
    neg_count = sum(word in negative_words for word in words)
    contains_no = int('no' in words)
    pronoun_count = sum(word in {'i', 'me', 'my', 'you', 'your'} for word in words)
    contains_exclamation = int('!' in review)
    review_length_log = log(len(words) + 1)
    return [pos_count, neg_count, contains_no, pronoun_count, contains_exclamation, round(review_length_log, 3)]

# Load train and test datasets
train_data = pd.read_csv('data/train_data.csv')  # Replace with actual file path
test_data = pd.read_csv('data/test_data.csv')    # Replace with actual file path

# Ensure datasets have the required columns
assert 'review' in train_data.columns and 'label' in train_data.columns, "Train data must have 'review' and 'label' columns."
assert 'review' in test_data.columns and 'label' in test_data.columns, "Test data must have 'review' and 'label' columns."

# Extract features for training and testing
train_features = np.array([extract_features(r) for r in train_data['review']])
test_features = np.array([extract_features(r) for r in test_data['review']])

# Get labels for training and testing
train_labels = np.array(train_data['label'])
test_labels = np.array(test_data['label'])

# Optional: Convert to DataFrame for better readability
columns = ['Positive Words', 'Negative Words', 'Contains "No"', 'Pronoun Count', 'Contains "!"', 'Log of Review Length']
train_df = pd.DataFrame(train_features, columns=columns, index=[f"Review {i}" for i in range(len(train_features))])
test_df = pd.DataFrame(test_features, columns=columns, index=[f"Review {i}" for i in range(len(test_features))])

# Print Train and Test DataFrames for inspection
print("\nTrain Features:\n")
print(train_df.to_markdown(index=True))

print("\nTest Features:\n")
print(test_df.to_markdown(index=True))

# Save the features and labels to file for future use (optional)
np.save('data/train/train_features.npy', train_features)
np.save('data/train/train_labels.npy', train_labels)
np.save('data/train/test_features.npy', test_features)
np.save('data/train/test_labels.npy', test_labels)



Train Features:

|              |   Positive Words |   Negative Words |   Contains "No" |   Pronoun Count |   Contains "!" |   Log of Review Length |
|:-------------|-----------------:|-----------------:|----------------:|----------------:|---------------:|-----------------------:|
| Review 0     |                0 |                0 |               0 |               0 |              0 |                  0.693 |
| Review 1     |                2 |                0 |               0 |               0 |              0 |                  1.792 |
| Review 2     |                2 |                0 |               0 |               0 |              0 |                  2.303 |
| Review 3     |                0 |                0 |               0 |               0 |              0 |                  1.609 |
| Review 4     |                0 |                0 |               0 |               0 |              0 |                  1.099 |
| Review 5     |                3 |                

4. Train Model and Evaluate

In [9]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the saved features and labels
train_features = np.load('data/train/train_features.npy')
train_labels = np.load('data/train/train_labels.npy')
test_features = np.load('data/train/test_features.npy')
test_labels = np.load('data/train/test_labels.npy')

# Function to train and evaluate a model
def train_and_evaluate(model, train_features, train_labels, test_features, test_labels):
    # Train the model
    model.fit(train_features, train_labels)
    
    # Predict the test data
    predictions = model.predict(test_features)
    
    # Calculate accuracy
    accuracy = accuracy_score(test_labels, predictions)
    
    return accuracy

def main(train_features, train_labels, test_features, test_labels):
    # Initialize models
    models = {
        "Logistic Regression": LogisticRegression(),
        "Random Forest": RandomForestClassifier(),
        "SVM": SVC()
    }
    
    # Evaluate each model and print accuracy
    print("\nModel Accuracies:")
    for model_name, model in models.items():
        accuracy = train_and_evaluate(model, train_features, train_labels, test_features, test_labels)
        print(f"{model_name} Accuracy: {accuracy}")

if __name__ == "__main__":
    # Call main function with the data
    main(train_features, train_labels, test_features, test_labels)



Model Accuracies:
Logistic Regression Accuracy: 0.82025
Random Forest Accuracy: 0.82025
SVM Accuracy: 0.821


5. Report Result

In [None]:
# Initialize models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC()
}

# Collect model names and accuracies
results = {'Model': [], 'Accuracy': []}

# Evaluate each model
for model_name, model in models.items():
    accuracy = train_and_evaluate(model, train_features, train_labels, test_features, test_labels)
    results['Model'].append(model_name)
    results['Accuracy'].append(accuracy)

# Create DataFrame from results
results_df = pd.DataFrame(results)

# Print the results DataFrame
print(results_df)

                 Model  Accuracy
0  Logistic Regression  0.820250
1        Random Forest  0.820625
2                  SVM  0.821000
