In [1]:
import os
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
import nltk
from textblob import TextBlob
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize text processors
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from nltk.stem import WordNetLemmatizer

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    words = word_tokenize(text)

     # Correct spellings
    words = [str(TextBlob(word).correct()) for word in words]
    
    # Remove stopwords and non-alphabetic tokens
    words = [word for word in words if word not in stop_words and word.isalpha()]
    
    # Apply stemming
    # words = [stemmer.stem(word) for word in words]

        # Apply lemmatization (instead of stemming)
    words = [lemmatizer.lemmatize(word, pos='v') for word in words]  # 'v' for verbs
    words = [lemmatizer.lemmatize(word, pos='n') for word in words]  # 'n' for nouns
    
    return ' '.join(words)

# Load your data
def load_data(folder_path, label):
    data = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            try:
                with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                    text = file.read()
                    data.append((text, label))
            except UnicodeDecodeError:
                try:
                    with open(os.path.join(folder_path, filename), 'r', encoding='latin-1') as file:
                        text = file.read()
                        data.append((text, label))
                except:
                    continue
    return data

# Main 
print("Loading data...")
base_path = "review_polarity/txt_sentoken"
neg_data = load_data(os.path.join(base_path, "neg"), "negative")
pos_data = load_data(os.path.join(base_path, "pos"), "positive")
all_data = neg_data + pos_data
texts = [item[0] for item in all_data]
labels = [item[1] for item in all_data]

print(f"Loaded {len(texts)} documents (positive: {len(pos_data)}, negative: {len(neg_data)})")

# Preprocess with progress bar
print("Preprocessing texts with stemming...")
processed_texts = [preprocess_text(text) for text in tqdm(texts)]

# Create TF-IDF vectors
print("Creating TF-IDF vectors...")
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),  # Include bigrams
    min_df=5,            # Ignore very rare words
    max_df=0.7           # Ignore overly common words
)
tfidf_vectors = tfidf_vectorizer.fit_transform(processed_texts)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_vectors, labels, test_size=0.2, random_state=42
)

# Train model
print("Training model...")
model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    solver='liblinear'
)
model.fit(X_train, y_train)

# Evaluate
print("Evaluating model...")
y_pred = model.predict(X_test)
print("\nModel Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Loading data...
Loaded 2000 documents (positive: 1000, negative: 1000)
Preprocessing texts with stemming...


100%|██████████| 2000/2000 [3:14:16<00:00,  5.83s/it]  


Creating TF-IDF vectors...
Training model...
Evaluating model...

Model Evaluation:
Accuracy: 0.8225

Classification Report:
              precision    recall  f1-score   support

    negative       0.83      0.80      0.82       199
    positive       0.81      0.84      0.83       201

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



In [2]:
from joblib import dump

# Save TF-IDF vectors (sparse matrix)
dump(tfidf_vectors, 'tfidf_vectors_lemm.joblib')

# Save the vectorizer
dump(tfidf_vectorizer, 'tfidf_vectorizer_lemm.joblib')

# Save labels
dump(labels, 'labels_lemm.joblib')

['labels_lemm.joblib']

In [3]:
from joblib import load

# Or using joblib
tfidf_vectors = load('tfidf_vectors_lemm.joblib')
tfidf_vectorizer = load('tfidf_vectorizer_lemm.joblib')
labels = load('labels_lemm.joblib')

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_vectors, labels, test_size=0.2, random_state=42
)


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

#Logistic Regression
print("Training Logistic Regression model...")
logreg = LogisticRegression(
max_iter=1000,
class_weight='balanced',
solver='liblinear'
)
logreg.fit(X_train, y_train)

#Evaluate
print("Evaluating Logistic Regression...")
y_pred_logreg = logreg.predict(X_test)
print("\nLogistic Regression Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_logreg))
print("="*50)

#Decision Tree
print("\nTraining Decision Tree model...")
tree = DecisionTreeClassifier(
max_depth=5,
class_weight='balanced',
random_state=42
)
tree.fit(X_train, y_train)

#Evaluate
print("Evaluating Decision Tree...")
y_pred_tree = tree.predict(X_test)
print("\nDecision Tree Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_tree))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_tree))
print("="*50)

#Random Forest
print("\nTraining Random Forest model...")
rf = RandomForestClassifier(
n_estimators=100,
max_depth=5,
class_weight='balanced',
random_state=42
)
rf.fit(X_train, y_train)

#Evaluate
print("Evaluating Random Forest...")
y_pred_rf = rf.predict(X_test)
print("\nRandom Forest Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))
print("="*50)

#SVM
print("\nTraining SVM model...")
svm = SVC(
kernel='linear',
class_weight='balanced',
probability=True,
random_state=42
)
svm.fit(X_train, y_train)

#Evaluate
print("Evaluating SVM...")
y_pred_svm = svm.predict(X_test)
print("\nSVM Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm))
print("="*50)

#Gradient Boosting
print("\nTraining Gradient Boosting model...")
gb = GradientBoostingClassifier(
n_estimators=100,
max_depth=3,
random_state=42
)
gb.fit(X_train, y_train)

#Evaluate
print("Evaluating Gradient Boosting...")
y_pred_gb = gb.predict(X_test)
print("\nGradient Boosting Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_gb))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_gb))
print("="*50)

#XGBoost
from sklearn.preprocessing import LabelEncoder

# Convert string labels to numeric (0 and 1) for all models
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# For models that need dense matrices (Naive Bayes), convert X_train and X_test
# (Assuming you're using sparse matrices - if not, you can skip this)
X_train_dense = X_train.toarray() if hasattr(X_train, 'toarray') else X_train
X_test_dense = X_test.toarray() if hasattr(X_test, 'toarray') else X_test

# Now run all models with the appropriate data formats:

# 1. Models that work with sparse X and numeric y (most models)
print("Training Logistic Regression model...")
logreg = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    solver='liblinear'
)
logreg.fit(X_train, y_train_encoded)

# ... [similar for Random Forest, SVM, etc.] ...

# 2. Models that need dense X (Naive Bayes)
print("\nTraining Naive Bayes model...")
nb = GaussianNB()
nb.fit(X_train_dense, y_train_encoded)

# Evaluate
print("Evaluating Naive Bayes...")
y_pred_nb = nb.predict(X_test_dense)
print("\nNaive Bayes Evaluation:")
print("Accuracy:", accuracy_score(y_test_encoded, y_pred_nb))
print("\nClassification Report:")
print(classification_report(y_test_encoded, y_pred_nb, target_names=le.classes_))
print("="*50)

# 3. XGBoost (needs numeric y)
print("\nTraining XGBoost model...")
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=3,
    scale_pos_weight=len(y_train_encoded[y_train_encoded==0])/len(y_train_encoded[y_train_encoded==1]),
    random_state=42
)
xgb.fit(X_train, y_train_encoded)  # Can use sparse X with XGBoost

# Evaluate
print("Evaluating XGBoost...")
y_pred_xgb = xgb.predict(X_test)
print("\nXGBoost Evaluation:")
print("Accuracy:", accuracy_score(y_test_encoded, y_pred_xgb))
print("\nClassification Report:")
print(classification_report(y_test_encoded, y_pred_xgb, target_names=le.classes_))
print("="*50)

# Evaluate
print("Evaluating XGBoost...")
y_pred_xgb = xgb.predict(X_test)
print("\nXGBoost Evaluation:")
print("Accuracy:", accuracy_score(y_test_encoded, y_pred_xgb))
print("\nClassification Report:")
print(classification_report(y_test_encoded, y_pred_xgb, target_names=le.classes_))
print("="*50)

#K-Nearest Neighbors
print("\nTraining KNN model...")
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

#Evaluate
print("Evaluating KNN...")
y_pred_knn = knn.predict(X_test)
print("\nKNN Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_knn))
print("="*50)

#Neural Network (MLP)
print("\nTraining Neural Network (MLP) model...")
mlp = MLPClassifier(
hidden_layer_sizes=(100,),
max_iter=1000,
random_state=42
)
mlp.fit(X_train, y_train)

#Evaluate
print("Evaluating Neural Network...")
y_pred_mlp = mlp.predict(X_test)
print("\nNeural Network Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_mlp))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_mlp))
print("="*50)

Training Logistic Regression model...
Evaluating Logistic Regression...

Logistic Regression Evaluation:
Accuracy: 0.8225

Classification Report:
              precision    recall  f1-score   support

    negative       0.83      0.80      0.82       199
    positive       0.81      0.84      0.83       201

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400


Training Decision Tree model...
Evaluating Decision Tree...

Decision Tree Evaluation:
Accuracy: 0.645

Classification Report:
              precision    recall  f1-score   support

    negative       0.66      0.58      0.62       199
    positive       0.63      0.71      0.67       201

    accuracy                           0.65       400
   macro avg       0.65      0.64      0.64       400
weighted avg       0.65      0.65      0.64       400


Training Random Forest model...
Evaluating Random Forest...

Random For