## 1. Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pickle
import time
from pathlib import Path

# Scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score,
    classification_report,
    confusion_matrix
)

# Configuration
# Silence only deprecation-style noise. A blanket filterwarnings('ignore')
# would also hide solver ConvergenceWarnings from the models trained below,
# which are worth seeing because they mean max_iter was too low.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Plot defaults shared by every figure in this notebook
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

print("‚úì Libraries imported successfully")

‚úì Libraries imported successfully


## 2. Load Cleaned Data

In [2]:
# Define paths (all relative to the notebook's folder)
DATA_PATH = Path('../data/processed/cleaned_tweets.csv')
MODELS_PATH = Path('../models/ml')
VECTORIZER_PATH = Path('../models/vectorizers')
VISUALS_PATH = Path('../visuals/confusion_matrices')

# Create output directories if they don't exist
for output_dir in (MODELS_PATH, VECTORIZER_PATH, VISUALS_PATH):
    output_dir.mkdir(parents=True, exist_ok=True)

# Load the cleaned dataset
df_raw = pd.read_csv(DATA_PATH)

# Fail early, with a readable message, if the columns used below are absent
missing_columns = {'text_clean', 'sentiment'} - set(df_raw.columns)
assert not missing_columns, f"Missing expected columns: {sorted(missing_columns)}"

# A tweet whose cleaned text is empty is stored in the CSV as a blank field and
# read back as NaN; TfidfVectorizer cannot process NaN documents, so drop such
# rows (and rows without a label) before building features.
df = df_raw.dropna(subset=['text_clean', 'sentiment']).reset_index(drop=True)
rows_dropped = len(df_raw) - len(df)
if rows_dropped:
    print(f"Dropped {rows_dropped} rows with missing text or label")

print(f"Dataset loaded: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nSentiment distribution:")
print(df['sentiment'].value_counts())

df.head()

Dataset loaded: (99735, 4)

Columns: ['text', 'text_clean', 'sentiment', 'sentiment_label']

Sentiment distribution:
sentiment
0    49893
1    49842
Name: count, dtype: int64


Unnamed: 0,text,text_clean,sentiment,sentiment_label
0,@xnausikaax oh no! where did u order from? tha...,oh no where did u order from that s horrible,0,negative
1,A great hard training weekend is over. a coup...,a great hard training weekend is over a couple...,0,negative
2,"Right, off to work Only 5 hours to go until I...",right off to work only hours to go until i m f...,0,negative
3,I am craving for japanese food,i am craving for japanese food,0,negative
4,Jean Michel Jarre concert tomorrow gotta work...,jean michel jarre concert tomorrow gotta work ...,0,negative


## 3. Prepare Features and Labels

In [3]:
# Model input is the cleaned tweet text; the target is the numeric sentiment
# code (0 = negative, 1 = positive).
X, y = df['text_clean'], df['sentiment']

print(f"Features shape: {X.shape}", f"Labels shape: {y.shape}", sep="\n")
print(f"\nLabel distribution:")
print(y.value_counts())

Features shape: (99735,)
Labels shape: (99735,)

Label distribution:
sentiment
0    49893
1    49842
Name: count, dtype: int64


## 4. Split Data: Train/Test

In [4]:
# Hold out 20% of the tweets for testing and train on the remaining 80%.
# Stratifying on y keeps the positive/negative ratio the same in both parts,
# and the fixed seed makes the split reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Sizes first, then the class balance of each part
print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

print(f"\nTraining set sentiment distribution:")
print(y_train.value_counts())

print(f"\nTesting set sentiment distribution:")
print(y_test.value_counts())

Training set size: 79788 samples
Testing set size: 19947 samples

Training set sentiment distribution:
sentiment
0    39914
1    39874
Name: count, dtype: int64

Testing set sentiment distribution:
sentiment
0    9979
1    9968
Name: count, dtype: int64


## 5. TF-IDF Vectorization

**TF-IDF** (Term Frequency-Inverse Document Frequency) converts text into numerical features:
- **TF**: How often a word appears in a document
- **IDF**: How rare/important a word is across all documents
- Common words (like "the", "is") get low scores
- Rare, meaningful words get high scores

In [5]:
# TF-IDF vectorizer settings:
#   ngram_range=(1, 2)  -> features are single words and adjacent word pairs
#   min_df=5            -> drop terms found in fewer than 5 documents
#   max_df=0.7          -> drop terms found in more than 70% of documents
#   max_features=10000  -> of what remains, keep the 10,000 terms with the
#                          highest frequency across the corpus
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.7,
    max_features=10000,
)

print("Converting text to TF-IDF features...")
print("This may take a few minutes...\n")

start_time = time.time()

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected onto them, so nothing leaks from test.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

elapsed_time = time.time() - start_time

print(f"‚úì Vectorization completed in {elapsed_time:.2f} seconds")
print(f"\nTraining features shape: {X_train_tfidf.shape}")
print(f"Testing features shape: {X_test_tfidf.shape}")
print(f"\nVocabulary size: {len(tfidf.vocabulary_)} words")

Converting text to TF-IDF features...
This may take a few minutes...

‚úì Vectorization completed in 2.83 seconds

Training features shape: (79788, 10000)
Testing features shape: (19947, 10000)

Vocabulary size: 10000 words


In [6]:
# Persist the fitted vectorizer so the same vocabulary and IDF weights can be
# reloaded later instead of being refitted.
vectorizer_file = VECTORIZER_PATH / 'tfidf_vectorizer.pkl'
vectorizer_file.write_bytes(pickle.dumps(tfidf))

print(f"‚úì TF-IDF vectorizer saved to: {vectorizer_file}")

‚úì TF-IDF vectorizer saved to: ..\models\vectorizers\tfidf_vectorizer.pkl


## 6. Model 1: Logistic Regression

**Logistic Regression** is a simple, fast linear model:
- Good baseline for binary classification
- Works well with high-dimensional sparse data (like TF-IDF)
- Fast to train and predict

In [7]:
print("="*60)
print("TRAINING LOGISTIC REGRESSION")
print("="*60)

start_time = time.time()

# Create and train Logistic Regression model
# max_iter=1000: upper bound on solver iterations
# C=1.0: INVERSE of regularization strength (smaller C = stronger regularization)
# n_jobs is deliberately not passed: per the scikit-learn docs it only
# parallelizes over classes in one-vs-rest multiclass fits, so it does nothing
# for this two-class problem. NOTE(review): recent scikit-learn releases also
# appear to deprecate it -- confirm against the installed version.
lr_model = LogisticRegression(
    max_iter=1000,
    C=1.0,
    random_state=42
)

lr_model.fit(X_train_tfidf, y_train)

# Make predictions on test set
y_pred_lr = lr_model.predict(X_test_tfidf)

# Note: the timer spans both fitting and test-set prediction
elapsed_time = time.time() - start_time

print(f"\n‚úì Training completed in {elapsed_time:.2f} seconds")
print(f"\nModel trained on {len(X_train)} samples")
print(f"Predictions made on {len(X_test)} samples")

TRAINING LOGISTIC REGRESSION

‚úì Training completed in 15.28 seconds

Model trained on 79788 samples
Predictions made on 19947 samples


In [8]:
# Score the Logistic Regression predictions against the held-out labels.
# The four metrics are computed in the order accuracy, precision, recall, F1.
lr_accuracy, lr_precision, lr_recall, lr_f1 = (
    scorer(y_test, y_pred_lr)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

banner = "=" * 60
print(banner)
print("LOGISTIC REGRESSION RESULTS")
print(banner)
print(f"Accuracy:  {lr_accuracy:.4f} ({lr_accuracy*100:.2f}%)")
print(f"Precision: {lr_precision:.4f}")
print(f"Recall:    {lr_recall:.4f}")
print(f"F1-Score:  {lr_f1:.4f}")
print("\n" + banner)

# Per-class precision / recall / F1 breakdown
print("\nDetailed Classification Report:")
lr_report = classification_report(
    y_test,
    y_pred_lr,
    target_names=['Negative', 'Positive'],
)
print(lr_report)

LOGISTIC REGRESSION RESULTS
Accuracy:  0.7919 (79.19%)
Precision: 0.7882
Recall:    0.7980
F1-Score:  0.7931


Detailed Classification Report:
              precision    recall  f1-score   support

    Negative       0.80      0.79      0.79      9979
    Positive       0.79      0.80      0.79      9968

    accuracy                           0.79     19947
   macro avg       0.79      0.79      0.79     19947
weighted avg       0.79      0.79      0.79     19947



## 7. Model 2: Support Vector Machine (SVM)

**LinearSVC** (Linear Support Vector Classification):
- Finds the best hyperplane to separate classes
- Often performs very well on text classification
- Good with high-dimensional data

In [9]:
banner = "=" * 60
print(banner)
print("TRAINING SUPPORT VECTOR MACHINE (SVM)")
print(banner)

start_time = time.time()

# Linear SVM: C=1.0 is the regularization parameter, max_iter caps the solver
# at 1000 iterations, and the seed keeps runs repeatable. fit() returns the
# estimator itself, so construction and training are chained.
svm_model = LinearSVC(
    random_state=42,
    max_iter=1000,
    C=1.0,
).fit(X_train_tfidf, y_train)

# Predict sentiment for the held-out tweets
y_pred_svm = svm_model.predict(X_test_tfidf)

elapsed_time = time.time() - start_time

print(f"\n‚úì Training completed in {elapsed_time:.2f} seconds")
print(f"\nModel trained on {len(X_train)} samples")
print(f"Predictions made on {len(X_test)} samples")

TRAINING SUPPORT VECTOR MACHINE (SVM)

‚úì Training completed in 1.39 seconds

Model trained on 79788 samples
Predictions made on 19947 samples


In [None]:
# Score the LinearSVC predictions against the held-out labels.
# The four metrics are computed in the order accuracy, precision, recall, F1.
svm_accuracy, svm_precision, svm_recall, svm_f1 = (
    scorer(y_test, y_pred_svm)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

banner = "=" * 60
print(banner)
print("SUPPORT VECTOR MACHINE RESULTS")
print(banner)
print(f"Accuracy:  {svm_accuracy:.4f} ({svm_accuracy*100:.2f}%)")
print(f"Precision: {svm_precision:.4f}")
print(f"Recall:    {svm_recall:.4f}")
print(f"F1-Score:  {svm_f1:.4f}")
print("\n" + banner)

# Per-class precision / recall / F1 breakdown
print("\nDetailed Classification Report:")
svm_report = classification_report(
    y_test,
    y_pred_svm,
    target_names=['Negative', 'Positive'],
)
print(svm_report)

## 8. Model 3: Random Forest

**Random Forest**:
- Ensemble of decision trees
- Each tree votes on the final prediction
- Good at capturing non-linear patterns
- More robust to overfitting than single decision trees

In [None]:
banner = "=" * 60
print(banner)
print("TRAINING RANDOM FOREST")
print(banner)

start_time = time.time()

# Forest of 100 trees, each limited to depth 20. n_jobs=-1 spreads the work
# over all CPU cores, verbose=1 prints progress, and the seed keeps runs
# repeatable. fit() returns the estimator itself, so the calls are chained.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    n_jobs=-1,
    verbose=1,
    random_state=42,
).fit(X_train_tfidf, y_train)

# Predict sentiment for the held-out tweets
y_pred_rf = rf_model.predict(X_test_tfidf)

elapsed_time = time.time() - start_time

print(f"\n‚úì Training completed in {elapsed_time:.2f} seconds")
print(f"\nModel trained on {len(X_train)} samples")
print(f"Predictions made on {len(X_test)} samples")

In [None]:
# Score the Random Forest predictions against the held-out labels.
# The four metrics are computed in the order accuracy, precision, recall, F1.
rf_accuracy, rf_precision, rf_recall, rf_f1 = (
    scorer(y_test, y_pred_rf)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

banner = "=" * 60
print(banner)
print("RANDOM FOREST RESULTS")
print(banner)
print(f"Accuracy:  {rf_accuracy:.4f} ({rf_accuracy*100:.2f}%)")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall:    {rf_recall:.4f}")
print(f"F1-Score:  {rf_f1:.4f}")
print("\n" + banner)

# Per-class precision / recall / F1 breakdown
print("\nDetailed Classification Report:")
rf_report = classification_report(
    y_test,
    y_pred_rf,
    target_names=['Negative', 'Positive'],
)
print(rf_report)

## 9. Compare All Models

In [None]:
# One row per model and one column per metric, ranked from best to worst F1
results = (
    pd.DataFrame({
        'Model': ['Logistic Regression', 'SVM (LinearSVC)', 'Random Forest'],
        'Accuracy': [lr_accuracy, svm_accuracy, rf_accuracy],
        'Precision': [lr_precision, svm_precision, rf_precision],
        'Recall': [lr_recall, svm_recall, rf_recall],
        'F1-Score': [lr_f1, svm_f1, rf_f1],
    })
    .sort_values('F1-Score', ascending=False)
    .reset_index(drop=True)
)

wide_banner = "=" * 80
print(wide_banner)
print("MODEL COMPARISON")
print(wide_banner)
print(results.to_string(index=False))
print("\n" + wide_banner)

# After sorting, the first row is the model with the highest F1-Score
top_row = results.iloc[0]
best_model_name = top_row['Model']
best_f1 = top_row['F1-Score']
print(f"\nüèÜ Best Model: {best_model_name} (F1-Score: {best_f1:.4f})")

In [None]:
# Visualize model comparison: one bar chart per metric in a 2x2 grid
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
colors = ['#3498db', '#e74c3c', '#2ecc71']

# savefig() does not create missing folders, and the directory-setup cell shown
# earlier only creates visuals/confusion_matrices, so make sure visuals/charts
# exists before writing the PNG.
charts_dir = Path('../visuals/charts')
charts_dir.mkdir(parents=True, exist_ok=True)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for idx, metric in enumerate(metrics):
    ax = axes[idx // 2, idx % 2]

    # Create bar plot (bars follow the row order of `results`)
    bars = ax.bar(results['Model'], results[metric], color=colors)

    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.4f}',
                ha='center', va='bottom', fontsize=10, fontweight='bold')

    ax.set_title(f'{metric} Comparison', fontsize=14, fontweight='bold')
    ax.set_ylabel(metric, fontsize=11)
    ax.set_ylim(0, 1)
    ax.grid(axis='y', alpha=0.3)
    # Restyle the tick labels the bar plot already produced. Calling
    # set_xticklabels() without fixing the ticks first makes matplotlib warn.
    plt.setp(ax.get_xticklabels(), rotation=15, ha='right')

plt.tight_layout()
plt.savefig(charts_dir / 'model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Comparison chart saved to visuals/charts/model_comparison.png")

## 10. Confusion Matrices

**Confusion Matrix** shows:
- **True Positives (TP)**: Correctly predicted positive
- **True Negatives (TN)**: Correctly predicted negative
- **False Positives (FP)**: Predicted positive, actually negative
- **False Negatives (FN)**: Predicted negative, actually positive

In [None]:
# Function to plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, model_name,
                          output_dir='../visuals/confusion_matrices'):
    """
    Plot and save the 2x2 confusion matrix for a binary sentiment model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, encoded as 0 (negative) / 1 (positive).
    y_pred : array-like
        Predicted labels, using the same encoding.
    model_name : str
        Display name used in the plot title; also turned into the file name
        (lower-cased, spaces -> underscores, parentheses removed).
    output_dir : str or Path, optional
        Folder the PNG is written to. Created if it does not exist.

    Returns
    -------
    numpy.ndarray
        The 2x2 confusion matrix (rows = true label, columns = predicted).
    """
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # hard-coded Negative/Positive tick labels, even if one class never
    # appears in y_true / y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Negative', 'Positive'],
                yticklabels=['Negative', 'Positive'],
                cbar_kws={'label': 'Count'},
                ax=ax)

    ax.set_title(f'Confusion Matrix - {model_name}', fontsize=14, fontweight='bold', pad=20)
    ax.set_ylabel('True Label', fontsize=12)
    ax.set_xlabel('Predicted Label', fontsize=12)

    # Add accuracy in the plot: correct predictions sit on the diagonal
    accuracy = cm.trace() / cm.sum()
    ax.text(0.5, -0.15, f'Accuracy: {accuracy:.4f}',
            ha='center', transform=ax.transAxes, fontsize=11)

    # Save figure; savefig() will not create a missing folder by itself
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    slug = model_name.lower().replace(' ', '_').replace('(', '').replace(')', '')
    filename = f"{output_dir}/{slug}_cm.png"
    fig.tight_layout()
    fig.savefig(filename, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"‚úì Confusion matrix saved to {filename}")

    return cm

In [None]:
# Draw and save one confusion matrix per model, starting with Logistic Regression
print("Generating confusion matrices...\n")

cm_lr = plot_confusion_matrix(
    y_true=y_test,
    y_pred=y_pred_lr,
    model_name='Logistic Regression',
)

In [None]:
# Confusion matrix for the LinearSVC predictions
cm_svm = plot_confusion_matrix(
    y_true=y_test,
    y_pred=y_pred_svm,
    model_name='SVM LinearSVC',
)

In [None]:
# Confusion matrix for the Random Forest predictions
cm_rf = plot_confusion_matrix(
    y_true=y_test,
    y_pred=y_pred_rf,
    model_name='Random Forest',
)

## 11. Save Best Model

In [None]:
# Each entry maps a display name to (fitted estimator, test-set F1-Score)
models_dict = {
    'Logistic Regression': (lr_model, lr_f1),
    'SVM (LinearSVC)': (svm_model, svm_f1),
    'Random Forest': (rf_model, rf_f1)
}

# The winner is the entry with the highest F1-Score (first one wins on a tie)
best_model_name, (best_model, best_score) = max(
    models_dict.items(), key=lambda entry: entry[1][1]
)

# Pickle every model under a file name derived from its display name:
# lower-case, spaces turned into underscores, parentheses removed.
print("Saving models...\n")

slug_table = str.maketrans(' ', '_', '()')
for model_name, (model, score) in models_dict.items():
    filename = MODELS_PATH / f"{model_name.lower().translate(slug_table)}.pkl"
    with filename.open('wb') as f:
        pickle.dump(model, f)
    print(f"‚úì {model_name} saved to: {filename}")

# The winner is additionally stored under a fixed, model-independent name
best_model_file = MODELS_PATH / 'best_model.pkl'
with best_model_file.open('wb') as f:
    pickle.dump(best_model, f)

print(f"\nüèÜ Best model ({best_model_name}) saved to: {best_model_file}")
print(f"   F1-Score: {best_score:.4f}")

## 12. Test on Sample Texts

In [None]:
# Sanity-check the best model on a few hand-written sentences, written in the
# same lower-case, punctuation-free form as the cleaned training text.
sample_texts = [
    "i love this product it is amazing",
    "this is the worst experience ever",
    "absolutely fantastic cant wait to use it again",
    "terrible service very disappointed",
    "great quality highly recommend",
    "waste of money do not buy"
]

banner = "=" * 60
print(banner)
print("TESTING ON SAMPLE TEXTS")
print(banner)

# Vectorize with the already-fitted TF-IDF vocabulary, then classify
sample_tfidf = tfidf.transform(sample_texts)
predictions = best_model.predict(sample_tfidf)

# Show each sentence next to its predicted sentiment (label 1 = positive)
for text, pred in zip(sample_texts, predictions):
    if pred == 1:
        sentiment = "üòä POSITIVE"
    else:
        sentiment = "üòû NEGATIVE"
    print(f"\nText: \"{text}\"")
    print(f"Prediction: {sentiment}")

print("\n" + banner)

## 13. Summary

### Key Findings:

**Dataset:**
- Training samples: 79,788 tweets (80% of 99,735)
- Testing samples: 19,947 tweets (20%)
- Features: TF-IDF over unigrams and bigrams, capped at 10,000 terms

**Models Trained:**
1. Logistic Regression - Fast, simple baseline
2. SVM (LinearSVC) - Good for text classification
3. Random Forest - Ensemble method

**Next Steps (Week 4):**
- Implement Deep Learning models (RNN, LSTM, GRU)
- Compare DL vs ML performance
- Try word embeddings (Word2Vec, GloVe)

---

✓ Notebook 02 Complete!

In [None]:
# Final summary: collect the recap lines first, then emit them in one go
wide_banner = "=" * 80
summary_lines = [
    wide_banner,
    "NOTEBOOK 02 SUMMARY",
    wide_banner,
    f"\n‚úì Dataset loaded: {len(df)} tweets",
    f"‚úì Train/Test split: {len(X_train)}/{len(X_test)} samples",
    f"‚úì TF-IDF features: {X_train_tfidf.shape[1]} dimensions",
    f"\n‚úì Models trained: 3 (Logistic Regression, SVM, Random Forest)",
    f"‚úì Best model: {best_model_name}",
    f"‚úì Best F1-Score: {best_score:.4f}",
    f"\n‚úì Models saved to: {MODELS_PATH}",
    f"‚úì Visualizations saved to: {VISUALS_PATH}",
    "\n" + wide_banner,
    "Ready for Week 4: Deep Learning Models!",
    wide_banner,
]
print("\n".join(summary_lines))