Title: Popular Classification Algorithms

Support Vector Machines (SVM)

Task 1: Identify handwriting on checks and classify each letter.

In [1]:
# Task 1: Identify handwriting on checks and classify each letter using SVM (Support Vector Machine)

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the handwritten digits dataset (each digit image is 8x8 pixels)
digits = load_digits()
X = digits.data
y = digits.target  # Digits 0-9 (for letters, a similar approach applies with a letter dataset)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train an SVM classifier
clf = SVC(kernel='linear', random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9777777777777777
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        36
           1       0.92      0.94      0.93        36
           2       1.00      1.00      1.00        35
           3       1.00      0.97      0.99        37
           4       1.00      1.00      1.00        36
           5       1.00      1.00      1.00        37
           6       1.00      0.97      0.99        36
           7       0.97      1.00      0.99        36
           8       0.94      0.89      0.91        35
           9       0.95      1.00      0.97        36

    accuracy                           0.98       360
   macro avg       0.98      0.98      0.98       360
weighted avg       0.98      0.98      0.98       360




Task 2: Detect gender of a speaker based on voice data.

In [2]:
# Task 2: Detect gender of a speaker based on voice data using SVM

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Example: Use the UCI Gender Recognition by Voice dataset (if available locally)
# For demonstration, we'll create a small synthetic dataset
# In practice, use real features like meanfreq, sd, median, Q25, Q75, etc.

data = {
    'meanfreq': [0.23, 0.18, 0.22, 0.19, 0.25, 0.17, 0.21, 0.24, 0.20, 0.16],
    'sd':       [0.03, 0.04, 0.02, 0.03, 0.05, 0.04, 0.03, 0.02, 0.04, 0.03],
    'median':   [0.22, 0.17, 0.21, 0.18, 0.24, 0.16, 0.20, 0.23, 0.19, 0.15],
    'gender':   [1, 0, 1, 0, 1, 0, 1, 1, 0, 0]  # 1 = male, 0 = female
}
df = pd.DataFrame(data)

# Features and target
X = df.drop('gender', axis=1)
y = df['gender']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train an SVM classifier
clf = SVC(kernel='linear', random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(
    y_test, y_pred, labels=[0, 1], target_names=['Female', 'Male'], zero_division=0
))

Accuracy: 0.0
Classification Report:
               precision    recall  f1-score   support

      Female       0.00      0.00      0.00       2.0
        Male       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



Task 3: Classify email topics based on content.

In [7]:
# Task 3: Classify email topics based on content using SVM

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

# Expanded synthetic dataset: email content and topic (at least 4 samples per topic)
data = {
    'email': [
        "Meeting scheduled for tomorrow at 10am.",
        "Reminder: Project deadline is approaching.",
        "Team outing planned for Friday evening.",
        "Let's catch up for lunch next week.",
        "Join our webinar on cloud computing.",
        "Conference call scheduled for next Monday.",
        "Your invoice for last month is attached.",
        "Payment received. Thank you for your business.",
        "Your order has been shipped and is on its way.",
        "Discounts available on all electronics.",
        "Special offer just for you! Buy now.",
        "Exclusive deals on your favorite products.",
        "Salary credited to your account.",
        "Expense report submission deadline.",
        "Tax documents for this year attached.",
        "Personal reminder: call mom this weekend.",
        "Dinner plans for Saturday night.",
        "Birthday invitation for next month.",
        "Shopping list for groceries.",
        "Order confirmation for your recent purchase."
    ],
    'topic': [
        "work", "work", "work", "work",
        "personal", "personal", "personal", "personal",
        "finance", "finance", "finance", "finance",
        "shopping", "shopping", "shopping", "shopping",
        "personal", "personal", "shopping", "shopping"
    ]
}
df = pd.DataFrame(data)

# Features and target
X = df['email']
y = df['topic']

# Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)

# Split into train and test sets (stratify now works due to enough samples per class)
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42, stratify=y
)

# Train an SVM classifier
clf = SVC(kernel='linear', random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.5
Classification Report:
               precision    recall  f1-score   support

     finance       0.00      0.00      0.00         1
    personal       0.50      1.00      0.67         1
    shopping       0.50      1.00      0.67         1
        work       0.00      0.00      0.00         1

    accuracy                           0.50         4
   macro avg       0.25      0.50      0.33         4
weighted avg       0.25      0.50      0.33         4

