In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import openpyxl

In [2]:
text_file_name = "osdg-community-data-v2024-04-01.csv"
text_df = pd.read_csv(text_file_name,sep = "\t",  quotechar='"')

In [3]:
text_df.drop(text_df.columns.values[0],axis = 1, inplace=True)
text_df = text_df.query("agreement > 0.5 and (labels_positive - labels_negative) > 2").reset_index(drop=True)

After cleaning the data, we get the text data then define the function

Since there is no labels in the text_df, we need to define a function to add the sdg labels to each row:

In [4]:
def assign_sdg_labels(text):
    """
    Assign SDG labels to the text based on keywords related to each SDG.
    Each SDG can be associated with multiple keywords, and a text can be labeled with more than one SDG.
    """
    # Initialize an empty list to hold SDG labels
    labels = []

    # SDG 1: No Poverty
    if "poverty" in text.lower() or "income inequality" in text.lower() or "economic hardship" in text.lower():
        labels.append(1)  # No Poverty

    # SDG 2: Zero Hunger
    if "hunger" in text.lower() or "food security" in text.lower() or "malnutrition" in text.lower():
        labels.append(2)  # Zero Hunger

    # SDG 3: Good Health and Well-being
    if "health" in text.lower() or "disease" in text.lower() or "mental health" in text.lower():
        labels.append(3)  # Good Health and Well-being

    # SDG 4: Quality Education
    if "education" in text.lower() or "learning" in text.lower() or "school" in text.lower():
        labels.append(4)  # Quality Education

    # SDG 5: Gender Equality
    if "gender" in text.lower() or "women's rights" in text.lower() or "equality" in text.lower():
        labels.append(5)  # Gender Equality

    # SDG 6: Clean Water and Sanitation
    if "water" in text.lower() or "sanitation" in text.lower() or "clean water" in text.lower():
        labels.append(6)  # Clean Water and Sanitation

    # SDG 7: Affordable and Clean Energy
    if "energy" in text.lower() or "renewable" in text.lower() or "solar power" in text.lower() or "clean energy" in text.lower():
        labels.append(7)  # Affordable and Clean Energy

    # SDG 8: Decent Work and Economic Growth
    if "employment" in text.lower() or "job" in text.lower() or "economic growth" in text.lower():
        labels.append(8)  # Decent Work and Economic Growth

    # SDG 9: Industry, Innovation and Infrastructure
    if "innovation" in text.lower() or "infrastructure" in text.lower() or "industry" in text.lower():
        labels.append(9)  # Industry, Innovation and Infrastructure

    # SDG 10: Reduced Inequalities
    if "inequality" in text.lower() or "discrimination" in text.lower() or "social justice" in text.lower():
        labels.append(10)  # Reduced Inequalities

    # SDG 11: Sustainable Cities and Communities
    if "urban" in text.lower() or "city" in text.lower() or "community" in text.lower() or "sustainable development" in text.lower():
        labels.append(11)  # Sustainable Cities and Communities

    # SDG 12: Responsible Consumption and Production
    if "sustainability" in text.lower() or "responsible consumption" in text.lower() or "waste reduction" in text.lower():
        labels.append(12)  # Responsible Consumption and Production

    # SDG 13: Climate Action
    if "climate change" in text.lower() or "global warming" in text.lower() or "carbon footprint" in text.lower():
        labels.append(13)  # Climate Action

    # SDG 14: Life Below Water
    if "ocean" in text.lower() or "marine life" in text.lower() or "fisheries" in text.lower():
        labels.append(14)  # Life Below Water

    # SDG 15: Life on Land
    if "biodiversity" in text.lower() or "conservation" in text.lower() or "wildlife" in text.lower() or "deforestation" in text.lower():
        labels.append(15)  # Life on Land

    # SDG 16: Peace, Justice and Strong Institutions
    if "justice" in text.lower() or "peace" in text.lower() or "corruption" in text.lower() or "rule of law" in text.lower():
        labels.append(16)  # Peace, Justice and Strong Institutions

    # SDG 17: Partnerships for the Goals
    if "partnerships" in text.lower() or "collaboration" in text.lower() or "global cooperation" in text.lower():
        labels.append(17)  # Partnerships for the Goals

    # If no SDGs match, return a default value (optional)
    return labels if labels else [0]  # Default to [0] if no SDG matches



text_df['sdg_labels'] = text_df.text.apply(assign_sdg_labels)

corpus = text_df[['text', 'sdg_labels']]


For this function, we use unigram only and tfid vectorizor
Function definiton:

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

def sdg_classify_tfid_unigram(corpus, classifier_algorithm):
 

    # Convert SDG labels into binary format
    mlb = MultiLabelBinarizer()
    y_binary = mlb.fit_transform(corpus['sdg_labels'])
    X = corpus['text']
    y = y_binary

    # Vectorize text data
    vectorizer = TfidfVectorizer(ngram_range=(1,1), min_df=500, max_features=5000, stop_words='english')
    X_transformed = vectorizer.fit_transform(X)

    # train and test split
    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=42)

    try:
        # Check if the classifier accepts a random_state parameter
        classifier = OneVsRestClassifier(classifier_algorithm(random_state=42))
    except TypeError:
        # If not, initialize it without the random_state parameter
        classifier = OneVsRestClassifier(classifier_algorithm())

    classifier.fit(X_train, y_train)


    try:
        # Use `predict_proba` for classifiers that support it (e.g., MLPClassifier, MultinomialNB)
        y_pred_proba = classifier.predict_proba(X_test)
        threshold = 0.3
        y_pred = (y_pred_proba >= threshold).astype(int)
    except AttributeError:
        try:
            # Use `predict_log_proba` for classifiers that support it (e.g., MultinomialNB)
            y_pred_log_proba = classifier.predict_log_proba(X_test)
            y_pred_proba = np.exp(y_pred_log_proba)  # Convert log probabilities to probabilities
            threshold = 0.3
            y_pred = (y_pred_proba >= threshold).astype(int)
        except AttributeError:
            # If neither `predict_proba` nor `predict_log_proba` is available, use `predict`
            y_pred = classifier.predict(X_test)
        

    precision = precision_score(y_test, y_pred, average='samples', zero_division=1)
    recall = recall_score(y_test, y_pred, average='samples', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='samples', zero_division=1)
    accuracy = accuracy_score(y_test, y_pred)

    # Return metrics
    metrics = {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
    }
    return metrics


Tfid and bigram only:

In [6]:
def sdg_classify_tfid_bigram(corpus, classifier_algorithm):
 

    # Convert SDG labels into binary format
    mlb = MultiLabelBinarizer()
    y_binary = mlb.fit_transform(corpus['sdg_labels'])
    X = corpus['text']
    y = y_binary

    # Vectorize text data
    vectorizer = TfidfVectorizer(ngram_range=(2,2), min_df=500, max_features=5000, stop_words='english')
    X_transformed = vectorizer.fit_transform(X)

    # train and test split
    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=42)

    try:
        # Check if the classifier accepts a random_state parameter
        classifier = OneVsRestClassifier(classifier_algorithm(random_state=42))
    except TypeError:
        # If not, initialize it without the random_state parameter
        classifier = OneVsRestClassifier(classifier_algorithm())

    classifier.fit(X_train, y_train)


    try:
        # Use `predict_proba` for classifiers that support it (e.g., MLPClassifier, MultinomialNB)
        y_pred_proba = classifier.predict_proba(X_test)
        threshold = 0.3
        y_pred = (y_pred_proba >= threshold).astype(int)
    except AttributeError:
        try:
            # Use `predict_log_proba` for classifiers that support it (e.g., MultinomialNB)
            y_pred_log_proba = classifier.predict_log_proba(X_test)
            y_pred_proba = np.exp(y_pred_log_proba)  # Convert log probabilities to probabilities
            threshold = 0.3
            y_pred = (y_pred_proba >= threshold).astype(int)
        except AttributeError:
            # If neither `predict_proba` nor `predict_log_proba` is available, use `predict`
            y_pred = classifier.predict(X_test)
        

    precision = precision_score(y_test, y_pred, average='samples', zero_division=1)
    recall = recall_score(y_test, y_pred, average='samples', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='samples', zero_division=1)
    accuracy = accuracy_score(y_test, y_pred)

    # Return metrics
    metrics = {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
    }
    return metrics

tfid and both unigram and bigram:

In [7]:
def sdg_classify_tfid_both(corpus, classifier_algorithm):
 

    # Convert SDG labels into binary format
    mlb = MultiLabelBinarizer()
    y_binary = mlb.fit_transform(corpus['sdg_labels'])
    X = corpus['text']
    y = y_binary

    # Vectorize text data
    vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=500, max_features=5000, stop_words='english')
    X_transformed = vectorizer.fit_transform(X)

    # train and test split
    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=42)

    try:
        # Check if the classifier accepts a random_state parameter
        classifier = OneVsRestClassifier(classifier_algorithm(random_state=42))
    except TypeError:
        # If not, initialize it without the random_state parameter
        classifier = OneVsRestClassifier(classifier_algorithm())

    classifier.fit(X_train, y_train)


    try:
        # Use `predict_proba` for classifiers that support it (e.g., MLPClassifier, MultinomialNB)
        y_pred_proba = classifier.predict_proba(X_test)
        threshold = 0.3
        y_pred = (y_pred_proba >= threshold).astype(int)
    except AttributeError:
        try:
            # Use `predict_log_proba` for classifiers that support it (e.g., MultinomialNB)
            y_pred_log_proba = classifier.predict_log_proba(X_test)
            y_pred_proba = np.exp(y_pred_log_proba)  # Convert log probabilities to probabilities
            threshold = 0.3
            y_pred = (y_pred_proba >= threshold).astype(int)
        except AttributeError:
            # If neither `predict_proba` nor `predict_log_proba` is available, use `predict`
            y_pred = classifier.predict(X_test)
        

    precision = precision_score(y_test, y_pred, average='samples', zero_division=1)
    recall = recall_score(y_test, y_pred, average='samples', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='samples', zero_division=1)
    accuracy = accuracy_score(y_test, y_pred)

    # Return metrics
    metrics = {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
    }
    return metrics

Then we will define all functions with count vector in one cell:

In [8]:
def sdg_classify_count_unigram(corpus, classifier_algorithm):
 

    # Convert SDG labels into binary format
    mlb = MultiLabelBinarizer()
    y_binary = mlb.fit_transform(corpus['sdg_labels'])
    X = corpus['text']
    y = y_binary

    # Vectorize text data
    vectorizer = CountVectorizer(ngram_range=(1,1), min_df=500, max_features=5000, stop_words='english')
    X_transformed = vectorizer.fit_transform(X)

    # train and test split
    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=42)

    try:
        # Check if the classifier accepts a random_state parameter
        classifier = OneVsRestClassifier(classifier_algorithm(random_state=42))
    except TypeError:
        # If not, initialize it without the random_state parameter
        classifier = OneVsRestClassifier(classifier_algorithm())

    classifier.fit(X_train, y_train)


    try:
        # Use `predict_proba` for classifiers that support it (e.g., MLPClassifier, MultinomialNB)
        y_pred_proba = classifier.predict_proba(X_test)
        threshold = 0.3
        y_pred = (y_pred_proba >= threshold).astype(int)
    except AttributeError:
        try:
            # Use `predict_log_proba` for classifiers that support it (e.g., MultinomialNB)
            y_pred_log_proba = classifier.predict_log_proba(X_test)
            y_pred_proba = np.exp(y_pred_log_proba)  # Convert log probabilities to probabilities
            threshold = 0.3
            y_pred = (y_pred_proba >= threshold).astype(int)
        except AttributeError:
            # If neither `predict_proba` nor `predict_log_proba` is available, use `predict`
            y_pred = classifier.predict(X_test)
        

    precision = precision_score(y_test, y_pred, average='samples', zero_division=1)
    recall = recall_score(y_test, y_pred, average='samples', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='samples', zero_division=1)
    accuracy = accuracy_score(y_test, y_pred)

    # Return metrics
    metrics = {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
    }
    return metrics

def sdg_classify_count_bigram(corpus, classifier_algorithm):
 

    # Convert SDG labels into binary format
    mlb = MultiLabelBinarizer()
    y_binary = mlb.fit_transform(corpus['sdg_labels'])
    X = corpus['text']
    y = y_binary

    # Vectorize text data
    vectorizer = CountVectorizer(ngram_range=(2,2), min_df=500, max_features=5000, stop_words='english')
    X_transformed = vectorizer.fit_transform(X)

    # train and test split
    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=42)

    try:
        # Check if the classifier accepts a random_state parameter
        classifier = OneVsRestClassifier(classifier_algorithm(random_state=42))
    except TypeError:
        # If not, initialize it without the random_state parameter
        classifier = OneVsRestClassifier(classifier_algorithm())

    classifier.fit(X_train, y_train)


    try:
        # Use `predict_proba` for classifiers that support it (e.g., MLPClassifier, MultinomialNB)
        y_pred_proba = classifier.predict_proba(X_test)
        threshold = 0.3
        y_pred = (y_pred_proba >= threshold).astype(int)
    except AttributeError:
        try:
            # Use `predict_log_proba` for classifiers that support it (e.g., MultinomialNB)
            y_pred_log_proba = classifier.predict_log_proba(X_test)
            y_pred_proba = np.exp(y_pred_log_proba)  # Convert log probabilities to probabilities
            threshold = 0.3
            y_pred = (y_pred_proba >= threshold).astype(int)
        except AttributeError:
            # If neither `predict_proba` nor `predict_log_proba` is available, use `predict`
            y_pred = classifier.predict(X_test)
        

    precision = precision_score(y_test, y_pred, average='samples', zero_division=1)
    recall = recall_score(y_test, y_pred, average='samples', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='samples', zero_division=1)
    accuracy = accuracy_score(y_test, y_pred)

    # Return metrics
    metrics = {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
    }
    return metrics

def sdg_classify_count_both(corpus, classifier_algorithm):
 

    # Convert SDG labels into binary format
    mlb = MultiLabelBinarizer()
    y_binary = mlb.fit_transform(corpus['sdg_labels'])
    X = corpus['text']
    y = y_binary

    # Vectorize text data
    vectorizer = CountVectorizer(ngram_range=(1,2), min_df=500, max_features=5000, stop_words='english')
    X_transformed = vectorizer.fit_transform(X)

    # train and test split
    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=42)

    try:
        # Check if the classifier accepts a random_state parameter
        classifier = OneVsRestClassifier(classifier_algorithm(random_state=42))
    except TypeError:
        # If not, initialize it without the random_state parameter
        classifier = OneVsRestClassifier(classifier_algorithm())

    classifier.fit(X_train, y_train)


    try:
        # Use `predict_proba` for classifiers that support it (e.g., MLPClassifier, MultinomialNB)
        y_pred_proba = classifier.predict_proba(X_test)
        threshold = 0.3
        y_pred = (y_pred_proba >= threshold).astype(int)
    except AttributeError:
        try:
            # Use `predict_log_proba` for classifiers that support it (e.g., MultinomialNB)
            y_pred_log_proba = classifier.predict_log_proba(X_test)
            y_pred_proba = np.exp(y_pred_log_proba)  # Convert log probabilities to probabilities
            threshold = 0.3
            y_pred = (y_pred_proba >= threshold).astype(int)
        except AttributeError:
            # If neither `predict_proba` nor `predict_log_proba` is available, use `predict`
            y_pred = classifier.predict(X_test)
        

    precision = precision_score(y_test, y_pred, average='samples', zero_division=1)
    recall = recall_score(y_test, y_pred, average='samples', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='samples', zero_division=1)
    accuracy = accuracy_score(y_test, y_pred)

    # Return metrics
    metrics = {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
    }
    return metrics

Since the functionn is defined, we are going to run this function with tfidf vector and on all three classiers mentioned:

In [9]:
# MultinomialNB
metrics = sdg_classify_tfid_unigram(corpus, MultinomialNB)
precision1 = metrics['precision']
recall1 = metrics['recall']
f1_score1 = metrics['f1_score']
accuracy1 = metrics['accuracy']
print(metrics)

{'precision': 0.8527707966825836, 'recall': 0.6851078280017713, 'f1_score': 0.6749792924397096, 'accuracy': 0.49434531289268663}


Ridge:

In [10]:
from sklearn.linear_model import RidgeClassifier
metrics = sdg_classify_tfid_unigram(corpus, lambda: RidgeClassifier(solver='sparse_cg'))
precision2 = metrics['precision']
recall2 = metrics['recall']
f1_score2 = metrics['f1_score']
accuracy2 = metrics['accuracy']
print(metrics)


{'precision': 0.9477046159001423, 'recall': 0.6238309657447632, 'f1_score': 0.650616124541734, 'accuracy': 0.5310379492334757}


In [11]:
# MLP
metrics = sdg_classify_tfid_unigram(corpus, lambda: MLPClassifier(solver='sgd', learning_rate_init=0.01, max_iter=3000, random_state=42))
precision3 = metrics['precision']
recall3 = metrics['recall']
f1_score3 = metrics['f1_score']
accuracy3 = metrics['accuracy']
print(metrics)

{'precision': 0.8998226801820669, 'recall': 0.8801178907518439, 'f1_score': 0.8684987619958717, 'accuracy': 0.7425232470469967}


In [12]:
data = {
    "Classifier": ["MultinomialNB", "RidgeClassifier", "MLPClassifier"],
    "Precision": [precision1, precision2, precision3],
    "Recall": [recall1, recall2, recall3],
    "F1-Score": [f1_score1, f1_score2, f1_score3],
    "Accuracy": [accuracy1, accuracy2, accuracy3]
}
# Create DataFrame
df = pd.DataFrame(data)

# Boldface numbers above 0.9
def highlight_high_values(val):
    if val > 0.9:
        return f"font-weight: bold"  # Bold formatting
    return None

styled_df = df.style.applymap(highlight_high_values, subset=["Precision", "Recall", "F1-Score"])

styled_df

  styled_df = df.style.applymap(highlight_high_values, subset=["Precision", "Recall", "F1-Score"])


Unnamed: 0,Classifier,Precision,Recall,F1-Score,Accuracy
0,MultinomialNB,0.852771,0.685108,0.674979,0.494345
1,RidgeClassifier,0.947705,0.623831,0.650616,0.531038
2,MLPClassifier,0.899823,0.880118,0.868499,0.742523


Now we try count vector with unigram only:

In [13]:
# MultinomialNB
metrics = sdg_classify_count_unigram(corpus, MultinomialNB)
precision1 = metrics['precision']
recall1 = metrics['recall']
f1_score1 = metrics['f1_score']
accuracy1 = metrics['accuracy']

# Ridge
metrics = sdg_classify_count_unigram(corpus, lambda: RidgeClassifier(solver='sparse_cg'))
precision2 = metrics['precision']
recall2 = metrics['recall']
f1_score2 = metrics['f1_score']
accuracy2 = metrics['accuracy']

# MLP
metrics = sdg_classify_count_unigram(corpus, lambda: MLPClassifier(solver='sgd', learning_rate_init=0.01, max_iter=3000, random_state=42))
precision3 = metrics['precision']
recall3 = metrics['recall']
f1_score3 = metrics['f1_score']
accuracy3 = metrics['accuracy']

data = {
    "Classifier": ["MultinomialNB", "RidgeClassifier", "MLPClassifier"],
    "Precision": [precision1, precision2, precision3],
    "Recall": [recall1, recall2, recall3],
    "F1-Score": [f1_score1, f1_score2, f1_score3],
    "Accuracy": [accuracy1, accuracy2, accuracy3]
}
df = pd.DataFrame(data)

styled_df = df.style.applymap(highlight_high_values, subset=["Precision", "Recall", "F1-Score"])

# Display in Jupyter or export to HTML
styled_df

  styled_df = df.style.applymap(highlight_high_values, subset=["Precision", "Recall", "F1-Score"])


Unnamed: 0,Classifier,Precision,Recall,F1-Score,Accuracy
0,MultinomialNB,0.509466,0.878837,0.600334,0.188364
1,RidgeClassifier,0.966847,0.474112,0.499932,0.384644
2,MLPClassifier,0.905297,0.873023,0.86505,0.749937


Then we try tfidf with bigram only:

In [14]:
# MultinomialNB
metrics = sdg_classify_tfid_bigram(corpus, MultinomialNB)
precision1 = metrics['precision']
recall1 = metrics['recall']
f1_score1 = metrics['f1_score']
accuracy1 = metrics['accuracy']

# Ridge
metrics = sdg_classify_tfid_bigram(corpus, lambda: RidgeClassifier(solver='sparse_cg'))
precision2 = metrics['precision']
recall2 = metrics['recall']
f1_score2 = metrics['f1_score']
accuracy2 = metrics['accuracy']

# MLP
metrics = sdg_classify_tfid_bigram(corpus, lambda: MLPClassifier(solver='sgd', learning_rate_init=0.01, max_iter=3000, random_state=42))
precision3 = metrics['precision']
recall3 = metrics['recall']
f1_score3 = metrics['f1_score']
accuracy3 = metrics['accuracy']

data = {
    "Classifier": ["MultinomialNB", "RidgeClassifier", "MLPClassifier"],
    "Precision": [precision1, precision2, precision3],
    "Recall": [recall1, recall2, recall3],
    "F1-Score": [f1_score1, f1_score2, f1_score3],
    "Accuracy": [accuracy1, accuracy2, accuracy3]
}
df = pd.DataFrame(data)

styled_df = df.style.applymap(highlight_high_values, subset=["Precision", "Recall", "F1-Score"])

# Display in Jupyter or export to HTML
styled_df

  styled_df = df.style.applymap(highlight_high_values, subset=["Precision", "Recall", "F1-Score"])


Unnamed: 0,Classifier,Precision,Recall,F1-Score,Accuracy
0,MultinomialNB,0.895577,0.097222,0.101383,0.070872
1,RidgeClassifier,0.996607,0.041255,0.046486,0.028525
2,MLPClassifier,0.955893,0.074221,0.075525,0.044735


Count vector and bigram:

In [15]:
# MultinomialNB
metrics = sdg_classify_count_bigram(corpus, MultinomialNB)
precision1 = metrics['precision']
recall1 = metrics['recall']
f1_score1 = metrics['f1_score']
accuracy1 = metrics['accuracy']

# Ridge
metrics = sdg_classify_count_bigram(corpus, lambda: RidgeClassifier(solver='sparse_cg'))
precision2 = metrics['precision']
recall2 = metrics['recall']
f1_score2 = metrics['f1_score']
accuracy2 = metrics['accuracy']

# MLP
metrics = sdg_classify_count_bigram(corpus, lambda: MLPClassifier(solver='sgd', learning_rate_init=0.01, max_iter=3000, random_state=42))
precision3 = metrics['precision']
recall3 = metrics['recall']
f1_score3 = metrics['f1_score']
accuracy3 = metrics['accuracy']

data = {
    "Classifier": ["MultinomialNB", "RidgeClassifier", "MLPClassifier"],
    "Precision": [precision1, precision2, precision3],
    "Recall": [recall1, recall2, recall3],
    "F1-Score": [f1_score1, f1_score2, f1_score3],
    "Accuracy": [accuracy1, accuracy2, accuracy3]
}
df = pd.DataFrame(data)

styled_df = df.style.applymap(highlight_high_values, subset=["Precision", "Recall", "F1-Score"])

# Display in Jupyter or export to HTML
styled_df

  styled_df = df.style.applymap(highlight_high_values, subset=["Precision", "Recall", "F1-Score"])


Unnamed: 0,Classifier,Precision,Recall,F1-Score,Accuracy
0,MultinomialNB,0.891819,0.101784,0.090988,0.045489
1,RidgeClassifier,0.99422,0.045286,0.050679,0.030787
2,MLPClassifier,0.953715,0.075367,0.077934,0.050013


Tfidf vector and with both unigram and bigram:

In [16]:
# MultinomialNB
metrics = sdg_classify_tfid_both(corpus, MultinomialNB)
precision1 = metrics['precision']
recall1 = metrics['recall']
f1_score1 = metrics['f1_score']
accuracy1 = metrics['accuracy']

# Ridge
metrics = sdg_classify_tfid_both(corpus, lambda: RidgeClassifier(solver='sparse_cg'))
precision2 = metrics['precision']
recall2 = metrics['recall']
f1_score2 = metrics['f1_score']
accuracy2 = metrics['accuracy']

# MLP
metrics = sdg_classify_tfid_both(corpus, lambda: MLPClassifier(solver='sgd', learning_rate_init=0.01, max_iter=3000, random_state=42))
precision3 = metrics['precision']
recall3 = metrics['recall']
f1_score3 = metrics['f1_score']
accuracy3 = metrics['accuracy']

data = {
    "Classifier": ["MultinomialNB", "RidgeClassifier", "MLPClassifier"],
    "Precision": [precision1, precision2, precision3],
    "Recall": [recall1, recall2, recall3],
    "F1-Score": [f1_score1, f1_score2, f1_score3],
    "Accuracy": [accuracy1, accuracy2, accuracy3]
}
df = pd.DataFrame(data)

styled_df = df.style.applymap(highlight_high_values, subset=["Precision", "Recall", "F1-Score"])

# Display in Jupyter or export to HTML
styled_df

  styled_df = df.style.applymap(highlight_high_values, subset=["Precision", "Recall", "F1-Score"])


Unnamed: 0,Classifier,Precision,Recall,F1-Score,Accuracy
0,MultinomialNB,0.851573,0.693625,0.681452,0.497487
1,RidgeClassifier,0.94959,0.631492,0.658599,0.53707
2,MLPClassifier,0.902421,0.877078,0.867358,0.745916


Lastly, we try count with both unigram and bigram:

In [17]:
# MultinomialNB
metrics = sdg_classify_count_both(corpus, MultinomialNB)
precision1 = metrics['precision']
recall1 = metrics['recall']
f1_score1 = metrics['f1_score']
accuracy1 = metrics['accuracy']

# Ridge
metrics = sdg_classify_count_both(corpus, lambda: RidgeClassifier(solver='sparse_cg'))
precision2 = metrics['precision']
recall2 = metrics['recall']
f1_score2 = metrics['f1_score']
accuracy2 = metrics['accuracy']

# MLP
metrics = sdg_classify_count_both(corpus, lambda: MLPClassifier(solver='sgd', learning_rate_init=0.01, max_iter=3000, random_state=42))
precision3 = metrics['precision']
recall3 = metrics['recall']
f1_score3 = metrics['f1_score']
accuracy3 = metrics['accuracy']

data = {
    "Classifier": ["MultinomialNB", "RidgeClassifier", "MLPClassifier"],
    "Precision": [precision1, precision2, precision3],
    "Recall": [recall1, recall2, recall3],
    "F1-Score": [f1_score1, f1_score2, f1_score3],
    "Accuracy": [accuracy1, accuracy2, accuracy3]
}
df = pd.DataFrame(data)

styled_df = df.style.applymap(highlight_high_values, subset=["Precision", "Recall", "F1-Score"])

# Display in Jupyter or export to HTML
styled_df

  styled_df = df.style.applymap(highlight_high_values, subset=["Precision", "Recall", "F1-Score"])


Unnamed: 0,Classifier,Precision,Recall,F1-Score,Accuracy
0,MultinomialNB,0.502684,0.876304,0.594339,0.181076
1,RidgeClassifier,0.970617,0.482272,0.508094,0.390927
2,MLPClassifier,0.905131,0.874465,0.866207,0.749812


After testing all these classifiers by combining with different preprocessing settings. I find that Ridge Classifier is the best one for precision while MLP is the most ideal one for all metrics. In terms of vectors, count vectors and tfidf vectors are similar. 
However, there is a huge difference between the unigram only and bigram only while unigram only is way better than bigram only. When using bigram only, recall, f1_score, and accuracy are all unreliable. When using both unigram and bigram, the performance is decent and similar to using unigram only.


To directly extract all the text content from these websites, I just use BeautifulSoup to extract the text 

In [18]:
import requests
from bs4 import BeautifulSoup

# Function to extract main text content from a web page
def extract_text_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Example: Extract the main content (adjust based on website structure)
    main_content = soup.find('body').get_text(separator=" ", strip=True)  # Extract body text
    return main_content

# List of URLs
urls = [
    "http://gianttortoise.org/en/beyond-tracking",
    "https://www.dhs.gov/blue-campaign/what-human-trafficking",
    "https://www.dol.gov/agencies/odep/program-areas/individuals/older-workers",
    "https://michigantoday.umich.edu/2022/08/26/positively-breaking-the-age-code/"
]

# Extract text from each URL
text_df2 = pd.DataFrame(columns=['text'])
text_df2['text'] = [extract_text_from_url(url) for url in urls]



In [19]:
text_df2['sdg_labels'] = text_df.text.apply(assign_sdg_labels)
corpus2 = text_df2[['text', 'sdg_labels']]


In the previous section, to reduce the running time of MLP, I set the min_df to 500, which is not applicable here. So I change it to 3 to make sure this function can execute well.

In [20]:
def sdg_classify_tfid_unigram2(corpus, classifier_algorithm):
 

    # Convert SDG labels into binary format
    mlb = MultiLabelBinarizer()
    y_binary = mlb.fit_transform(corpus['sdg_labels'])
    X = corpus['text']
    y = y_binary

    # Vectorize text data
    vectorizer = TfidfVectorizer(ngram_range=(1,1), min_df=3, max_features=5000, stop_words='english')
    X_transformed = vectorizer.fit_transform(X)

    # train and test split
    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=42)

    try:
        # Check if the classifier accepts a random_state parameter
        classifier = OneVsRestClassifier(classifier_algorithm(random_state=42))
    except TypeError:
        # If not, initialize it without the random_state parameter
        classifier = OneVsRestClassifier(classifier_algorithm())

    classifier.fit(X_train, y_train)


    try:
        # Use `predict_proba` for classifiers that support it (e.g., MLPClassifier, MultinomialNB)
        y_pred_proba = classifier.predict_proba(X_test)
        threshold = 0.3
        y_pred = (y_pred_proba >= threshold).astype(int)
    except AttributeError:
        try:
            # Use `predict_log_proba` for classifiers that support it (e.g., MultinomialNB)
            y_pred_log_proba = classifier.predict_log_proba(X_test)
            y_pred_proba = np.exp(y_pred_log_proba)  # Convert log probabilities to probabilities
            threshold = 0.3
            y_pred = (y_pred_proba >= threshold).astype(int)
        except AttributeError:
            # If neither `predict_proba` nor `predict_log_proba` is available, use `predict`
            y_pred = classifier.predict(X_test)
        

    precision = precision_score(y_test, y_pred, average='samples', zero_division=1)
    recall = recall_score(y_test, y_pred, average='samples', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='samples', zero_division=1)
    accuracy = accuracy_score(y_test, y_pred)

    # Return metrics
    metrics = {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
    }
    return metrics


In [21]:
# MultinomialNB
metrics = sdg_classify_tfid_unigram2(corpus2, MultinomialNB)
precision1 = metrics['precision']
recall1 = metrics['recall']
f1_score1 = metrics['f1_score']
accuracy1 = metrics['accuracy']

# Ridge
metrics = sdg_classify_tfid_unigram2(corpus2, lambda: RidgeClassifier(solver='sparse_cg'))
precision2 = metrics['precision']
recall2 = metrics['recall']
f1_score2 = metrics['f1_score']
accuracy2 = metrics['accuracy']

# MLP
metrics = sdg_classify_tfid_unigram2(corpus2, lambda: MLPClassifier(solver='sgd', learning_rate_init=0.01, max_iter=3000, random_state=42))
precision3 = metrics['precision']
recall3 = metrics['recall']
f1_score3 = metrics['f1_score']
accuracy3 = metrics['accuracy']

data = {
    "Classifier": ["MultinomialNB", "RidgeClassifier", "MLPClassifier"],
    "Precision": [precision1, precision2, precision3],
    "Recall": [recall1, recall2, recall3],
    "F1-Score": [f1_score1, f1_score2, f1_score3],
    "Accuracy": [accuracy1, accuracy2, accuracy3]
}
df = pd.DataFrame(data)

styled_df = df.style.applymap(highlight_high_values, subset=["Precision", "Recall", "F1-Score"])

# Display in Jupyter or export to HTML
styled_df

  styled_df = df.style.applymap(highlight_high_values, subset=["Precision", "Recall", "F1-Score"])


Unnamed: 0,Classifier,Precision,Recall,F1-Score,Accuracy
0,MultinomialNB,0.0,0.0,0.0,0.0
1,RidgeClassifier,0.0,0.0,0.0,0.0
2,MLPClassifier,0.0,0.0,0.0,0.0


Since the content from these websites are not SDG-related, it shows that all classfiers are not ideal since I use SDGs as my labels.