In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.preprocessing import LabelEncoder
import requests
from bs4 import BeautifulSoup
import matplotlib as mpl
from sklearn import metrics
from sklearn import preprocessing


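Preliminary data loading, cleaning, and vectorization. This cell loads the OSDG community dataset, keeps only the texts whose labels are reliable (agreement above 0.5 and more than two net positive votes), and builds a bag-of-words count matrix to inspect term frequencies. It also stores the SDG label of each text in `sdg_num`, which later cells use as the classification target.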

In [2]:
# Load the SDG name/definition table and the OSDG community dataset (tab-separated).
data_dir = "/Users/christian fink/Math485/Math485_2/"
sdg_names = pd.read_csv(data_dir + "sdg_name_definition.csv")
text_file_name = "osdg-community-data-v2024-04-01.csv"
text_df = pd.read_csv(data_dir + text_file_name, sep="\t", quotechar='"')

# Drop the first column; a non-inplace drop produces a new frame instead of
# mutating the one just read.
text_df = text_df.drop(columns=text_df.columns.values[0])

# Keep rows with agreement above 0.5 and more than two net positive labels.
text_df = text_df.query("agreement > 0.5 and (labels_positive - labels_negative) > 2").reset_index(drop=True)
corpus = text_df.text

# Bag-of-words counts over the filtered corpus, English stop words removed.
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit(corpus)
# NOTE(review): .toarray() materialises a dense documents-by-vocabulary matrix,
# which can be very large; only the column sums are used below.
count_vector = count_vectorizer.transform(corpus).toarray()
count_vector_df = pd.DataFrame(count_vector, columns=count_vectorizer.get_feature_names_out())

# Total occurrences of every term, most frequent first. sort_values returns a
# new frame, so the result has to be assigned for the ordering to take effect.
term_freq = pd.DataFrame({"term": count_vector_df.columns.values, "freq": count_vector_df.sum(axis=0)})
term_freq = term_freq.sort_values(by="freq", ascending=False)

# SDG label of every text; used as the classification target in later cells.
sdg_num = text_df.sdg

`sdg_classifier` takes our text data (`corpus`), the algorithm to use (`classifier_algorithm`), the vectorizer type (`vectorizer_type`), the n-gram range (`ngram_range`: unigrams, bigrams, or both), the stop-word setting (`stop_words`), and the `min_df` value. It splits the data into training and testing partitions, vectorizes the text according to those settings, trains the selected classifier, and returns the precision, recall, F1 score, and accuracy measured on the testing partition.

In [3]:
def sdg_classifier(corpus, classifier_algorithm, vectorizer_type='count', ngram_range=(1,1), stop_words='english', min_df=2, labels=None):
    """Train a text classifier on a 75/25 split and score it on the held-out part.

    Args:
        corpus: Collection of text documents.
        classifier_algorithm: Either one of the names 'multinomialnb', 'mlp'
            or 'ridge' (case-insensitive), or an estimator object exposing
            ``fit`` and ``predict``.
        vectorizer_type: 'count' selects ``CountVectorizer``; any other value
            selects ``TfidfVectorizer``.
        ngram_range: Passed through to the vectorizer.
        stop_words: Passed through to the vectorizer.
        min_df: Passed through to the vectorizer.
        labels: Target labels aligned with ``corpus``. When omitted, the
            notebook-level ``sdg_num`` is used, as before.

    Returns:
        dict with the weighted 'precision', 'recall' and 'f1_score', plus the
        'accuracy', all computed on the test partition.

    Raises:
        ValueError: If ``classifier_algorithm`` is a string that is not one of
            the supported names.
    """
    # Resolve the classifier before any expensive work so that a misspelled
    # name fails immediately with a clear message instead of leaving `clf`
    # unbound after the vectorizers have already been fitted.
    if isinstance(classifier_algorithm, str):
        algorithm_name = classifier_algorithm.lower()
        if algorithm_name == 'multinomialnb':
            clf = MultinomialNB()
        elif algorithm_name == 'mlp':
            # max_iter is kept at 5 so results stay comparable with the
            # numbers already reported in this notebook.
            clf = MLPClassifier(max_iter=5, random_state=8)
        elif algorithm_name == 'ridge':
            clf = RidgeClassifier(alpha=1, solver='auto', max_iter=5)
        else:
            raise ValueError(
                f"Unknown classifier_algorithm {classifier_algorithm!r}; "
                "expected 'multinomialnb', 'mlp', 'ridge', or an estimator instance."
            )
    else:
        clf = classifier_algorithm

    if labels is None:
        labels = sdg_num

    X_train, X_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.25, random_state=8)

    # Encode the labels as consecutive integers; the encoder is fitted on the
    # training labels only.
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    if vectorizer_type == 'count':
        vectorizer = CountVectorizer(
            ngram_range=ngram_range,
            stop_words=stop_words,
            min_df=min_df
        )
    else:
        vectorizer = TfidfVectorizer(
            ngram_range=ngram_range,
            stop_words=stop_words,
            min_df=min_df
        )

    # The vocabulary is learned from the training texts only and then applied
    # to the test texts.
    X_train_vector = vectorizer.fit_transform(X_train)
    X_test_vector = vectorizer.transform(X_test)

    clf.fit(X_train_vector, y_train)
    y_pred = clf.predict(X_test_vector)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
    }


This cell contains all the configurations/combinations of vectorizer_type, ngram_range, and min_df we select. It then runs these selected configurations and appends the results into a table, printing a "B: " next to the best value in each column.

In [4]:
# Experiment grid: both vectorizers crossed with unigram, bigram and mixed
# n-gram ranges, all with min_df = 2.
configurations = [
    {'vectorizer': vec_name, 'ngram_range': grams, 'min_df': 2}
    for vec_name in ('count', 'tfidf')
    for grams in ((1, 1), (2, 2), (1, 2))
]

results = []

for config in configurations:
    vectorizer_type = config['vectorizer']
    ngram_range = config['ngram_range']
    min_df = config['min_df']

    # Settings shared by the three classifier runs of this configuration.
    shared_kwargs = dict(vectorizer_type=vectorizer_type,
                         ngram_range=ngram_range,
                         min_df=min_df)

    # run1 = MultinomialNB, run2 = MLP, run3 = Ridge (evaluated in that order).
    run1, run2, run3 = (
        sdg_classifier(corpus, classifier_algorithm=algorithm, **shared_kwargs)
        for algorithm in ('multinomialnb', 'mlp', 'ridge')
    )

    row = {
        'vectorizer': vectorizer_type,
        'ngram_range': str(ngram_range),
        'min_df': min_df,
    }
    # One group of three columns per metric, in the order
    # precision, recall, f1, accuracy.
    for column_suffix, result_key in (('precision', 'precision'),
                                      ('recall', 'recall'),
                                      ('f1', 'f1_score'),
                                      ('accuracy', 'accuracy')):
        row[f'MultinomialNB_{column_suffix}'] = run1[result_key]
        row[f'MLP_{column_suffix}'] = run2[result_key]
        row[f'Ridge_{column_suffix}'] = run3[result_key]
    results.append(row)

results_df = pd.DataFrame(results)
def highlight_best(s):
    """Render every value of ``s`` as a string, prefixing the maximum with "B: ".

    Every entry equal to the maximum is marked, so ties all receive the prefix.
    Returns a list of strings in the same order as ``s``.
    """
    best = s.max()
    rendered = []
    for value in s:
        text = str(value)
        if value == best:
            text = "B: " + text
        rendered.append(text)
    return rendered

# Work on a copy so the numeric results_df stays available for further analysis.
highlighted_df = results_df.copy()

# Named `metric_names` rather than `metrics` so the `sklearn.metrics` module
# imported at the top of the notebook is not shadowed by a list.
metric_names = ['precision', 'recall', 'f1', 'accuracy']
for metric in metric_names:
    metric_columns = [f'MultinomialNB_{metric}', f'MLP_{metric}', f'Ridge_{metric}']
    # apply() runs column by column, so "B: " marks the best configuration
    # within each classifier's column.
    highlighted_df[metric_columns] = highlighted_df[metric_columns].apply(highlight_best)

highlighted_df




Unnamed: 0,vectorizer,ngram_range,min_df,MultinomialNB_precision,MLP_precision,Ridge_precision,MultinomialNB_recall,MLP_recall,Ridge_recall,MultinomialNB_f1,MLP_f1,Ridge_f1,MultinomialNB_accuracy,MLP_accuracy,Ridge_accuracy
0,count,"(1, 1)",2,B: 0.8494199484561036,0.8784494044251351,0.8118893628429225,B: 0.8481604342581424,0.8784680337756333,0.8098612786489746,B: 0.8456923177718398,0.8778208064328132,0.8098842263592053,B: 0.8481604342581424,0.8784680337756333,0.8098612786489746
1,count,"(2, 2)",2,0.8140283992558922,0.8238964243693374,0.7922129944947116,0.8097104945717732,0.8223763570566948,0.794481302774427,0.8035533293809642,0.8188390950274772,0.7908925083612142,0.8097104945717732,0.8223763570566948,0.794481302774427
2,count,"(1, 2)",2,0.8411318516598572,B: 0.890916419587095,0.8738489843174359,0.8297647768395657,B: 0.8911338962605548,0.8745476477683957,0.8197638370231969,B: 0.8903803653272887,0.8731847549677968,0.8297647768395657,B: 0.8911338962605548,0.8745476477683957
3,tfidf,"(1, 1)",2,0.8111105755736483,0.8824680105743974,0.8801440619074978,0.7454764776839565,0.882388419782871,0.8817852834740652,0.7230167700650596,0.8813329753768443,0.8798864388908068,0.7454764776839565,0.882388419782871,0.8817852834740652
4,tfidf,"(2, 2)",2,0.7797434892619032,0.8304058821227179,0.8212857016687594,0.6949638118214716,0.8300663449939686,0.821773220747889,0.6652111610792425,0.8265588914674353,0.8177049415636208,0.6949638118214716,0.8300663449939686,0.821773220747889
5,tfidf,"(1, 2)",2,0.8011519053176797,0.8900795592839371,B: 0.8886074919564034,0.7003920386007237,0.8905307599517491,B: 0.889475271411339,0.671296226355045,0.8893836734667491,B: 0.8874442666981663,0.7003920386007237,0.8905307599517491,B: 0.889475271411339


Provided Links  
With min_df = 3

| Bigram with Count Vectors     | MultinomialNB | MLP  | Ridge |
|:------------------------------|:--------------|:-----|:------|
| Precision                     | .814          | **.824** | .792  |
| Recall                        | .810          | **.822** | .794  |
| F1                            | .804          | **.819** | .791  |
| Accuracy                      | .810          | **.822** | .794  |
| Unigram with Count Vectors    |               |      |       |
| Precision                     | .849          | **.878** | .811  |
| Recall                        | .848          | **.878** | .810  |
| F1                            | .846          | **.878** | .810  |
| Accuracy                      | .848          | **.878** | .810  |
| Mixed Gram with Count Vectors |               |      |       |
| Precision                     | .841          | **.891** | .874  |
| Recall                        | .830          | **.891** | .875  |
| F1                            | .820          | **.890** | .873  |
| Accuracy                      | .830          | **.891** | .875  |
| Bigram with tfidf Vectors     |               |      |       |
| Precision                     | .780          | **.830** | .821  |
| Recall                        | .695          | **.830** | .822  |
| F1                            | .665          | **.827** | .818  |
| Accuracy                      | .695          | **.830** | .822  |
| Unigram with tfidf Vectors    |               |      |       |
| Precision                     | .811          | **.882** | .880  |
| Recall                        | .745          | **.882** | **.882**  |
| F1                            | .723          | **.881** | .880  |
| Accuracy                      | .745          | **.882** | **.882**  |
| Mixed Gram with tfidf Vectors |               |      |       |
| Precision                     | .801          | **.890** | .889  |
| Recall                        | .700          | **.891** | .889  |
| F1                            | .671          | **.890** | .887  |
| Accuracy                      | .700          | **.891** | .889  |

**Bigram with Count Vectors (MultinomialNB):**
This model achieves a precision of 0.814 and recall of 0.810, leading to an F1 score of 0.804. The accuracy is 0.810, indicating a strong balance between precision and recall for this setup.

**Unigram with Count Vectors (MultinomialNB):**
With a precision of 0.849, recall of 0.848, and F1 score of 0.846, this model slightly outperforms the bigram setup. The accuracy is also higher at 0.848, making it a more reliable approach for classification.

**Mixed Gram with Count Vectors (MultinomialNB):**
This model performs comparably to the unigram approach, with precision at 0.841, recall at 0.830, and an F1 score of 0.820. Accuracy is 0.830, showing strong performance but slightly lower than the unigram model.

**Bigram with TF-IDF Vectors (MultinomialNB):**
This approach struggles in comparison, achieving a precision of 0.780 and recall of 0.695, resulting in an F1 score of 0.665. The accuracy is also lower at 0.695, suggesting limited effectiveness in distinguishing between classes.

**Unigram with TF-IDF Vectors (MultinomialNB):**
The model achieves a precision of 0.811, recall of 0.745, and an F1 score of 0.723. The accuracy is 0.745, indicating modest improvements over the bigram setup with TF-IDF vectors but still underperforming compared to the count vector approaches.

**Mixed Gram with TF-IDF Vectors (MultinomialNB):**
This model offers a slight improvement over the bigram TF-IDF setup but falls short of the unigram TF-IDF setup, with a precision of 0.801, recall of 0.700, and an F1 score of 0.671. The accuracy is 0.700, reflecting moderate classification performance.

The following code extracts the text from each URL in the `urls` list and splits it into sentences. The sentences are stored in the DataFrame **corpus2_df**, whose `text` column is also available as the Series **corpus2**.

In [None]:
def extract_text_from_url(url, timeout=30):
    """Download a web page and return the text of its <p> elements as sentences.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the notebook indefinitely.

    Returns:
        List of non-empty, whitespace-stripped sentence strings.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            that an error page is never classified as if it were content.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract each paragraph's text once and keep only the non-empty ones.
    paragraph_texts = (p.get_text(strip=True) for p in soup.find_all('p'))
    paragraphs = [text for text in paragraph_texts if text]
    full_text = ' '.join(paragraphs)

    # NOTE: splitting on '.' is a simple heuristic; text ending in '!' or '?'
    # is not split, and abbreviations or decimals are split.
    sentences = [s.strip() for s in full_text.split('.') if s.strip()]
    return sentences

# Pages whose main text is classified later in the notebook.
urls = [
    "http://gianttortoise.org/en/beyond-tracking",
    "https://www.dhs.gov/blue-campaign/what-human-trafficking",
    "https://www.dol.gov/agencies/odep/program-areas/individuals/older-workers",
    "https://michigantoday.umich.edu/2022/08/26/positively-breaking-the-age-code/"
]

# Collect the sentences of every page into one flat list, in URL order.
all_sentences = [
    sentence
    for url in urls
    for sentence in extract_text_from_url(url)
]

# One row per sentence; corpus2 is the 'text' column as a Series.
corpus2_df = pd.DataFrame({'text': all_sentences})
corpus2 = corpus2_df['text']

In [46]:
# Preview the scraped sentences; pandas abbreviates the middle of a long Series.
corpus2

0      Having discovered some of the mechanisms gover...
1      We knew that food availability influenced tort...
2      Previous studies had been conducted in the 198...
3      We spent several hundred hours observing torto...
4      We also studied how plant communities change a...
                             ...                        
163    I hope individuals in the UM system review thi...
164              Reply Thanks for this wonderful article
165    It gives hope to all of us! Yes, rethink age: ...
166    Everybody has different genes, so overcome ste...
167    Reply Please enable JavaScript to submit this ...
Name: text, Length: 168, dtype: object

We can take this extracted text (now separated into sentences) and pass it into a function that labels each sentence with one of the SDG labels supplied by the UN. Each classifier algorithm may assign different labels even though all of them are trained on the same data.

In [43]:
def sdg_classifier2(train_corpus, corpus2, train_label, classifier_algorithm='multinomialnb', vectorizer_type='count', ngram_range=(1,1), min_df=3):
    """Train on the full labelled corpus and predict labels for new texts.

    Args:
        train_corpus: Text documents used for training.
        corpus2: Unlabelled text documents to classify.
        train_label: Labels aligned with ``train_corpus``.
        classifier_algorithm: Either one of the names 'multinomialnb', 'mlp'
            or 'ridge' (case-insensitive), or an estimator object exposing
            ``fit`` and ``predict``.
        vectorizer_type: 'count' selects ``CountVectorizer``; any other value
            selects ``TfidfVectorizer``.
        ngram_range: Passed through to the vectorizer.
        min_df: Passed through to the vectorizer.

    Returns:
        Array of predicted labels for ``corpus2``, expressed in the original
        label values of ``train_label``.

    Raises:
        ValueError: If ``classifier_algorithm`` is a string that is not one of
            the supported names.
    """
    # Resolve the classifier up front. Previously an unrecognised string fell
    # through to the "estimator instance" branch and failed later when `.fit`
    # was called on a str.
    if isinstance(classifier_algorithm, str):
        algorithm_name = classifier_algorithm.lower()
        if algorithm_name == 'multinomialnb':
            clf = MultinomialNB()
        elif algorithm_name == 'mlp':
            clf = MLPClassifier(max_iter=5, random_state=8)
        elif algorithm_name == 'ridge':
            clf = RidgeClassifier(alpha=1, solver='auto', max_iter=5)
        else:
            raise ValueError(
                f"Unknown classifier_algorithm {classifier_algorithm!r}; "
                "expected 'multinomialnb', 'mlp', 'ridge', or an estimator instance."
            )
    else:
        clf = classifier_algorithm

    # Encode labels as consecutive integers for training; decoded again below.
    le = LabelEncoder()
    y_train = le.fit_transform(train_label)

    if vectorizer_type == 'count':
        vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english', min_df=min_df)
    else:
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, stop_words='english', min_df=min_df)

    # The vocabulary comes from the training corpus only; terms that appear
    # solely in corpus2 are ignored by transform().
    X_train_vector = vectorizer.fit_transform(train_corpus)
    X_corpus2_vector = vectorizer.transform(corpus2)

    clf.fit(X_train_vector, y_train)

    y_pred = clf.predict(X_corpus2_vector)

    # Map the integer predictions back to the original label values.
    predicted_labels = le.inverse_transform(y_pred)

    return predicted_labels

In [None]:
# Same grid as the evaluation run, but with min_df = 3: both vectorizers
# crossed with unigram, bigram and mixed n-gram ranges.
configurations = [
    {'vectorizer': vec_name, 'ngram_range': grams, 'min_df': 3}
    for vec_name in ('count', 'tfidf')
    for grams in ((1, 1), (2, 2), (1, 2))
]

results = []

for config in configurations:
    vectorizer_type = config['vectorizer']
    ngram_range = config['ngram_range']
    min_df = config['min_df']

    # Arguments shared by the three classifier runs of this configuration.
    shared_kwargs = dict(train_corpus=corpus, corpus2=corpus2, train_label=sdg_num,
                         vectorizer_type=vectorizer_type,
                         ngram_range=ngram_range, min_df=min_df)

    # run1 = MultinomialNB, run2 = MLP, run3 = Ridge (executed in that order).
    run1, run2, run3 = (
        sdg_classifier2(classifier_algorithm=algorithm, **shared_kwargs)
        for algorithm in ('multinomialnb', 'mlp', 'ridge')
    )

    # Each *_predicted_labels cell holds the full array of predictions for
    # the scraped sentences.
    results.append({
        'vectorizer': vectorizer_type,
        'ngram_range': str(ngram_range),
        'min_df': min_df,
        'MultinomialNB_predicted_labels': run1,
        'MLP_predicted_labels': run2,
        'Ridge_predicted_labels': run3,
    })

results_df1 = pd.DataFrame(results)




In [45]:
# A bare last expression uses the notebook's rich table display instead of the
# line-wrapped plain text produced by print().
results_df1

  vectorizer ngram_range  min_df  \
0      count      (1, 1)       3   
1      count      (2, 2)       3   
2      count      (1, 2)       3   
3      tfidf      (1, 1)       3   
4      tfidf      (2, 2)       3   
5      tfidf      (1, 2)       3   

                      MultinomialNB_predicted_labels  \
0  [15, 2, 14, 14, 15, 15, 2, 15, 2, 2, 2, 15, 8,...   
1  [9, 2, 16, 15, 16, 15, 16, 15, 15, 15, 15, 16,...   
2  [16, 2, 14, 15, 15, 15, 3, 15, 2, 15, 15, 15, ...   
3  [16, 2, 16, 15, 15, 15, 3, 15, 2, 15, 15, 15, ...   
4  [9, 2, 16, 15, 16, 15, 16, 15, 16, 15, 15, 16,...   
5  [16, 2, 16, 15, 15, 15, 5, 15, 2, 15, 15, 15, ...   

                                MLP_predicted_labels  \
0  [4, 2, 15, 7, 15, 15, 8, 15, 2, 15, 15, 15, 8,...   
1  [9, 2, 16, 15, 9, 15, 9, 15, 14, 15, 15, 9, 9,...   
2  [4, 2, 15, 15, 7, 15, 2, 15, 2, 15, 15, 15, 8,...   
3  [15, 2, 9, 15, 15, 15, 8, 15, 2, 15, 2, 15, 11...   
4  [9, 2, 16, 15, 9, 15, 9, 15, 14, 15, 15, 9, 9,...   
5  [15, 2, 15, 15,

 Assignment: Take the main text content from these pages, feed it into your classifier, and see how your model classifies it. Are the classifications reasonable? Find a case where the classification is not reasonable and explain what the model does that leads to the non-ideal classification.
 ------------------------------------------------------------------------------------------------------------------------------------------------