In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.preprocessing import LabelEncoder
import requests
from bs4 import BeautifulSoup
import matplotlib as mpl
from sklearn import metrics
from sklearn import preprocessing


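Preliminary data loading, cleaning, and vectorization. This cell loads the OSDG community dataset, keeps only the texts whose labels are reliable (agreement above 0.5 and more than two net positive votes), builds a term-frequency table over the remaining texts, and stores each text's SDG label in `sdg_num` so that every text can later be paired with its SDG.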

In [15]:
# NOTE(review): absolute, machine-specific path -- point this at your local copy of the data.
data_dir = "/Users/luke/GitMath/"
sdg_names = pd.read_csv(data_dir + "sdg_name_definition.csv")
text_file_name = "osdg-community-data-v2024-04-01.csv"
text_df = pd.read_csv(data_dir + text_file_name, sep="\t", quotechar='"')

# Drop the first column of the file. Reassigning instead of using inplace=True
# keeps every step an explicit "new frame from old frame" transformation.
text_df = text_df.drop(columns=text_df.columns[0])

# Keep only reliably labelled texts: agreement above 0.5 and more than two net positive votes.
text_df = text_df.query("agreement > 0.5 and (labels_positive - labels_negative) > 2").reset_index(drop=True)
corpus = text_df.text

# Unigram counts over the whole corpus (English stop words removed).
# NOTE: .toarray() materialises a dense documents x vocabulary matrix, which can be very large.
count_vectorizer = CountVectorizer(stop_words='english')
count_vector = count_vectorizer.fit_transform(corpus).toarray()
count_vector_df = pd.DataFrame(count_vector, columns=count_vectorizer.get_feature_names_out())

# Total occurrences of each term, most frequent first.
# sort_values returns a new frame, so the result has to be assigned -- the
# original call discarded it and left term_freq unsorted.
term_freq = pd.DataFrame({"term": count_vector_df.columns.values, "freq": count_vector_df.sum(axis=0)})
term_freq = term_freq.sort_values(by="freq", ascending=False)

# SDG label of every text, aligned row-for-row with `corpus`.
sdg_num = text_df.sdg

`sdg_classifier` takes our data (`corpus`), the classifier we want to use (`classifier_algorithm`), the vectorizer type (`vectorizer_type`), the n-gram range (`ngram_range`: unigrams, bigrams, or both), and the `min_df` value. It splits the data into training (75%) and testing (25%) partitions, vectorizes the text with the chosen settings, trains the selected classifier on the training partition, and returns the weighted precision, recall, and F1 score together with the accuracy on the testing partition.

In [16]:
def sdg_classifier(corpus, classifier_algorithm, vectorizer_type='count', ngram_range=(1,1), stop_words='english', min_df=2, labels=None):
    """Train a text classifier on a 75/25 split of ``corpus`` and score it on the held-out part.

    Parameters
    ----------
    corpus : sequence of str
        The documents to classify.
    classifier_algorithm : str or estimator
        ``'multinomialnb'``, ``'mlp'`` or ``'ridge'`` (case-insensitive), or an
        already constructed estimator exposing ``fit``/``predict``. An estimator
        passed in is fitted in place.
    vectorizer_type : str
        ``'count'`` selects ``CountVectorizer``; any other value selects
        ``TfidfVectorizer``.
    ngram_range : tuple
        Forwarded to the vectorizer, e.g. ``(1, 1)``, ``(2, 2)`` or ``(1, 2)``.
    stop_words : str or list or None
        Forwarded to the vectorizer.
    min_df : int or float
        Forwarded to the vectorizer.
    labels : sequence, optional
        One label per document in ``corpus``. When omitted, the notebook-level
        ``sdg_num`` series is used, which is only correct for the notebook-level
        ``corpus``.

    Returns
    -------
    dict
        ``precision``, ``recall`` and ``f1_score`` (weighted averages over the
        classes) and ``accuracy``, all measured on the test partition.

    Raises
    ------
    ValueError
        If ``classifier_algorithm`` is a string that names no known classifier.
    """
    # Resolve the classifier before any expensive work so that a typo in the
    # name fails immediately with a clear message instead of leaving `clf` unbound.
    if isinstance(classifier_algorithm, str):
        algorithm_name = classifier_algorithm.lower()
        if algorithm_name == 'multinomialnb':
            clf = MultinomialNB()
        elif algorithm_name == 'mlp':
            clf = MLPClassifier(max_iter=5, random_state=8)
        elif algorithm_name == 'ridge':
            clf = RidgeClassifier(alpha=1, solver='auto', max_iter=5)
        else:
            raise ValueError(
                f"Unknown classifier_algorithm {classifier_algorithm!r}; "
                "expected 'multinomialnb', 'mlp', 'ridge', or an estimator instance."
            )
    else:
        clf = classifier_algorithm

    if labels is None:
        # Backward-compatible default: the labels loaded at the top of the notebook.
        labels = sdg_num

    # Fixed random_state keeps the split identical across runs and configurations.
    X_train, X_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.25, random_state=8)

    # Encode the labels as 0..n_classes-1; the encoder is fitted on the training labels only.
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    vectorizer_class = CountVectorizer if vectorizer_type == 'count' else TfidfVectorizer
    vectorizer = vectorizer_class(
        ngram_range=ngram_range,
        stop_words=stop_words,
        min_df=min_df
    )

    # The vocabulary is learned from the training texts only, then applied to the test texts.
    X_train_vector = vectorizer.fit_transform(X_train)
    X_test_vector = vectorizer.transform(X_test)

    clf.fit(X_train_vector, y_train)
    y_pred = clf.predict(X_test_vector)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
    }


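This cell contains all the configurations (combinations of vectorizer_type, ngram_range, and min_df) we selected. It runs every configuration with each of the three classifiers and collects the results into a table, prefixing the best value in each column with "B: ". Note that the cell is currently wrapped in a triple-quoted string, so it is disabled: running it only echoes its own source text and performs no training.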

In [17]:
# Disabled cell: everything below is wrapped in a triple-quoted string, so Python
# evaluates it as a plain string expression and none of the code inside runs.
'''
configurations = [
    {'vectorizer': 'count', 'ngram_range': (1, 1), 'min_df': 2},
    {'vectorizer': 'count', 'ngram_range': (2, 2), 'min_df': 2},
    {'vectorizer': 'count', 'ngram_range': (1, 2), 'min_df': 2},
    {'vectorizer': 'tfidf', 'ngram_range': (1, 1), 'min_df': 2},
    {'vectorizer': 'tfidf', 'ngram_range': (2, 2), 'min_df': 2},
    {'vectorizer': 'tfidf', 'ngram_range': (1, 2), 'min_df': 2}
]

results = []

for config in configurations:
    vectorizer_type = config['vectorizer']
    ngram_range = config['ngram_range']
    min_df = config['min_df']
    
    run1 = sdg_classifier(corpus, classifier_algorithm='multinomialnb', 
                           vectorizer_type=vectorizer_type, 
                           ngram_range=ngram_range, min_df=min_df)
    
    run2 = sdg_classifier(corpus, classifier_algorithm='mlp', 
                           vectorizer_type=vectorizer_type, 
                           ngram_range=ngram_range, min_df=min_df)
    
    run3 = sdg_classifier(corpus, classifier_algorithm='ridge', 
                           vectorizer_type=vectorizer_type, 
                           ngram_range=ngram_range, min_df=min_df)
    
    results.append({
        'vectorizer': vectorizer_type,
        'ngram_range': str(ngram_range),
        'min_df': min_df,
        'MultinomialNB_precision': run1['precision'],
        'MLP_precision': run2['precision'],
        'Ridge_precision': run3['precision'],
        'MultinomialNB_recall': run1['recall'],
        'MLP_recall': run2['recall'],
        'Ridge_recall': run3['recall'],
        'MultinomialNB_f1': run1['f1_score'],
        'MLP_f1': run2['f1_score'],
        'Ridge_f1': run3['f1_score'],
        'MultinomialNB_accuracy': run1['accuracy'],
        'MLP_accuracy': run2['accuracy'],
        'Ridge_accuracy': run3['accuracy']
    })

results_df = pd.DataFrame(results)
def highlight_best(s):
    return ["B: " + str(v) if v == s.max() else str(v) for v in s]

highlighted_df = results_df.copy()
metrics = ['precision', 'recall', 'f1', 'accuracy']
for metric in metrics:
    highlighted_df[[f'MultinomialNB_{metric}', f'MLP_{metric}', f'Ridge_{metric}']] = highlighted_df[[f'MultinomialNB_{metric}', f'MLP_{metric}', f'Ridge_{metric}']].apply(highlight_best)

highlighted_df
'''

'\nconfigurations = [\n    {\'vectorizer\': \'count\', \'ngram_range\': (1, 1), \'min_df\': 2},\n    {\'vectorizer\': \'count\', \'ngram_range\': (2, 2), \'min_df\': 2},\n    {\'vectorizer\': \'count\', \'ngram_range\': (1, 2), \'min_df\': 2},\n    {\'vectorizer\': \'tfidf\', \'ngram_range\': (1, 1), \'min_df\': 2},\n    {\'vectorizer\': \'tfidf\', \'ngram_range\': (2, 2), \'min_df\': 2},\n    {\'vectorizer\': \'tfidf\', \'ngram_range\': (1, 2), \'min_df\': 2}\n]\n\nresults = []\n\nfor config in configurations:\n    vectorizer_type = config[\'vectorizer\']\n    ngram_range = config[\'ngram_range\']\n    min_df = config[\'min_df\']\n    \n    run1 = sdg_classifier(corpus, classifier_algorithm=\'multinomialnb\', \n                           vectorizer_type=vectorizer_type, \n                           ngram_range=ngram_range, min_df=min_df)\n    \n    run2 = sdg_classifier(corpus, classifier_algorithm=\'mlp\', \n                           vectorizer_type=vectorizer_type, \n              

Provided Links  
With min_df = 3

| Bigram with Count Vectors     | MultinomialNB | MLP  | Ridge |
|:------------------------------|:--------------|:-----|:------|
| Precision                     | .814          | **.824** | .792  |
| Recall                        | .810          | **.822** | .794  |
| F1                            | .804          | **.819** | .791  |
| Accuracy                      | .810          | **.822** | .794  |
| Unigram with Count Vectors    |               |      |       |
| Precision                     | .849          | **.878** | .811  |
| Recall                        | .848          | **.878** | .810  |
| F1                            | .846          | **.879** | .810  |
| Accuracy                      | .848          | **.878** | .810  |
| Mixed Gram with Count Vectors |               |      |       |
| Precision                     | .841          | **.891** | .874  |
| Recall                        | .830          | **.891** | .875  |
| F1                            | .820          | **.890** | .873  |
| Accuracy                      | .830          | **.891** | .875  |
| Bigram with tfidf Vectors     |               |      |       |
| Precision                     | .780          | **.830** | .821  |
| Recall                        | .695          | **.830** | .821  |
| F1                            | .665          | **.827** | .818  |
| Accuracy                      | .695          | **.830** | .822  |
| Unigram with tfidf Vectors    |               |      |       |
| Precision                     | .811          | **.882** | .880  |
| Recall                        | .745          | **.882** | **.882**  |
| F1                            | .723          | **.881** | .880  |
| Accuracy                      | .745          | **.882** | **.882**  |
| Mixed Gram with tfidf Vectors |               |      |       |
| Precision                     | .801          | **.890** | .889  |
| Recall                        | .700          | **.891** | .889  |
| F1                            | .671          | **.890** | .887  |
| Accuracy                      | .700          | **.891** | .889  |

**Bigram with Count Vectors (MultinomialNB):**
This model achieves a precision of 0.814 and recall of 0.810, leading to an F1 score of 0.804. The accuracy is 0.810, indicating a strong balance between precision and recall for this setup.

**Unigram with Count Vectors (MultinomialNB):**
With a precision of 0.849, recall of 0.848, and F1 score of 0.846, this model slightly outperforms the bigram setup. The accuracy is also higher at 0.848, making it a more reliable approach for classification.

**Mixed Gram with Count Vectors (MultinomialNB):**
This model performs comparably to the unigram approach, with precision at 0.841, recall at 0.830, and an F1 score of 0.820. Accuracy is 0.830, showing strong performance but slightly lower than the unigram model.

**Bigram with TF-IDF Vectors (MultinomialNB):**
This approach struggles in comparison, achieving a precision of 0.780 and recall of 0.695, resulting in an F1 score of 0.665. The accuracy is also lower at 0.695, suggesting limited effectiveness in distinguishing between classes.

**Unigram with TF-IDF Vectors (MultinomialNB):**
The model achieves a precision of 0.811, recall of 0.745, and an F1 score of 0.723. The accuracy is 0.745, indicating modest improvements over the bigram setup with TF-IDF vectors but still underperforming compared to the count vector approaches.

**Mixed Gram with TF-IDF Vectors (MultinomialNB):**
This model offers a slight improvement over the bigram TF-IDF setup but falls short of the unigram TF-IDF setup, with a precision of 0.801, recall of 0.700, and an F1 score of 0.671. The accuracy is 0.700, reflecting moderate classification performance.

The following code extracts the text from each URL in the `urls` list. For every page it joins the text of all paragraphs into a single string, creating the DataFrame **corpus2_df** with one row per successfully fetched page; its `text` column is exposed as the Series **corpus2**.

In [31]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_full_text_from_url(url):
    """Download a web page and return the text of its ``<p>`` elements as one string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    str or None
        The non-empty paragraphs joined by single spaces (an empty string when
        the page has no paragraph text), or ``None`` when the request or the
        parsing fails. Failures are reported with ``print`` rather than raised,
        so one bad URL does not stop the rest of the notebook.
    """
    try:
        # Send a GET request to the website; the timeout keeps a dead host from hanging the cell
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        paragraphs = []
        for p in soup.find_all('p'):
            # A paragraph is made of several text fragments when it contains
            # inline tags (<a>, <b>, ...). With strip=True each fragment is
            # stripped before joining, so the default empty separator would glue
            # neighbouring words together ("Helloworld"); join them with a space.
            paragraph_text = p.get_text(separator=' ', strip=True)
            if paragraph_text:
                paragraphs.append(paragraph_text)

        # Join paragraphs into a single full text string
        return ' '.join(paragraphs)

    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller skip this URL
        print(f"Error extracting text from {url}: {e}")
        return None
urls = [
    "http://gianttortoise.org/en/beyond-tracking",
    "https://www.dhs.gov/blue-campaign/what-human-trafficking",
    "https://www.dol.gov/agencies/odep/program-areas/individuals/older-workers",
    "https://michigantoday.umich.edu/2022/08/26/positively-breaking-the-age-code/"
]

# Extract full text from all URLs. Pages that fail to download (None) or that
# yield no paragraph text ('') are skipped, so the source URL of every kept text
# is recorded too -- otherwise row positions could no longer be mapped back to `urls`.
all_texts = []
fetched_urls = []
for url in urls:
    text = extract_full_text_from_url(url)
    if text:
        all_texts.append(text)
        fetched_urls.append(url)

# Create DataFrame: one row per successfully fetched page ('text' stays the first column)
corpus2_df = pd.DataFrame({'text': all_texts, 'url': fetched_urls})
corpus2 = corpus2_df['text']

In [32]:
# Display the scraped texts held in corpus2.
corpus2

0    Having discovered some of the mechanisms gover...
1    An official website of the United States gover...
2    An official website of the United States gover...
3    Office of the VP for Communications – Keeping ...
Name: text, dtype: object

We can take this extracted text (one entry per page) and pass it into a function that labels each entry with one of the SDG labels supplied by the UN. Each classifier algorithm may come up with different labels despite being trained on the same data.

In [33]:
def sdg_classifier2(train_corpus, corpus2, train_label, classifier_algorithm='multinomialnb', vectorizer_type='count', ngram_range=(1,1), min_df=3):
    """Train a classifier on a labelled corpus and predict labels for new texts.

    Parameters
    ----------
    train_corpus : sequence of str
        Labelled training documents.
    corpus2 : sequence of str
        Unlabelled documents to classify.
    train_label : sequence
        One label per document in ``train_corpus``.
    classifier_algorithm : str or estimator
        ``'multinomialnb'``, ``'mlp'`` or ``'ridge'`` (case-insensitive), or an
        already constructed estimator exposing ``fit``/``predict``. An estimator
        passed in is fitted in place.
    vectorizer_type : str
        ``'count'`` selects ``CountVectorizer``; any other value selects
        ``TfidfVectorizer``.
    ngram_range : tuple
        Forwarded to the vectorizer.
    min_df : int or float
        Forwarded to the vectorizer.

    Returns
    -------
    numpy.ndarray
        Predicted labels for ``corpus2``, expressed in the original label values
        of ``train_label``.

    Raises
    ------
    ValueError
        If ``classifier_algorithm`` is a string that names no known classifier.
    """
    # Resolve the classifier before any expensive work. Previously an unknown
    # string was used as the estimator itself and only failed later with an
    # AttributeError on `.fit`.
    if isinstance(classifier_algorithm, str):
        algorithm_name = classifier_algorithm.lower()
        if algorithm_name == 'multinomialnb':
            clf = MultinomialNB()
        elif algorithm_name == 'mlp':
            clf = MLPClassifier(max_iter=5, random_state=8)
        elif algorithm_name == 'ridge':
            clf = RidgeClassifier(alpha=1, solver='auto', max_iter=5)
        else:
            raise ValueError(
                f"Unknown classifier_algorithm {classifier_algorithm!r}; "
                "expected 'multinomialnb', 'mlp', 'ridge', or an estimator instance."
            )
    else:
        clf = classifier_algorithm

    # Encode labels as 0..n_classes-1 for training; decoded again before returning.
    le = LabelEncoder()
    y_train = le.fit_transform(train_label)

    vectorizer_class = CountVectorizer if vectorizer_type == 'count' else TfidfVectorizer
    vectorizer = vectorizer_class(ngram_range=ngram_range, stop_words='english', min_df=min_df)

    # The vocabulary comes from the training corpus only; terms that occur only
    # in corpus2 are ignored by transform().
    X_train_vector = vectorizer.fit_transform(train_corpus)
    X_corpus2_vector = vectorizer.transform(corpus2)

    clf.fit(X_train_vector, y_train)
    y_pred = clf.predict(X_corpus2_vector)

    predicted_labels = le.inverse_transform(y_pred)

    return predicted_labels

In [34]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

#for each SDG, what are the most differentiating features according to the classifier?

def get_top_features_per_sdg(clf, corpus, sdg_labels, vectorizer_type='count', n_top_features=10):
    """List, for every class of a fitted naive Bayes model, its highest-probability terms.

    The vocabulary is rebuilt here by fitting a unigram vectorizer
    (``stop_words='english'``, ``min_df=2``) on ``corpus``; ``clf`` must have been
    trained on features produced with the same settings and corpus, otherwise
    the feature names would not line up with the model's columns.

    Terms are ranked by ``exp(clf.feature_log_prob_)``, i.e. the estimated
    probability of the term given the class. This favours terms that are frequent
    within a class, so a term can rank highly for several classes at once.

    Parameters
    ----------
    clf : fitted estimator
        Must expose ``feature_log_prob_`` (e.g. ``MultinomialNB``).
    corpus : sequence of str
        The documents the classifier was trained on.
    sdg_labels : sequence
        Labels of ``corpus``. Only used to name the rows when ``clf`` has no
        ``classes_`` attribute.
    vectorizer_type : str
        ``'count'`` selects ``CountVectorizer``; any other value selects
        ``TfidfVectorizer``.
    n_top_features : int
        Number of terms to return per class.

    Returns
    -------
    dict
        Maps each class label to a list of ``(term, probability)`` tuples,
        highest probability first.

    Raises
    ------
    ValueError
        If the rebuilt vocabulary size differs from the number of feature columns
        the classifier was trained on.
    """
    if vectorizer_type == 'count':
        vectorizer = CountVectorizer(stop_words='english', min_df=2)
    else:
        vectorizer = TfidfVectorizer(stop_words='english', min_df=2)
    # Only the vocabulary is needed, so the document-term matrix is not built
    vectorizer.fit(corpus)
    feature_names = vectorizer.get_feature_names_out()

    # For MultinomialNB, one row of per-feature log probabilities per class
    feature_log_prob = clf.feature_log_prob_

    # A size mismatch means clf was trained with a different vocabulary; the
    # names would be misaligned (or indexing would fail), so stop early.
    if feature_log_prob.shape[1] != len(feature_names):
        raise ValueError(
            f"Vocabulary rebuilt from corpus has {len(feature_names)} features but the "
            f"classifier was trained on {feature_log_prob.shape[1]}; fit the classifier "
            "with the same vectorizer settings and corpus."
        )

    # Rows of feature_log_prob_ follow the order of clf.classes_, so prefer that
    # over re-deriving the label order from sdg_labels.
    class_labels = getattr(clf, 'classes_', None)
    if class_labels is None:
        class_labels = LabelEncoder().fit(sdg_labels).classes_

    top_features = {}
    # zip stops at the shorter of the two, so labels never outrun the rows
    for sdg, class_log_prob in zip(class_labels, feature_log_prob):
        # Convert log probabilities to probabilities using exponential
        feature_probs = np.exp(class_log_prob)

        # Descending order first, then slice: unlike [-n:], this yields an empty
        # list (not every feature) when n_top_features is 0.
        top_feature_indices = feature_probs.argsort()[::-1][:n_top_features]

        # Get the actual feature names and their probabilities
        top_features[sdg] = [
            (feature_names[idx], feature_probs[idx])
            for idx in top_feature_indices
        ]

    return top_features

# Train a unigram count-vector MultinomialNB on the whole corpus and collect
# the ten highest-probability terms of every SDG.
vectorizer = CountVectorizer(stop_words='english', min_df=2)
X_vector = vectorizer.fit_transform(corpus)
clf = MultinomialNB().fit(X_vector, sdg_num)
top_features = get_top_features_per_sdg(
    clf,
    corpus,
    sdg_num,
    vectorizer_type='count',
    n_top_features=10,
)

# Print one header line per SDG followed by its terms and probabilities
for sdg in top_features:
    features = top_features[sdg]
    print(f"SDG {sdg}:")
    for feature, prob in features:
        print(f"  {feature}: {prob:.4f}")

SDG 1:
  poverty: 0.0245
  income: 0.0100
  poor: 0.0075
  countries: 0.0075
  children: 0.0072
  social: 0.0065
  child: 0.0047
  households: 0.0046
  household: 0.0037
  deprivation: 0.0035
SDG 2:
  food: 0.0147
  agricultural: 0.0071
  production: 0.0054
  countries: 0.0049
  prices: 0.0038
  agriculture: 0.0034
  price: 0.0033
  farmers: 0.0030
  land: 0.0028
  security: 0.0027
SDG 3:
  health: 0.0245
  care: 0.0167
  services: 0.0061
  countries: 0.0057
  mental: 0.0040
  oecd: 0.0038
  primary: 0.0038
  quality: 0.0038
  patients: 0.0034
  population: 0.0033
SDG 4:
  education: 0.0188
  school: 0.0124
  students: 0.0117
  schools: 0.0088
  teachers: 0.0081
  learning: 0.0060
  oecd: 0.0058
  countries: 0.0054
  skills: 0.0047
  teacher: 0.0042
SDG 5:
  women: 0.0296
  gender: 0.0157
  men: 0.0069
  countries: 0.0061
  work: 0.0052
  equality: 0.0045
  social: 0.0042
  rights: 0.0039
  female: 0.0037
  labour: 0.0036
SDG 6:
  water: 0.0390
  management: 0.0051
  groundwater: 0.004

In [35]:
#are there any overlaps between the SDG vocabularies?

def analyze_vocabulary_overlap(top_features, detailed=True):
    """Measure how much the top-feature vocabularies of the SDGs overlap.

    Parameters
    ----------
    top_features : dict
        Maps each SDG to an iterable of ``(feature, score)`` pairs. Only the
        feature names are used; they must be hashable and mutually orderable
        (strings in this notebook).
    detailed : bool
        When true, also report which features each pair of SDGs shares.

    Returns
    -------
    dict
        ``'overlap_percentages'``: ``{sdg1: {sdg2: percentage}}`` for every
        ordered pair, including each SDG with itself. The percentage is
        ``100 * |A & B| / |A | B|`` rounded to two decimals, or 0 when both
        vocabularies are empty.
        ``'overlapping_features'``: ``{sdg1: {sdg2: [features]}}`` for every pair
        of distinct SDGs when ``detailed`` is true, otherwise ``{sdg1: {}}``.
        The shared features are sorted so the output is identical from run to run.
    """
    # Create sets of top features for each SDG; the scores are not needed here
    feature_sets = {
        sdg: {feature for feature, _ in features}
        for sdg, features in top_features.items()
    }

    overlap_matrix = {}
    detailed_overlap = {}

    # Compute pairwise intersections
    for sdg1, features1 in feature_sets.items():
        overlap_matrix[sdg1] = {}
        detailed_overlap[sdg1] = {}

        for sdg2, features2 in feature_sets.items():
            shared = features1 & features2
            total_unique = len(features1 | features2)
            overlap_percentage = (len(shared) / total_unique) * 100 if total_unique > 0 else 0
            overlap_matrix[sdg1][sdg2] = round(overlap_percentage, 2)

            # If detailed mode is on, store the actual overlapping features.
            # Set iteration order depends on string hashing and changes between
            # interpreter runs, so sort to keep the report reproducible.
            if detailed and sdg1 != sdg2:
                detailed_overlap[sdg1][sdg2] = sorted(shared)

    # Return percentage overlap matrix and features
    return {
        'overlap_percentages': overlap_matrix,
        'overlapping_features': detailed_overlap
    }
overlap = analyze_vocabulary_overlap(top_features)

# Pairwise overlap percentages; an SDG's comparison with itself is left out
print("Overlap Percentages:")
for sdg1, overlaps in overlap['overlap_percentages'].items():
    print(f"SDG {sdg1}:")
    for sdg2, percentage in overlaps.items():
        if sdg1 == sdg2:
            continue
        print(f"  With SDG {sdg2}: {percentage}%")

# Shared features, listed only for pairs that actually have some in common
print("\nDetailed Overlapping Features:")
for sdg1, sdg_overlaps in overlap['overlapping_features'].items():
    for sdg2, features in sdg_overlaps.items():
        if not features:
            continue
        print(f"SDG {sdg1} and SDG {sdg2} share: {features}")

Overlap Percentages:
SDG 1:
  With SDG 2: 5.26%
  With SDG 3: 5.26%
  With SDG 4: 5.26%
  With SDG 5: 11.11%
  With SDG 6: 5.26%
  With SDG 7: 5.26%
  With SDG 8: 5.26%
  With SDG 9: 5.26%
  With SDG 10: 17.65%
  With SDG 11: 0.0%
  With SDG 12: 5.26%
  With SDG 13: 5.26%
  With SDG 14: 0.0%
  With SDG 15: 0.0%
  With SDG 16: 5.26%
SDG 2:
  With SDG 1: 5.26%
  With SDG 3: 5.26%
  With SDG 4: 5.26%
  With SDG 5: 5.26%
  With SDG 6: 5.26%
  With SDG 7: 5.26%
  With SDG 8: 5.26%
  With SDG 9: 5.26%
  With SDG 10: 5.26%
  With SDG 11: 5.26%
  With SDG 12: 11.11%
  With SDG 13: 5.26%
  With SDG 14: 5.26%
  With SDG 15: 5.26%
  With SDG 16: 0.0%
SDG 3:
  With SDG 1: 5.26%
  With SDG 2: 5.26%
  With SDG 4: 11.11%
  With SDG 5: 5.26%
  With SDG 6: 11.11%
  With SDG 7: 5.26%
  With SDG 8: 5.26%
  With SDG 9: 11.11%
  With SDG 10: 11.11%
  With SDG 11: 0.0%
  With SDG 12: 5.26%
  With SDG 13: 5.26%
  With SDG 14: 0.0%
  With SDG 15: 0.0%
  With SDG 16: 0.0%
SDG 4:
  With SDG 1: 5.26%
  With SDG 

In [36]:

# Every vectorizer / n-gram combination, all with min_df = 3
# (order: count then tfidf; within each, unigram, bigram, mixed)
configurations = [
    {'vectorizer': vectorizer_name, 'ngram_range': grams, 'min_df': 3}
    for vectorizer_name in ('count', 'tfidf')
    for grams in ((1, 1), (2, 2), (1, 2))
]

results = []

for config in configurations:
    vectorizer_type = config['vectorizer']
    ngram_range = config['ngram_range']
    min_df = config['min_df']

    # Arguments shared by the three classifier runs of this configuration
    shared_args = dict(
        train_corpus=corpus,
        corpus2=corpus2,
        train_label=sdg_num,
        vectorizer_type=vectorizer_type,
        ngram_range=ngram_range,
        min_df=min_df,
    )

    # Predicted SDG label of every scraped text, per classifier
    run1 = sdg_classifier2(classifier_algorithm='multinomialnb', **shared_args)
    run2 = sdg_classifier2(classifier_algorithm='mlp', **shared_args)
    run3 = sdg_classifier2(classifier_algorithm='ridge', **shared_args)

    results.append({
        'vectorizer': vectorizer_type,
        'ngram_range': str(ngram_range),
        'min_df': min_df,
        'MultinomialNB_predicted_labels': run1,
        'MLP_predicted_labels': run2,
        'Ridge_predicted_labels': run3,
    })

# One row per configuration, one predicted-label column per classifier
results_df1 = pd.DataFrame(results)




In [37]:
# Print the table of predicted labels collected in results_df1.
print(results_df1)

  vectorizer ngram_range  min_df MultinomialNB_predicted_labels  \
0      count      (1, 1)       3                 [15, 16, 4, 5]   
1      count      (2, 2)       3                 [15, 16, 4, 3]   
2      count      (1, 2)       3                 [15, 16, 4, 3]   
3      tfidf      (1, 1)       3                 [15, 16, 4, 5]   
4      tfidf      (2, 2)       3                 [15, 16, 4, 3]   
5      tfidf      (1, 2)       3                 [15, 16, 4, 5]   

  MLP_predicted_labels Ridge_predicted_labels  
0        [15, 5, 4, 3]         [15, 16, 4, 3]  
1       [15, 16, 4, 3]         [15, 16, 4, 3]  
2        [15, 5, 4, 3]          [15, 5, 4, 3]  
3        [15, 5, 4, 3]         [15, 16, 4, 3]  
4       [15, 16, 4, 3]         [15, 16, 4, 3]  
5       [15, 16, 4, 3]         [15, 16, 4, 3]  


 Assignment: Take the main text content from these pages, and feed them into your classifier and see how your model classifies them. Are the classifications reasonable? find a case where your classification is not reasonable and explain what the model does that leads to the not ideal classification.
 ------------------------------------------------------------------------------------------------------------------------------------------------

Are the classifications reasonable? 
The classifications above seem generally reasonable because there is only slight variation in the labels predicted by the different classifiers.

Find a case where your classification is not reasonable and explain what the model does that leads to the not ideal classification:
The Ridge classifier seems to have produced a subpar classification with the ngram range (1, 2) count vectorizer. It classified the second text as SDG label 5 when most other classifiers selected SDG 16.