In [36]:
import csv
import requests
from bs4 import BeautifulSoup as bs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

def find_divtag(url, input_text):
    field_label = []
    html_tag = []
    tag_attrs = []
    try: 
        # connect to URL
        # define default user agent to prevent status code 403
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"}
        response = requests.get(url, headers = headers)

        # http request success
        if response.status_code == 200:
            soup = bs(response.content, "html.parser")
            # get divtag for every field
            for text in input_text:
                # return smallest html element that contains text (div/span/etc)
                result = [tag for tag in soup.find_all() if text in str(tag)]
                if result:
                    divtag = str(min(result, key=lambda element: len(str(element.get_text(strip=True))))) 
                    if divtag != "":           
                        field_label.append(input_text.index(text)) # 0 = name; 1 = address; 2 = zip; 3 = phone num
                        html_tag.append(divtag)
                        tag_attrs.append(bs(divtag, 'html.parser').find().attrs)
            return field_label, html_tag, tag_attrs
        # http request failed
        else:
            print(f"Status code: {response.status_code}. Failed to retrieve content from {url}.")
            return None
    # unknown exception (usually about network connectivity)
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None
    
def clean_csv():
    # read in largest scraping dataset
    df = pd.read_csv('resources_large.csv', sep=";")
    columns_to_keep = ['resource_id', 'name', 'address_1', "zip", "Phone Number", "Website where info found"]  # Replace with your actual column names

    # create a new df with only the essential fields
    df_filtered = df[columns_to_keep]
    df_cleaned = df_filtered.rename(columns={"Website where info found": "url", "address_1": "address"}).dropna(subset=["url"])
    print("Sample dataset size: " + str(df_cleaned.shape))

    # option to export small subset temporarily only for testing
    df_small = df_cleaned.head(300)
    df_small.to_csv('resources_cleaned.csv', sep=";", index=False)

def train_svm_model(x, y):
    # Manually split the data into 70% training and 30% testing
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

    # Create an SVM classifier
    svm_classifier = svm.SVC(kernel='linear', C=1)

    # Define the stratified k-fold cross-validator with 5 folds
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Perform cross-validation on the training set and get accuracy scores
    accuracy_scores = cross_val_score(svm_classifier, X_train, y_train, cv=kf)

    # Print the accuracy scores for each fold
    for fold, accuracy in enumerate(accuracy_scores, start=1):
        print(f'Fold {fold}: Accuracy = {accuracy:.2f}')
    average_accuracy = accuracy_scores.mean()
    print(f'Average Accuracy: {average_accuracy:.2f}')

    # train and evaulate model
    svm_classifier.fit(X_train, y_train)
    test_accuracy = svm_classifier.score(X_test, y_test)
    print(f'Test Accuracy: {test_accuracy:.2f}')

In [6]:
# get rid of rows with empty urls, rename fields, etc
clean_csv()

# extract all raw features
data = {}
file_input = input("Enter a CSV containing clinic data: ")
with open(file_input, mode='r') as file:
    reader = csv.reader(file, delimiter=";")
    # print field types
    header = next(reader, None)
    if header:
        print(f"Fields: {header}") 
    # exclude last index since it's URL
    field_len = len(header) - 1 

    # extract labeled input for SVM
    field_label = []    # supervised data
    html_tag = []   # feature for raw html tags
    tag_attrs = []  # feature for tag attributes
    for row in reader:
        print("Extracting features for clinic id " + row[0])
        res = find_divtag(row[header.index('url')], row[1:-1])
        if res is not None:
            field_label_temp, html_tag_temp, tag_attrs_temp = res        
            field_label += field_label_temp
            html_tag += html_tag_temp
            tag_attrs += tag_attrs_temp

Sample dataset size: (4475, 6)
Fields: ['resource_id', 'name', 'address', 'zip', 'Phone Number', 'url']
Extracting features for clinic id 3
Extracting features for clinic id 13
Extracting features for clinic id 14
Extracting features for clinic id 16
Extracting features for clinic id 17
Extracting features for clinic id 18
Extracting features for clinic id 20
Extracting features for clinic id 21
Extracting features for clinic id 23
Extracting features for clinic id 24
Extracting features for clinic id 25
Extracting features for clinic id 26
Extracting features for clinic id 27
Extracting features for clinic id 28
Extracting features for clinic id 37
Extracting features for clinic id 40
Extracting features for clinic id 48
Extracting features for clinic id 49
Extracting features for clinic id 50
Extracting features for clinic id 51
Status code: 404. Failed to retrieve content from https://snowlinehospice.org/our-services/hospice-services.
Extracting features for clinic id 52
Extracting 

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 391
Extracting features for clinic id 398
Extracting features for clinic id 400
Error: HTTPSConnectionPool(host='unininc.org', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x282df7f50>: Failed to resolve 'unininc.org' ([Errno 8] nodename nor servname provided, or not known)"))
Extracting features for clinic id 401
Status code: 403. Failed to retrieve content from http://frc.vesd.net/community_resources/emergency_food.
Extracting features for clinic id 403
Extracting features for clinic id 404
Status code: 403. Failed to retrieve content from https://baartprograms.com/baart-14th-street/.
Extracting features for clinic id 414
Status code: 403. Failed to retrieve content from https://www.gvhc.org/.
Extracting features for clinic id 415
Error: HTTPSConnectionPool(host='wellpathcarecenters.com', port=443): Max retries exceeded with url: /our-services/ (Caused by SSLError(SSLEOFErro

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 547
Extracting features for clinic id 552
Extracting features for clinic id 553
Error: HTTPSConnectionPool(host='www.latterdaysaintjoblue%20shield.org', port=443): Max retries exceeded with url: /ers/ct/?lang=eng (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x28269d350>: Failed to resolve 'www.latterdaysaintjoblue%20shield.org' ([Errno 8] nodename nor servname provided, or not known)"))
Extracting features for clinic id 557
Extracting features for clinic id 559
Extracting features for clinic id 560
Extracting features for clinic id 563
Error: HTTPSConnectionPool(host='horizonpersonnelservices.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x2831d5b90>: Failed to resolve 'horizonpersonnelservices.com' ([Errno 8] nodename nor servname provided, or not known)"))
Extracting features for clinic id 565
Extracting features for clinic id 566
Extrac

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 617
Status code: 403. Failed to retrieve content from https://dhs.lacounty.gov.
Extracting features for clinic id 618
Status code: 403. Failed to retrieve content from https://pinnacletreatment.com/.
Extracting features for clinic id 619
Extracting features for clinic id 627
Extracting features for clinic id 629
Extracting features for clinic id 630
Status code: 406. Failed to retrieve content from http://sfcesworks.org/.
Extracting features for clinic id 631
Status code: 403. Failed to retrieve content from http://frc.vesd.net/community_resources/emergency_food.
Extracting features for clinic id 633
Status code: 403. Failed to retrieve content from https://www.whittierfirstday.org/.
Extracting features for clinic id 634
Extracting features for clinic id 635
Extracting features for clinic id 636
Extracting features for clinic id 639
Extracting features for clinic id 640
Extracting features for clinic id 641
Error: HTTPSConnectionPool(host='oconnor.veri

In [12]:
# check the labels
print(field_label)
print(len(field_label))

[2, 3, 2, 1, 2, 1, 2, 3, 0, 3, 1, 2, 3, 1, 2, 1, 2, 3, 0, 1, 2, 2, 0, 2, 2, 0, 2, 1, 2, 3, 3, 0, 1, 2, 0, 0, 1, 2, 3, 3, 0, 1, 2, 1, 2, 3, 2, 0, 1, 2, 3, 0, 2, 3, 2, 3, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 2, 3, 0, 1, 2, 0, 1, 2, 0, 1, 2, 2, 3, 0, 3, 0, 2, 0, 1, 2, 3, 1, 2, 3, 2, 3, 0, 3, 1, 2, 3, 1, 2, 3, 0, 2, 3, 0, 2, 3, 1, 2, 0, 1, 2, 2, 1, 2, 1, 2, 3, 0, 1, 2, 1, 2, 3, 0, 0, 2, 1, 2, 0, 1, 2, 3, 0, 1, 2, 0, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 0, 0, 1, 2, 3, 0, 1, 2, 3, 0, 2, 3, 0, 1, 2, 0, 3, 3, 0, 1, 2, 3, 0, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0, 2, 3, 0, 2, 3, 1, 2, 1, 2, 3, 3, 2, 0, 2, 3, 0, 1, 2, 0, 0, 2, 0, 1, 2, 1, 2, 3, 0, 1, 2, 3, 0, 3, 2, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 0, 0, 1, 2, 2, 3, 0, 2, 3, 2, 1, 2, 3, 1, 2, 3, 0, 1, 2, 3, 2, 1, 2, 3, 0, 2, 3, 0, 2, 0, 1, 2, 3, 0, 1, 2, 3, 0, 2, 1, 2, 0, 1, 2, 3, 0, 1, 2, 0, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 1, 2, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 0, 0, 0, 0, 2, 3, 2, 3, 0, 1, 2, 0, 0, 1, 2, 3, 1, 2, 1, 2, 3, 1, 2]
332

In [63]:
# convert features to vectors that can be used in SVM

# numeric representation of the raw html tags
vectorizer = CountVectorizer()
html_features = vectorizer.fit_transform(html_tag).toarray()

# extract relevant attribute features that's indicative of field classification
df = pd.DataFrame(tag_attrs)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# sort by most avail entries and choose the first 5 cols
nan_counts = df.isna().sum()
count_sort_col = nan_counts.sort_values().index
df_ordered = df[count_sort_col]
df_ordered_top5 = df_ordered.iloc[:, :5]
print(df_ordered_top5["class"])

0                                                    NaN
1      [elementor-element, elementor-element-38c334d,...
2                               [absolute, inset-0, z-0]
3                                [card-block__card-text]
4                                [card-block__card-text]
5                                                    NaN
6                                                    NaN
7                                [location__address_tel]
8                                                    NaN
9                                                    NaN
10                                                   NaN
11                                                   NaN
12                                                   NaN
13                                                   NaN
14                                                   NaN
15                                                   NaN
16                                                   NaN
17                             

In [88]:
mlb = MultiLabelBinarizer()
#create boolean mask matched non NaNs values
mask = df_ordered_top5['class'].notnull()

#filter by boolean indexing
arr = mlb.fit_transform(df_ordered_top5.loc[mask, 'class'].dropna().str.strip('[]').str.split(','))
print(arr)

#create DataFrame and add missing (NaN)s index values
class_labels = (pd.DataFrame(arr, index=df_ordered_top5.index[mask], columns=mlb.classes_)
               .reindex(df_ordered_top5.index, fill_value=0))
print(class_labels)


AttributeError: Can only use .str accessor with string values!

In [None]:
# todo: convert all entries to numbers


# add both feature matrices
feature_matrix = []
for i in range(len(html_features)):
    feature_matrix.append(html_features[i] + tag_features[i])

In [None]:
# train/test support vector machine
train_svm_model(feature_matrix, field_label)