In [2]:
import csv
import requests
from bs4 import BeautifulSoup as bs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from sklearn.discriminant_analysis import StandardScaler
from requests.adapters import HTTPAdapter, Retry
from sklearn.decomposition import PCA


def find_divtag(url, input_text):
    field_label = []
    html_tag = []
    tag_attrs = []
    text_content = []
    try: 
        # connect to URL
        # define default user agent to prevent status code 403
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"}
        #response = requests.get(url, headers = headers)
        s = requests.Session()
        retries = Retry(total=5,
                        backoff_factor=0.1,
                        status_forcelist=[ 500, 502, 503, 504 ])
        s.mount('http://', HTTPAdapter(max_retries=retries))
        response = s.get(url, headers=headers, timeout=3)

        # http request success
        if response.status_code == 200:
            soup = bs(response.content, "html.parser")
            # get divtag for every field
            for text in input_text:
                # return smallest html element that contains text (div/span/etc)
                result = [tag for tag in soup.find_all() if text in str(tag)]
                if result:
                    divtag = str(min(result, key=lambda element: len(str(element.get_text(strip=True))))) 
                    if divtag != "":           
                        field_label.append(input_text.index(text)) # 0 = name; 1 = address; 2 = zip; 3 = phone num
                        html_tag.append(divtag)
                        text_content.append(text)

                        tag_list = bs(divtag, 'html.parser').find().attrs
                        if tag_list is None:
                            tag_attrs.append([])
                        else:
                            tag_attrs.append(tag_list)
            return field_label, html_tag, tag_attrs, text_content
        # http request failed
        else:
            print(f"Status code: {response.status_code}. Failed to retrieve content from {url}.")
            return None
    # unknown exception (usually about network connectivity)
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None
    
def clean_csv():
    # read in largest scraping dataset
    df = pd.read_csv('resources_large.csv', sep=";")
    columns_to_keep = ['resource_id', 'name', 'address_1', "zip", "Phone Number", "Website where info found"]  # Replace with your actual column names

    # create a new df with only the essential fields
    df_filtered = df[columns_to_keep]
    df_cleaned = df_filtered.rename(columns={"Website where info found": "url", "address_1": "address"}).dropna(subset=["url"])
    print("Sample dataset size: " + str(df_cleaned.shape))

    # option to export small subset temporarily only for testing
    #df_small = df_cleaned.head(1000)
    df_cleaned.to_csv('resources_cleaned.csv', sep=";", index=False)

def train_svm_model(x, y):
    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 101) 
    model = svm.SVC() 
    model.fit(X_train, y_train) 
    
    # performance before tuning
    predictions = model.predict(X_test) 
    print(classification_report(y_test, predictions)) 

    # hyperperameter tuning
    param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']}  
    grid = GridSearchCV(svm.SVC(), param_grid, cv = 5, refit = True, verbose = 3) 
    grid.fit(X_train, y_train) 
    print(grid.best_params_) 
    print(grid.best_estimator_)     
    
    # final performance
    grid_predictions = grid.predict(X_test) 
    print(classification_report(y_test, grid_predictions)) 
   

In [5]:
# get rid of rows with empty urls, rename fields, etc
clean_csv()

# extract all raw features
data = {}
file_input = input("Enter a CSV containing clinic data: ")
with open(file_input, mode='r') as file:
    reader = csv.reader(file, delimiter=";")
    # print field types
    header = next(reader, None)
    if header:
        print(f"Fields: {header}") 
    # exclude last index since it's URL
    field_len = len(header) - 1 

    # extract labeled input for SVM
    field_label = []    # supervised data
    html_tag = []   # feature for raw html tags
    tag_attrs = []  # feature for tag attributes
    for row in reader:
        print("Extracting features for clinic id " + row[0])
        res = find_divtag(row[header.index('url')], row[1:-1])
        if res is not None:
            field_label_temp, html_tag_temp, tag_attrs_temp = res        
            field_label += field_label_temp
            html_tag += html_tag_temp
            tag_attrs += tag_attrs_temp

Sample dataset size: (4475, 6)
Fields: ['resource_id', 'name', 'address', 'zip', 'Phone Number', 'url']
Extracting features for clinic id 3
Extracting features for clinic id 13
Extracting features for clinic id 14
Extracting features for clinic id 16
Extracting features for clinic id 17
Extracting features for clinic id 18
Extracting features for clinic id 20
Extracting features for clinic id 21
Extracting features for clinic id 23
Extracting features for clinic id 24
Extracting features for clinic id 25
Extracting features for clinic id 26
Extracting features for clinic id 27
Extracting features for clinic id 28
Extracting features for clinic id 37
Extracting features for clinic id 40
Extracting features for clinic id 48
Extracting features for clinic id 49
Extracting features for clinic id 50
Extracting features for clinic id 51
Status code: 404. Failed to retrieve content from https://snowlinehospice.org/our-services/hospice-services.
Extracting features for clinic id 52
Extracting 

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 391
Extracting features for clinic id 398
Extracting features for clinic id 400
Error: HTTPSConnectionPool(host='unininc.org', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x2840c7cd0>: Failed to resolve 'unininc.org' ([Errno 8] nodename nor servname provided, or not known)"))
Extracting features for clinic id 401
Extracting features for clinic id 403
Extracting features for clinic id 404
Extracting features for clinic id 414
Extracting features for clinic id 415
Error: HTTPSConnectionPool(host='wellpathcarecenters.com', port=443): Max retries exceeded with url: /our-services/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x283580450>, 'Connection to wellpathcarecenters.com timed out. (connect timeout=3)'))
Extracting features for clinic id 417
Extracting features for clinic id 419
Extracting features for clinic id 420
Extracting features for cli

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 547
Extracting features for clinic id 552
Extracting features for clinic id 553
Error: HTTPSConnectionPool(host='www.latterdaysaintjoblue%20shield.org', port=443): Max retries exceeded with url: /ers/ct/?lang=eng (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x2841df8d0>: Failed to resolve 'www.latterdaysaintjoblue%20shield.org' ([Errno 8] nodename nor servname provided, or not known)"))
Extracting features for clinic id 557
Extracting features for clinic id 559
Extracting features for clinic id 560
Extracting features for clinic id 563
Error: HTTPSConnectionPool(host='horizonpersonnelservices.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x2823b0c50>: Failed to resolve 'horizonpersonnelservices.com' ([Errno 8] nodename nor servname provided, or not known)"))
Extracting features for clinic id 565
Extracting features for clinic id 566
Extrac

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 617
Extracting features for clinic id 618
Extracting features for clinic id 619
Extracting features for clinic id 627
Extracting features for clinic id 629
Extracting features for clinic id 630
Extracting features for clinic id 631
Extracting features for clinic id 633
Extracting features for clinic id 634
Extracting features for clinic id 635
Extracting features for clinic id 636
Extracting features for clinic id 639
Extracting features for clinic id 640
Extracting features for clinic id 641
Error: HTTPSConnectionPool(host='oconnor.verity.org', port=443): Max retries exceeded with url: /OCH/woundCare.php (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x282d81e50>: Failed to resolve 'oconnor.verity.org' ([Errno 8] nodename nor servname provided, or not known)"))
Extracting features for clinic id 642
Error: HTTPSConnectionPool(host='www.partnershiphp.org', port=443): Max retries exceeded with url: /Community/Documents/Sola

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 662
Extracting features for clinic id 667
Extracting features for clinic id 668
Extracting features for clinic id 669
Status code: 404. Failed to retrieve content from https://www.select.com/locations/fairfield-ca/80134-7228.
Extracting features for clinic id 670
Extracting features for clinic id 673
Extracting features for clinic id 674
Status code: 404. Failed to retrieve content from http://dpss.co.riverside.ca.us/office-location/moreno-04.
Extracting features for clinic id 675
Status code: 404. Failed to retrieve content from https://wp.sbcounty.gov/dbh/mental-health-services/general/outpatient-clinics/.
Extracting features for clinic id 676
Extracting features for clinic id 677
Extracting features for clinic id 682
Extracting features for clinic id 683
Extracting features for clinic id 686
Extracting features for clinic id 690
Extracting features for clinic id 693
Extracting features for clinic id 694
Extracting features for clinic id 695
Extracti

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 1193
Extracting features for clinic id 1196
Extracting features for clinic id 1198
Extracting features for clinic id 1200
Extracting features for clinic id 1201
Extracting features for clinic id 1205
Extracting features for clinic id 1206
Extracting features for clinic id 1207
Extracting features for clinic id 1208
Extracting features for clinic id 1210
Extracting features for clinic id 1212
Error: HTTPSConnectionPool(host='www.stjoghcs.org', port=443): Max retries exceeded with url: /samaritan-helping-hand.html (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'www.stjoghcs.org'. (_ssl.c:1006)")))
Extracting features for clinic id 1213
Extracting features for clinic id 1223
Extracting features for clinic id 1225
Extracting features for clinic id 1226
Extracting features for clinic id 1227
Extracting features for clinic id 1231
Extracting features

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 2074
Extracting features for clinic id 2076
Extracting features for clinic id 2089
Extracting features for clinic id 2092
Error: HTTPConnectionPool(host='www.coremedi-calclinic.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x283051190>: Failed to resolve 'www.coremedi-calclinic.com' ([Errno 8] nodename nor servname provided, or not known)"))
Extracting features for clinic id 2093
Error: HTTPConnectionPool(host='www.coremedi-calclinic.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x2840db0d0>: Failed to resolve 'www.coremedi-calclinic.com' ([Errno 8] nodename nor servname provided, or not known)"))
Extracting features for clinic id 2104
Extracting features for clinic id 2105
Status code: 403. Failed to retrieve content from https://bhsd.sccgov.org/information-resources/medication-assisted-treatme

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 2229
Extracting features for clinic id 2233
Extracting features for clinic id 2234
Extracting features for clinic id 2240
Error: HTTPSConnectionPool(host='www.cleanneedles.org', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x2853bc510>, 'Connection to www.cleanneedles.org timed out. (connect timeout=3)'))
Extracting features for clinic id 2248
Status code: 404. Failed to retrieve content from https://www.cnmgonline.com/en/our-team.
Extracting features for clinic id 2258
Extracting features for clinic id 2261
Extracting features for clinic id 2262
Extracting features for clinic id 2263
Error: Invalid URL 'www.mfirecovery.com': No scheme supplied. Perhaps you meant https://www.mfirecovery.com?
Extracting features for clinic id 2267
Extracting features for clinic id 2268
Extracting features for clinic id 2269
Extracting features for clinic id 2270
Extracting features for clinic id

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 2403
Extracting features for clinic id 2407
Extracting features for clinic id 2408
Extracting features for clinic id 2409
Extracting features for clinic id 2414
Extracting features for clinic id 2416
Extracting features for clinic id 2422
Extracting features for clinic id 2423
Extracting features for clinic id 2425
Extracting features for clinic id 2430
Extracting features for clinic id 2432
Extracting features for clinic id 2434
Error: HTTPSConnectionPool(host='health.ucdavis.edu', port=443): Max retries exceeded with url: /surgery/specialties/cardio/appointments/ (Caused by SSLError(SSLError(1, '[SSL: UNSAFE_LEGACY_RENEGOTIATION_DISABLED] unsafe legacy renegotiation disabled (_ssl.c:1006)')))
Extracting features for clinic id 2436
Error: HTTPConnectionPool(host='svsjoblue%20shield.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x28306de90>: Failed to resolve 'svsjoblue%20s

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 2566
Extracting features for clinic id 2568
Extracting features for clinic id 2570
Extracting features for clinic id 2571
Extracting features for clinic id 2572
Extracting features for clinic id 2575
Extracting features for clinic id 2593
Extracting features for clinic id 2594
Error: HTTPSConnectionPool(host='www.ruhealth.org', port=443): Max retries exceeded with url: /community-health-centers (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)')))
Extracting features for clinic id 2596
Error: HTTPSConnectionPool(host='www.ruhealth.org', port=443): Max retries exceeded with url: /community-health-centers (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)')))
Extracting features for clinic id 2598
Extracting features for clinic id 2602

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 2651
Extracting features for clinic id 2653
Extracting features for clinic id 2655
Status code: 404. Failed to retrieve content from http://www.sbclib.org/LibraryLocations/RialtoBranchLibrary.aspx.
Extracting features for clinic id 2656
Error: HTTPSConnectionPool(host='www.royoeyecenter.com', port=443): Max retries exceeded with url: /contact-us/roseville/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x282ee4f90>, 'Connection to www.royoeyecenter.com timed out. (connect timeout=3)'))
Extracting features for clinic id 2658
Extracting features for clinic id 2659
Error: HTTPSConnectionPool(host='health.ucdavis.edu', port=443): Max retries exceeded with url: /medicalcenter/cliniclocations/specialtycare/pediatric_specialties.html (Caused by SSLError(SSLError(1, '[SSL: UNSAFE_LEGACY_RENEGOTIATION_DISABLED] unsafe legacy renegotiation disabled (_ssl.c:1006)')))
Extracting features for clinic id 2660
Error: HTTPSConnectionPool(h

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 2696
Extracting features for clinic id 2698
Extracting features for clinic id 2700
Extracting features for clinic id 2702
Extracting features for clinic id 2704
Extracting features for clinic id 2706
Extracting features for clinic id 2707
Error: HTTPSConnectionPool(host='www.prototypes.org', port=443): Max retries exceeded with url: /programs/substance-use-disorders/ (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))
Extracting features for clinic id 2708
Error: HTTPSConnectionPool(host='www.prototypes.org', port=443): Max retries exceeded with url: /programs/substance-use-disorders/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x284073250>, 'Connection to www.prototypes.org timed out. (connect timeout=3)'))
Extracting features for clinic id 2711
Status code: 404. Failed to retrieve content from https://hipaa.jotform.com/211064639722151.
Extract

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 3858
Extracting features for clinic id 3859
Extracting features for clinic id 3860
Extracting features for clinic id 3861
Extracting features for clinic id 3862
Extracting features for clinic id 3863
Extracting features for clinic id 3864
Extracting features for clinic id 3865
Extracting features for clinic id 3868
Extracting features for clinic id 3869
Extracting features for clinic id 3872
Extracting features for clinic id 3873
Status code: 404. Failed to retrieve content from https://www.sebastian94904.com/parish-life/ministries.
Extracting features for clinic id 3878
Extracting features for clinic id 3879
Extracting features for clinic id 3881
Extracting features for clinic id 3887
Extracting features for clinic id 3898
Extracting features for clinic id 3904
Extracting features for clinic id 3907
Extracting features for clinic id 3908
Status code: 404. Failed to retrieve content from https://www.cityoforange.org/1348/El-Modena-Branch.
Extracting fe

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 4140
Error: HTTPSConnectionPool(host='www.murph-emmanuel.org', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x2854937d0>: Failed to establish a new connection: [Errno 61] Connection refused'))
Extracting features for clinic id 4143
Extracting features for clinic id 4144
Extracting features for clinic id 4147
Error: HTTPSConnectionPool(host='www.sober-initiative.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate (_ssl.c:1006)')))
Extracting features for clinic id 4149
Extracting features for clinic id 4151
Extracting features for clinic id 4152
Extracting features for clinic id 4153
Extracting features for clinic id 4156
Extracting features for clinic id 4157
Extracting features for clinic id 4158
Extracting features for clinic id 4165
Extracting fe

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 4304
Extracting features for clinic id 4305
Extracting features for clinic id 4306
Extracting features for clinic id 4309
Extracting features for clinic id 4310
Extracting features for clinic id 4311
Error: HTTPConnectionPool(host='www.stmarysmedi-calcenter.org', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x284217850>: Failed to resolve 'www.stmarysmedi-calcenter.org' ([Errno 8] nodename nor servname provided, or not known)"))
Extracting features for clinic id 4312
Extracting features for clinic id 4316
Extracting features for clinic id 4319
Extracting features for clinic id 4322
Extracting features for clinic id 4326
Extracting features for clinic id 4327
Extracting features for clinic id 4328
Extracting features for clinic id 4330
Error: HTTPSConnectionPool(host='youthmovingon.org', port=443): Max retries exceeded with url: /services/housing (Caused by SSLError(SSLError(1, '

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 4562
Extracting features for clinic id 4564
Extracting features for clinic id 4565
Extracting features for clinic id 4566
Extracting features for clinic id 4568
Error: HTTPSConnectionPool(host='concernedfamily.org', port=443): Max retries exceeded with url: /contact/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x282489e10>: Failed to establish a new connection: [Errno 61] Connection refused'))
Extracting features for clinic id 4569
Extracting features for clinic id 4570
Extracting features for clinic id 4574
Extracting features for clinic id 4575
Extracting features for clinic id 4576
Error: HTTPSConnectionPool(host='www.dentalhaven.com', port=443): Read timed out. (read timeout=3)
Extracting features for clinic id 4577
Extracting features for clinic id 4578
Extracting features for clinic id 4581
Extracting features for clinic id 4582
Extracting features for clinic id 4583
Error: HTTPSConnectionPool(host='virtualcare.di

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Extracting features for clinic id 4752
Extracting features for clinic id 4753
Error: HTTPSConnectionPool(host='health.ucdavis.edu', port=443): Max retries exceeded with url: /medicalcenter/cliniclocations/primarycare/rocklinp.html (Caused by SSLError(SSLError(1, '[SSL: UNSAFE_LEGACY_RENEGOTIATION_DISABLED] unsafe legacy renegotiation disabled (_ssl.c:1006)')))
Extracting features for clinic id 4754
Extracting features for clinic id 4756
Extracting features for clinic id 4757
Extracting features for clinic id 4761
Error: HTTPSConnectionPool(host='www.halocares.org', port=443): Max retries exceeded with url: /facilities (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate (_ssl.c:1006)')))
Extracting features for clinic id 4764
Extracting features for clinic id 4768
Extracting features for clinic id 4770
Extracting features for clinic id 4771
Status code: 404. Failed to retrieve content from https://camarin.o

KeyboardInterrupt: 

In [20]:
# check the labels
print("Number of samples: " + str(len(field_label)))

Number of samples: 2751


In [60]:
# convert features to vectors that can be used in SVM

# numeric representation of the raw html tags
vectorizer = CountVectorizer()
html_features = vectorizer.fit_transform(html_tag).toarray()

# extract relevant attribute features that's indicative of field classification
df = pd.DataFrame(tag_attrs)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# sort by most avail entries and choose the first 5 cols
nan_counts = df.isna().sum()
count_sort_col = nan_counts.sort_values().index
#print(df_ordered["name"])

In [61]:
df_ordered = df[count_sort_col].astype(str)

In [62]:
mlb = MultiLabelBinarizer()

# create one hot encoding of attribute "class"
mask = df_ordered['class'].notnull()
arr = mlb.fit_transform(df_ordered.loc[mask, 'class'].dropna().str.strip('[]').str.split(','))
class_labels = (pd.DataFrame(arr, index=df_ordered.index[mask], columns=mlb.classes_)
               .reindex(df_ordered.index, fill_value=0))
class_labels = class_labels.drop('nan', axis=1)

#print(class_labels)


In [63]:
# create one hot encoding of attribute "name"

df_ordered["type"] = df_ordered["type"].fillna("nan")
df_ordered["type"] = df_ordered["type"].values.reshape(-1, 1).tolist()
name_labels = pd.DataFrame(mlb.fit_transform(df_ordered['type']),columns=mlb.classes_, index=df_ordered.index).drop("nan", axis = 1)

print(html_features.shape)
print(class_labels.shape)
print(name_labels.shape)

(2751, 378292)
(2751, 140)
(2751, 4)


In [3]:
# reduce dimension of html_features
pca = PCA(n_components=100)
html_features_reduced = pca.fit_transform(html_features)

# add all three feature matrices
feature_matrix = []
for i in range(len(html_features)):
    feature_matrix.append(html_features[i].tolist() + class_labels.iloc[i].tolist() + name_labels.iloc[i].tolist())
# html_features[i].tolist() + 
    
# Standardize the feature matrix (seems to make accuracy worse in this case, prob since the features aren't linearly correlated)
# feature_matrix = np.array(feature_matrix)
# feature_matrix_scaled = StandardScaler().fit_transform(feature_matrix)

NameError: name 'html_features' is not defined

In [68]:
# train/test support vector machine
train_svm_model(feature_matrix, field_label)

In [None]:
# potential improvement ideas:
# Term Frequency-Inverse Document Frequency (?)
# hyperparameter tuning --> https://www.geeksforgeeks.org/svm-hyperparameter-tuning-using-gridsearchcv-ml/

In [1]:
print("hello")

hello
