In [2]:
import csv
import requests
from bs4 import BeautifulSoup as bs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from sklearn.discriminant_analysis import StandardScaler
from requests.adapters import HTTPAdapter, Retry
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer


def find_divtag(url, input_text):
    """Find, for each known field value, the smallest HTML element of a page that contains it.

    Args:
        url: Address of the page to download.
        input_text: Sequence of field values to look for. The position of a
            value in this sequence is used as its label
            (0 = name; 1 = address; 2 = zip; 3 = phone num).

    Returns:
        A tuple ``(field_label, html_tag, tag_attrs, text_content)`` of
        parallel lists holding one entry per value that was found on the page:
        the label, the matching element as a string, that element's attribute
        dict, and the searched text. Returns ``None`` when the request raises
        a ``requests`` exception or the status code is not 200.
    """
    field_label = []
    html_tag = []
    tag_attrs = []
    text_content = []

    # define default user agent to prevent status code 403
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"}
    retries = Retry(total=5,
                    backoff_factor=0.1,
                    status_forcelist=[500, 502, 503, 504])
    try:
        # the context manager releases the pooled connections when done
        with requests.Session() as s:
            adapter = HTTPAdapter(max_retries=retries)
            # the retry policy has to be mounted for both schemes; mounting it
            # on 'http://' only leaves https URLs without retries
            s.mount('http://', adapter)
            s.mount('https://', adapter)
            response = s.get(url, headers=headers, timeout=3)
    # request-level failure (usually about network connectivity)
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None

    # http request failed
    if response.status_code != 200:
        print(f"Status code: {response.status_code}. Failed to retrieve content from {url}.")
        return None

    soup = bs(response.content, "html.parser")
    # serialise every tag once instead of once per searched value
    candidates = [(tag, str(tag)) for tag in soup.find_all()]

    # enumerate gives each value its own position; list.index() would return
    # the first occurrence and mislabel values that appear more than once
    for label, text in enumerate(input_text):
        # an empty value is a substring of every tag and would match arbitrary markup
        if not text:
            continue
        matches = [pair for pair in candidates if text in pair[1]]
        if not matches:
            continue
        # smallest html element that contains text (div/span/etc), judged by
        # the length of its visible text
        smallest, markup = min(matches, key=lambda pair: len(pair[0].get_text(strip=True)))
        field_label.append(label)
        html_tag.append(markup)
        text_content.append(text)
        # copy so callers cannot mutate the parsed document through the result
        tag_attrs.append(dict(smallest.attrs))
    return field_label, html_tag, tag_attrs, text_content
    
def clean_csv(input_path='resources_large.csv', output_path='resources_cleaned.csv', max_rows=100, sep=";"):
    """Reduce the raw scraping dataset to the fields needed for training and export it.

    Keeps the id, name, address, zip, phone and website columns, renames
    "Website where info found" to "url" and "address_1" to "address", drops
    rows without a url, prints the resulting shape and writes the result.

    Args:
        input_path: CSV file to read (defaults to the largest scraping dataset).
        output_path: CSV file to write.
        max_rows: Number of leading rows to export; ``None`` exports every row.
            The default of 100 keeps the small testing subset.
        sep: Field delimiter used for both reading and writing.

    Returns:
        None. The cleaned data is written to ``output_path``.
    """
    columns_to_keep = ['resource_id', 'name', 'address_1', "zip", "Phone Number", "Website where info found"]

    # read every column as text so identifiers survive the round trip unchanged
    # (dtype inference would turn a zip such as "02134" into 2134, or into
    # "2134.0" when the column also has missing values)
    df = pd.read_csv(input_path, sep=sep, dtype=str)

    # create a new df with only the essential fields
    df_cleaned = (
        df[columns_to_keep]
        .rename(columns={"Website where info found": "url", "address_1": "address"})
        .dropna(subset=["url"])
    )
    print("Sample dataset size: " + str(df_cleaned.shape))

    # option to export small subset temporarily only for testing
    df_export = df_cleaned if max_rows is None else df_cleaned.head(max_rows)
    df_export.to_csv(output_path, sep=sep, index=False)

def train_svm_model(x, y):
    """Train an SVM on (x, y), then tune it with a grid search, printing both reports.

    Args:
        x: Feature matrix, one row per sample.
        y: Class label for each row of ``x``.

    Prints the classification report of a default ``SVC``, the grid-search
    progress, the best parameters and estimator found, and the classification
    report of the refit best estimator. Returns nothing.
    """
    # hold out 30% of the samples for evaluation (fixed seed for repeatability)
    features_train, features_test, labels_train, labels_test = train_test_split(
        x, y, test_size=0.30, random_state=101
    )

    # baseline: SVC with default hyperparameters
    baseline = svm.SVC()
    baseline.fit(features_train, labels_train)
    print(classification_report(labels_test, baseline.predict(features_test)))

    # hyperparameter tuning: RBF kernel over a grid of C and gamma, 5-fold CV
    search_space = {
        'C': [0.1, 1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
        'kernel': ['rbf'],
    }
    search = GridSearchCV(svm.SVC(), search_space, cv=5, refit=True, verbose=3)
    search.fit(features_train, labels_train)
    print(search.best_params_)
    print(search.best_estimator_)

    # final performance of the refit best estimator on the held-out samples
    print(classification_report(labels_test, search.predict(features_test)))
   

In [8]:
# get rid of rows with empty urls, rename fields, etc
clean_csv()

# extract all raw features
data = {}
file_input = input("Enter a CSV containing clinic data: ")
# newline='' is the mode the csv module asks for (quoted fields may contain
# line breaks); the encoding is pinned because pandas writes UTF-8 by default
# while open() would otherwise fall back to the platform's locale encoding
with open(file_input, mode='r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter=";")
    # print field types
    header = next(reader, None)
    if header:
        print(f"Fields: {header}")
    # exclude last index since it's URL; an empty file has no header at all
    field_len = len(header) - 1 if header else 0

    # extract labeled input for SVM
    field_label = []    # supervised data: column position of the value within its row
    text_content = [] # raw text
    text_len = [] # length of the text
    text_type = [] # string, number, or mixed
    text_hyphens = [] # number of hyphens
    for row in reader:
        # csv.reader yields an empty list for a blank line
        if not row:
            continue
        print("Extracting features for clinic id " + row[0])
        # skip the id (first column) and the url (last column); the label is
        # the column position, taken from enumerate because row.index() returns
        # the first match and mislabels repeated or empty field values
        for column, element in enumerate(row[1:-1], start=1):
            field_label.append(column)
            text_content.append(element)
            print(element)


Sample dataset size: (4475, 6)
Fields: ['resource_id', 'name', 'address', 'zip', 'Phone Number', 'url']
Extracting features for clinic id 3
A.J. Thomas: Medi-Cal Clinic
0615 E 14th St
94621
(510) 835-9610
Extracting features for clinic id 13
City of Oakland - Human Services
1 Frank H Ogawa Plz
94612
(510) 238-3121
Extracting features for clinic id 14
Hoag Addiction Treatment Center - Newport Beach
1 Hoag Dr
92663
(888) 491-7124
Extracting features for clinic id 16
Sutter Roseville Medical Center - Birth Center
1 Medical Plaza Dr
95661
(916) 781-1517
Extracting features for clinic id 17
Sutter Roseville Wound Care
1 Medical Plaza Dr
95661
(916) 781-1386
Extracting features for clinic id 18
Blue Oak Dental
1 Medical Plaza Dr
95661
(916) 786-6777
Extracting features for clinic id 20
Kaiser Permanente - Addiction Medicine and Recovery Services
1 Quality Dr
95688
(707) 624-2830
Extracting features for clinic id 21
Kaiser Permanente - Pharmacy
1 Quality Dr
95688
(707) 624-2500
Extracting fea

In [65]:
# sanity check: how many labelled samples were collected
print(f"Number of samples: {len(field_label)}")

Number of samples: 7478


In [66]:
# convert features to vectors that can be used in SVM

# NOTE(review): html_tag and tag_attrs are not assigned in any cell visible in
# this notebook; presumably they were collected with find_divtag in a cell that
# is no longer here -- confirm, otherwise this cell fails on a fresh kernel.

# numeric representation of the raw html tags (token counts per sample)
# NOTE(review): .toarray() materialises the whole count matrix densely; with a
# large vocabulary this can need several GB of memory -- TODO confirm it fits.
vectorizer = CountVectorizer()
html_features = vectorizer.fit_transform(html_tag).toarray()

# extract relevant attribute features that's indicative of field classification
# (assumes tag_attrs is a list of attribute dicts, giving one column per
# attribute name -- TODO confirm)
df = pd.DataFrame(tag_attrs)
# NOTE: these two options are global and remove the display limits for every
# DataFrame shown afterwards in this kernel
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# order the attribute columns from fewest to most missing values; only the
# ordering is computed here, no columns are dropped or selected
nan_counts = df.isna().sum()
count_sort_col = nan_counts.sort_values().index
#print(df_ordered["name"])

In [67]:
# reorder the attribute columns (fewest missing values first) and cast every
# cell to text; presumably missing entries end up as the string "nan", which
# later cells drop as a column -- TODO confirm for the installed pandas version
df_ordered = df.loc[:, count_sort_col].astype(str)

In [68]:
# This cell was commented out, yet later cells use both `mlb` and
# `class_labels`; with it disabled they raise NameError on a fresh kernel.
mlb = MultiLabelBinarizer()

# create one hot encoding of attribute "class"
# each cell is the text form of the tag's class list; strip the brackets and
# split on commas to get one token per class name
# NOTE(review): tokens keep their quotes and leading spaces, so the same class
# in first vs. later position yields different tokens -- TODO confirm intended
mask = df_ordered['class'].notnull()
arr = mlb.fit_transform(df_ordered.loc[mask, 'class'].dropna().str.strip('[]').str.split(','))
class_labels = (pd.DataFrame(arr, index=df_ordered.index[mask], columns=mlb.classes_)
              .reindex(df_ordered.index, fill_value=0))
# "nan" is presumably the placeholder for tags without a class attribute;
# errors='ignore' keeps the cell working when no such placeholder exists
class_labels = class_labels.drop('nan', axis=1, errors='ignore')


In [69]:
# create one hot encoding of attribute "type"

# own binarizer instance, so this cell does not rely on one left behind by
# another cell
mlb = MultiLabelBinarizer()

# MultiLabelBinarizer expects an iterable of label collections, so wrap each
# value in a one-element list. The wrapped values are kept in a separate
# variable instead of being written back into df_ordered: overwriting the
# column made the cell fail when it was run a second time.
type_values = [[value] for value in df_ordered["type"].fillna("nan")]
name_labels = pd.DataFrame(
    mlb.fit_transform(type_values), columns=mlb.classes_, index=df_ordered.index
).drop("nan", axis=1, errors="ignore")  # "nan" marks tags without a type attribute

# print(html_features.shape)
# print(class_labels.shape)
# print(name_labels.shape)

(7478, 124511)
(7478, 1698)
(7478, 15)


In [70]:
# create features from text content
# Create a TF-IDF vectorizer
# NOTE: this rebinds the notebook-level name `vectorizer`, which an earlier
# cell assigned to the CountVectorizer fitted on html_tag
vectorizer = TfidfVectorizer()

# Fit and transform the text data (one row per entry of text_content)
text_representation = vectorizer.fit_transform(text_content)
# dense copy of the TF-IDF matrix; a later cell indexes text_features row by row
text_features = text_representation.toarray()
print(text_features.shape)


(7478, 4218)


In [71]:
# reduce dimension of html_features, keeping enough components to explain 95%
# of the variance
pca = PCA(n_components=0.95)
html_features_reduced = pca.fit_transform(html_features)

# join all four feature blocks column-wise: reduced html counts, class one-hot,
# type one-hot and TF-IDF text features
feature_blocks = [
    html_features_reduced,
    class_labels.to_numpy(),
    name_labels.to_numpy(),
    text_features,
]
# every block must describe the same samples in the same order
assert len({block.shape[0] for block in feature_blocks}) == 1, "feature blocks differ in row count"
# a single hstack replaces the per-row .iloc/.tolist() concatenation; the
# result is a 2-D ndarray rather than a list of lists, which train_test_split
# and SVC accept equally
feature_matrix = np.hstack(feature_blocks)

# Standardize the feature matrix (seems to make accuracy worse in this case, prob since the features aren't linearly correlated)
# feature_matrix = np.array(feature_matrix)
# feature_matrix_scaled = StandardScaler().fit_transform(feature_matrix)

In [None]:
# report (samples, components) of the PCA-reduced html features
# NOTE(review): the output recorded under this cell shows 861 rows plus an
# array dump, while other recorded outputs report 7478 samples -- presumably
# left over from an earlier run; TODO re-run to refresh it
print(np.shape(html_features_reduced))

(861, 4)
[[-137.35467461  -60.86861489  -19.99540301  -21.05451257]
 [-135.81921771  -57.55500521   -5.33546479  -21.2092331 ]
 [-137.0775953   -60.81590577  -19.92451989  -20.7604837 ]
 ...
 [-137.33053281  -60.81907815  -19.94491252  -21.02805763]
 [-136.54198078  -60.63692226  -19.72411718  -20.084282  ]
 [-136.54198078  -60.63692226  -19.72411718  -20.084282  ]]


In [72]:
# fit and evaluate the support vector machine on the assembled features
train_svm_model(x=feature_matrix, y=field_label)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.48      0.18      0.26       469
           1       0.00      0.00      0.00       611
           2       0.32      0.94      0.48       716
           3       0.00      0.00      0.00       448

    accuracy                           0.34      2244
   macro avg       0.20      0.28      0.19      2244
weighted avg       0.20      0.34      0.21      2244

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.463 total time= 1.4min
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.482 total time= 1.4min
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.486 total time= 1.5min
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.497 total time= 1.4min
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.489 total time= 1.4min
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.420 total time= 1.5min
[CV 2/5] END ....