In [1]:
import re
from typing import Union

from bs4 import BeautifulSoup
import pandas as pd

# Load Data

In [2]:
# Read the scraped pages from the CSV export; the bare name on the last line displays the frame
df = pd.read_csv(filepath_or_buffer="scraped_data/scraped_data.csv")
df

Unnamed: 0,web_url,is_gambling_site,is_error,error_scraping_desc,scraped_elements,exception_raised,scraping_initiation_time
0,https://www.nikosgreekrestaurant.com/,1,False,,"<html lang=""en-US"" dir=""ltr""><head><title>Just...",,2024-09-23 15:27:17.455269
1,https://www.myrkurmusic.com/,1,False,,"<html lang=""id""><head>\n <meta http-equiv=""C...",,2024-09-23 15:27:28.813171
2,https://www.hookedpierbar.com/,1,False,,"<html class=""no-js"" lang=""en-US""><!--<![endif]...",,2024-09-23 15:27:39.131279
3,https://electricfeelgood.com/,1,False,,"<html ⚡="""" lang=""id"" amp-version=""240906104400...",,2024-09-23 15:27:51.620380
4,https://thedinerqc.com/,1,False,,"<html xmlns:wormhole=""http://www.w3.org/1999/x...",,2024-09-23 15:28:12.225006
...,...,...,...,...,...,...,...
58,https://mediaindonesia.com/politik-dan-hukum/6...,0,False,,"<html lang=""id""><head>\n<meta charset=""UTF-8"">...",,2024-09-23 16:10:43.716552
59,https://www.cnbcindonesia.com/news/20220802184...,0,False,,"<html lang=""id"" class=""scroll-smooth"" data-cri...",,2024-09-23 16:10:56.539106
60,https://www.bbc.com/indonesia/articles/cg66wyd...,0,False,,"<html lang=""id"" class="""" dir=""ltr""><head><titl...",,2024-09-23 16:11:09.260479
61,https://www.inilah.com/11-cara-menghilangkan-k...,0,False,,"<html lang=""en-US"" dir=""ltr""><head><title>Just...",,2024-09-23 16:11:19.870977


# Data Cleaning

In [3]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched
df_clean = df.copy(deep=True)

## Remove rows with error

In [4]:
# Keep only the rows whose `is_error` flag is False.
# The explicit .copy() makes df_clean an independent frame, so assigning a new column to it
# in a later cell does not trigger pandas' SettingWithCopyWarning.
df_clean = df_clean.loc[df_clean["is_error"].eq(False)].copy()

In [5]:
# Preview the first five remaining rows
df_clean.head(n=5)

Unnamed: 0,web_url,is_gambling_site,is_error,error_scraping_desc,scraped_elements,exception_raised,scraping_initiation_time
0,https://www.nikosgreekrestaurant.com/,1,False,,"<html lang=""en-US"" dir=""ltr""><head><title>Just...",,2024-09-23 15:27:17.455269
1,https://www.myrkurmusic.com/,1,False,,"<html lang=""id""><head>\n <meta http-equiv=""C...",,2024-09-23 15:27:28.813171
2,https://www.hookedpierbar.com/,1,False,,"<html class=""no-js"" lang=""en-US""><!--<![endif]...",,2024-09-23 15:27:39.131279
3,https://electricfeelgood.com/,1,False,,"<html ⚡="""" lang=""id"" amp-version=""240906104400...",,2024-09-23 15:27:51.620380
4,https://thedinerqc.com/,1,False,,"<html xmlns:wormhole=""http://www.w3.org/1999/x...",,2024-09-23 15:28:12.225006


## Extract body only

In [6]:
def extractElement(html_text: str, html_element: str) -> str:
	"""
	Extract the first `html_element` element of an HTML document, including its tags, using BS4

		params:
			html_text (str): html text
			html_element (str): name of the element to extract from `html_text`
		return:
			the first matching element serialized as a string,
			or an empty string if `html_text` contains no such element
	"""

	element = BeautifulSoup(html_text, "lxml").find(html_element)

	# `find` returns None when nothing matches; str(None) would leak the literal
	# text "None" into the dataset, so a missing element is reported as an empty string
	if element is None:
		return ""

	return str(element)

In [7]:
# Store the <body> element of every scraped page in a new `extracted_body` column
df_clean["extracted_body"] = df_clean["scraped_elements"].apply(
	lambda page_html: extractElement(page_html, html_element="body")
)

In [8]:
# Inspect the extracted body of the row labelled 0
df_clean.at[0, "extracted_body"]

'<body class="no-js"><div class="main-wrapper" role="main"><div class="main-content"><h1 class="zone-name-title h1">www.nikosgreekrestaurant.com</h1><h2 class="h2 spacer-bottom" id="AnPU2">Verify you are human by completing the action below.</h2><div id="JStsl2" style="display: grid;"><div><div><input id="cf-chl-widget-xy0lv_response" name="cf-turnstile-response" type="hidden"/></div></div></div><div class="spacer loading-spinner" id="Psjc8" style="display: none; visibility: hidden;"><div class="lds-ring"><div></div><div></div><div></div><div></div></div></div><div class="core-msg spacer spacer-top" id="UJXr0">www.nikosgreekrestaurant.com needs to review the security of your connection before proceeding.</div><div id="BQxuW7" style="display: none;"><div class="h2" id="challenge-success-text">Verification successful</div><div class="core-msg spacer">Waiting for www.nikosgreekrestaurant.com to respond...</div></div><noscript><div class="h2"><span id="challenge-error-text">Enable JavaScri

## Filter Important Columns

In [9]:
# Keep only the extracted body and the label; every other column is dropped from here on
df_clean = df_clean.loc[:, ["extracted_body", "is_gambling_site"]]

## Row Filtering

### Remove data points that were blocked by Cloudflare

In [10]:
# Show the rows whose body mentions cloudflare.com (case-insensitive).
# regex=False matches the text literally; as a regex the "." would match any character.
df_clean.loc[df_clean["extracted_body"].str.contains("cloudflare.com", case=False, regex=False)]

Unnamed: 0,extracted_body,is_gambling_site
0,"<body class=""no-js""><div class=""main-wrapper"" ...",1
2,"<body>\n<div id=""cf-wrapper"">\n<div class=""cf-...",1
7,"<body class=""no-js""><div class=""main-wrapper"" ...",1
15,"<body class=""no-js""><div class=""main-wrapper"" ...",1
20,"<body>\n<div id=""cf-wrapper"">\n<div class=""cf-...",1
49,"<body class=""articles show category-bisnis imm...",0
53,"<body class=""no-js""><div class=""main-wrapper"" ...",0
61,"<body class=""no-js""><div class=""main-wrapper"" ...",0


In [11]:
# Drop every row whose body mentions cloudflare.com (case-insensitive).
# regex=False matches the text literally; as a regex the "." would match any character.
df_clean = df_clean.drop(
	index = df_clean.loc[
		df_clean["extracted_body"].str.contains("cloudflare.com", case=False, regex=False)
	].index
)

In [12]:
# Preview the first five rows that survived the Cloudflare filter
df_clean.head(n=5)

Unnamed: 0,extracted_body,is_gambling_site
1,"<body>\n<main>\n<div class=""mputama"">\n<img al...",1
3,"<body class=""amp-dark-mode amp-mode-mouse"" sty...",1
4,"<body data-spm=""pdp_revamp"" style=""overflow-y:...",1
5,"<body>\n<main>\n<div class=""mputama"">\n<img al...",1
6,"<body class=""modal-open"" style=""--expand-icon-...",1


## Text Cleaning

In [None]:
def removeScriptContents(text: str) -> str:
	"""
	Replace every <script> element (the tags and everything between them) with a whitespace

	The opening tag may carry attributes (e.g. <script type="text/javascript" src="...">),
	the tag name is matched case-insensitively and the script body may span several lines

		param:
			text (str): text to clean
		return:
			new text without scripts tags and contents
	"""

	# \b[^>]* accepts attributes on the opening tag; DOTALL lets ".*?" cross new lines
	return re.sub(r"<script\b[^>]*>.*?</script\s*>", " ", text, flags=re.IGNORECASE | re.DOTALL)

def removeHTMLTag(text: str) -> str:
	"""
	Strip HTML tags (anything between `<` and the next `>` on the same line) from the text,
	keeping the content found between the tags

	-------------
	Ex:
	-------------
	<body>
	lorem ipsum
	dolor sit amet
	</body>

	-------------
	Return:
	-------------
	lorem ipsum
	dolor sit amet

		param:
			text (str): text to clean
		return:
			new text without HTML tags
	"""

	tag_pattern = re.compile("<.*?>")
	without_tags = tag_pattern.sub("", text)

	return without_tags

def removeNewLine(text: str) -> str:
	"""
	Delete every new line character (\\n) from the text; the surrounding pieces are glued
	together without any separator

		param:
			text (str): text to clean
		return:
			new text without \\n
	"""

	pieces = text.split("\n")

	return "".join(pieces)

def removeContentsInsideCurlyBrackets(text: str) -> str:
	"""
	Clean text by removing curly brackets and any contents within them,
	including nested brackets and contents that span several lines

	-------------
	Ex:
	-------------
	{
		"vars": {
			"gtag_id": "UA-241896104-1",
			...
		}
	}

	<body>
		....
	</body>

	-------------
	Return:
	-------------
	<body>
		....
	</body>

		params:
			text (str): text to clean
		return:
			new text without curly brackets and any contents within them;
			a bracket that has no matching partner is left untouched
	"""

	# Matches a pair of brackets that contains no other bracket, i.e. the innermost level
	innermost_pair = re.compile(r"\{[^{}]*\}")

	# Peel the nesting one level per pass; a single non-greedy "{.*?}" would stop at the
	# first "}" and leave the tail of every nested object behind
	previous = None
	while previous != text:
		previous = text
		text = innermost_pair.sub("", text)

	return text

def removeAllExceptAlphabetPeriod(text: str) -> str:
	"""
	Keep only alphabets and periods [a-zA-Z.]; every run of other characters
	is replaced with a single whitespace

		param:
			text (str): text to clean
		return:
			new text contains alphabets and periods
	"""

	# Splitting on the unwanted runs and joining with " " swaps each run for one whitespace
	kept_pieces = re.split("[^a-zA-Z.]+", text)

	return " ".join(kept_pieces)

def removeWhiteSpacesBeforeAfterPeriod(text: str) -> str:
	"""
	Remove whitespaces before and after the period, but the period char preserved

	-------------
	Ex:
	-------------
	foo . buzz
	buzz. foo

	-------------
	Return:
	-------------
	foo.buzz
	buzz.foo

		param:
			text (str): text to clean
		return:
			new text that doesn't contain any whitespaces before and after period
	"""

	# Raw string: "\s" and "\." are invalid escape sequences in a normal string literal.
	# The lookahead/lookbehind keep the period itself out of the match, so only the
	# whitespace run next to it is deleted
	return re.sub(r"\s+(?=\.)|(?<=\.)\s+", "", text)

def removePeriod(text: str) -> str:
	"""
	Replace each period, together with the whitespaces directly around it, with a whitespace

	Consecutive periods are replaced one by one, so "a..b" becomes "a  b" (two whitespaces)

		param:
			text (str): text to clean
		return:
			new text without periods
	"""

	# Raw string: "\s" and "\." are invalid escape sequences in a normal string literal
	return re.sub(r"\s*\.\s*", " ", text)

def removeJqueryFunctionSelector(text: str) -> str:
	"""
	Replace jquery function and selector strings inside the text with a whitespace using regex.
	The regex matches a string that starts with the word function or a dollar sign ($)
	and ends at the first semicolon (;) that follows it (a run of semicolons is consumed as one)

	-------------
	Ex:
	-------------
	function foo(buzz) buzz.lower();' Lorem ipsum
	$(bazz).window{lorem ipsum};' dolor sit amet

	-------------
	Return:
	-------------
	 ' Lorem ipsum
	 ' dolor sit amet

		param:
			text (str): text to clean
		return:
			new text without jquery function and selector strings
	"""

	# ".*?" is non-greedy on purpose: a greedy ".*" runs up to the LAST semicolon of the
	# line and would wipe out all the regular text found between two statements
	return re.sub(r"(?:function|\$).*?;+", " ", text)

def removeURL(text: str) -> str:
	"""
	Replace all urls with whitespace

	A url is anything that starts with www, http or https and runs until the next whitespace

		param:
			text (str): text to clean
		return:
			new text without any urls
	"""

	# Raw string: "\S" is an invalid escape sequence in a normal string literal.
	# "https?" covers both http and https, the trailing \S* consumes the rest of the url
	return re.sub(r"(?:www|https?)\S*", " ", text)

def removeRepeatingWhitespace(text: str) -> str:
	"""
	Replace repeating whitespaces (at least 2) with a whitespace

	A single whitespace character is left as it is, whatever its kind (space, tab, ...)

		param:
			text (str): text to clean
		return:
			new text without repeating whitespaces
	"""

	# Raw string: "\s" is an invalid escape sequence in a normal string literal
	return re.sub(r"\s{2,}", " ", text)

def textToLower(text: str) -> str:
	"""
	Lowercase the whole text

		param:
			text (str): text to transform into lowercase
		return:
			the same text with every character in lowercase
	"""

	lowered_text = text.lower()

	return lowered_text

def stripText(text: str) -> str:
	"""
	Trim the whitespaces found at the beginning and at the end of the text

		param:
			text (str): text to clean
		return:
			text without leading and trailing whitespaces
	"""

	trimmed_text = text.strip()

	return trimmed_text

def removeTechnicalWords(text: str) -> str:
	"""
	Remove technical words that don't represent the characteristics of both gambling and non-gambling sites

	The whole whitespace-delimited token is removed as soon as it CONTAINS one of the
	technical words, so regular words such as "description" (contains "script") are removed too

	Technical word list:
		- https
		- http
		- www
		- true
		- integrity
		- login
		- window
		- cmd
		- svg
		- config
		- style
		- loader
		- network
		- pixel
		- css
		- script
		- loading
		- fn
		- var
		- const
		- url

	-------------
	Ex:
	-------------
	Indonesiaurlhttpsthedinerqccomproductssamsungtinchd lorem ipsum
	cfExtPritruecfLtrueversiontokenaacaaaafeb dolor sit amet

	-------------
	Return:
	-------------
	 lorem ipsum
	 dolor sit amet

		param:
			text (str): text to clean
		return:
			new text after removing technical words
	"""

	# Raw string: "\S" is an invalid escape sequence in a normal string literal.
	# The \S* on both sides widen the match to the whole token that holds the technical word
	return re.sub(
		r"\S*(?:https|http|www|true|integrity|login|window|cmd|svg|config|style|loader|network|pixel|css|script|loading|fn|var|const|url)\S*",
		"",
		text,
	)

# def addPeriodAtEndofText(text: Union[str, None]) -> Union[str, None]:
# 	"""
# 	Add period sign (if there isn't any) at the end of text to indicate the sentence is ended
# 	Skip the process if the text is None
	
# 	-------------
# 	Ex:
# 	-------------
# 	Commodo excepteur elit consequat ullamco.
# 	Culpa officia nulla Lorem mollit enim veniam cillum
# 	None

# 	-------------
# 	Return:
# 	-------------
# 	Commodo excepteur elit consequat ullamco.
# 	Culpa officia nulla Lorem mollit enim veniam cillum.
# 	None
	
# 		param:
# 			text (str): text to clean
# 		return:
# 			new text that has period at the end of it
# 	"""
	
# 	if text == None:
# 		return None
# 	elif text[-1] == ".":
# 		return text
	
# 	return text+"."

def castEmptyTextToNone(text: str) -> Union[str, None]:
	"""
	Turn an empty string into None and leave any other text as it is

		param:
			text (str): text to be casted
		return:
			None if the text is an empty string, otherwise the text itself
	"""

	if text == "":
		return None

	return text

def preprocessText(text: Union[str, None]) -> Union[str, None]:
	"""
	Clean the text by applying the text cleaning pipeline
	Skip the process if the text is None

		param:
			text (str | None): text to clean
		return:
			cleaned text, or None if the input is None or nothing is left after cleaning
	"""

	if text is None:
		return None

	# The order matters: each step works on the output of the previous one
	cleaning_steps = (
		textToLower,
		removeNewLine,
		removeScriptContents,
		removeHTMLTag,
		removeJqueryFunctionSelector,
		removeURL,
		removeContentsInsideCurlyBrackets,
		removeAllExceptAlphabetPeriod,
		removeTechnicalWords,
		removeWhiteSpacesBeforeAfterPeriod,
		removePeriod,
		# Collapse whitespaces only once the periods are gone: removePeriod turns consecutive
		# periods ("a..b") into consecutive whitespaces, which would otherwise reach the output
		removeRepeatingWhitespace,
		stripText,
		castEmptyTextToNone,
	)

	cleaned_text = text
	for step in cleaning_steps:
		cleaned_text = step(cleaned_text)

	return cleaned_text

In [28]:
def extractNamePropertyValues(text: str) -> Union[str, None]:
	"""
	Extract the values of the `"name"` properties from the text, then concatenate them into a sentence

	Whitespaces around the colon are accepted, so both `"name":"foo"` and `"name": "foo"` match.
	Only alphabets and whitespaces are kept from each value

		param:
			text (str): text to clean
		return:
			name properties formed into a sentence,
			or None if the sentence is 5 characters long or shorter
	"""

	# The capture group holds the value without its quotes; \s* tolerates `"name": "value"`
	name_prop_vals = re.findall(r'"name"\s*:\s*"([^"]*)"', text)

	# remove all chars except a-zA-Z and whitespace
	cleaned_vals = [re.sub(r"[^a-zA-Z ]+", "", name_val).strip() for name_val in name_prop_vals]

	combined_vals = " ".join(cleaned_vals)

	# return None if the combined name property values are 5 characters or shorter
	# (this also covers the case where no value was found at all)
	if len(combined_vals) <= 5:
		return None

	return combined_vals

def textPreprocessPipeline(text: str) -> Union[str, None]:
	"""
	Pipeline to preprocess the text with following steps:
	1. Extract name attributes then clean the text
	2. Extract values from body element then clean the text
	3. Merge these two text

		param:
			text (str): text to clean
		return:
			cleaned name attributes followed by the cleaned body text (separated by a whitespace),
			only one of them if the other is None, or None if both are None
	"""

	extracted_name_vals = preprocessText(extractNamePropertyValues(text))
	extracted_body_vals = preprocessText(text)

	# Keep whichever parts exist. The text must be discarded only when BOTH parts are None:
	# a page without any "name" property still has a usable body text
	available_parts = [
		part for part in (extracted_name_vals, extracted_body_vals) if part is not None
	]

	if not available_parts:
		return None

	return " ".join(available_parts)

In [29]:
# Run the preprocessing pipeline on every extracted body and store the result in `cleaned_text`
df_clean["cleaned_text"] = df_clean["extracted_body"].map(textPreprocessPipeline)

In [35]:
# Export the cleaned text with its label, skipping the rows that contain a missing value
(
	df_clean
	.loc[:, ["cleaned_text", "is_gambling_site"]]
	.dropna()
	.to_csv("dataset/cleaned_text.csv", index=False)
)