In [1]:
# --- Imports and one-time resource loading for the whole notebook ---
from bs4 import BeautifulSoup
import bs4 as bs4
from urllib.parse import urlparse
import requests
import nltk 
# Fetch the NLTK `words` corpus; clean_text reads it through nltk.corpus.words.
nltk.download('words')
import re
import spacy
# spaCy pipeline that clean_text uses for tokenisation, stop-word flags and lemmas.
nlp = spacy.load("en_core_web_sm")
import joblib
# Fitted classifier whose .predict() is called on the cleaned page text.
# The path is relative, so the file must sit in the notebook's working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load a model file that comes from a trusted source.
pipeline = joblib.load('NLP_pipline.sav')
# Human-readable category names. The position in this list is the integer label
# returned by pipeline.predict (the notebook looks names up as classes[p[0]]),
# so the order must not be changed.
classes = [
    'Adult',                            # 0
    'Business/Corporate',               # 1
    'Computers and Technology',         # 2
    'E-Commerce',                       # 3
    'Education',                        # 4
    'Food',                             # 5
    'Forums',                           # 6
    'Games',                            # 7
    'Health and Fitness',               # 8
    'Law and Government',               # 9
    'News',                             # 10
    'Photography',                      # 11
    'Social Networking and Messaging',  # 12
    'Sports',                           # 13
    'Streaming Services',               # 14
    'Travel',                           # 15
]

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\muner\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
  return torch._C._cuda_getDeviceCount() > 0


In [2]:
def get_text_content(soup):
    '''Return the visible text of a parsed page as one space-separated string.

    Every text node of ``soup`` is visited. A node is skipped when it is an
    HTML comment, when its parent tag is listed in ``tags_to_ignore``, or when
    its stripped text is empty or purely numeric. Headings and the title are
    ignored here on purpose: ``scraping`` collects them separately.

    Parameters:
        soup: a parsed BeautifulSoup document (anything exposing ``find_all``).

    Returns:
        str: the kept text fragments, each stripped, joined with single spaces.
    '''
    tags_to_ignore = {'style', 'script', 'head', 'title', 'meta', '[document]',
                      'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'noscript'}
    result = []
    # `string=True` selects every text node; it is the current name of the
    # deprecated `text=True` keyword.
    for node in soup.find_all(string=True):
        if isinstance(node, bs4.element.Comment) or node.parent.name in tags_to_ignore:
            continue
        stripped = node.strip()
        if stripped and not stripped.isnumeric():
            result.append(stripped)
    return ' '.join(result)
    
def scraping(URL):
    '''Download a page and gather the text that is later cleaned and classified.

    The page is fetched with a 60 second timeout and parsed with the ``lxml``
    parser. Four pieces of text are collected, in this order: the ``<title>``,
    the ``content`` of ``<meta name="keywords">`` / ``<meta name="description">``
    tags, the text of all ``h1``-``h6`` headings, and the remaining body text
    from ``get_text_content``.

    Parameters:
        URL (str): address of the page to fetch.

    Returns:
        dict with keys
            "website_url":  the URL as given,
            "website_name": the second-to-last dot-separated label of the
                            URL's network location (``example`` for
                            ``www.example.com``); for a host without a dot
                            the whole network location is used,
            "website_text": the non-empty text pieces joined by single spaces.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts, ...).
        The HTTP status code is not checked, so error pages are scraped too.
    '''
    content = requests.get(URL, timeout=60).content
    soup = BeautifulSoup(content, "lxml")

    # Plain boolean `and` (short-circuiting) instead of bitwise `&`.
    metatags = soup.find_all(
        lambda tag: tag.name == "meta" and tag.has_attr('name') and tag.has_attr('content'))
    metacontent = [str(tag["content"]) for tag in metatags
                   if tag["name"] in ('keywords', 'description')]

    headtags = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
    headcontent = [" ".join(tag.stripped_strings) for tag in headtags]

    # Pages without a <title> make soup.title None; treat that as an empty title.
    title_tag = soup.title
    title_text = title_tag.get_text(separator='. ') if title_tag is not None else ''

    # Hosts such as "localhost" have no second-to-last label; fall back to the host.
    host_labels = urlparse(URL).netloc.split(".")
    website_name = host_labels[-2] if len(host_labels) >= 2 else host_labels[0]

    # Join the sections with a space so the last word of one section does not
    # fuse with the first word of the next one.
    sections = [title_text, ' '.join(metacontent), ' '.join(headcontent), get_text_content(soup)]
    return {
        "website_url": URL,
        "website_name": website_name,
        "website_text": ' '.join(section for section in sections if section),
    }

In [3]:
_ENGLISH_VOCABULARY = None  # filled on first use by _english_vocabulary()


def _english_vocabulary():
    '''Return the NLTK ``words`` corpus as a set, building it only once.'''
    global _ENGLISH_VOCABULARY
    if _ENGLISH_VOCABULARY is None:
        # Converting the corpus list to a set is costly; do it once, not per document.
        _ENGLISH_VOCABULARY = frozenset(nltk.corpus.words.words())
    return _ENGLISH_VOCABULARY


def clean_text(doc):
    '''
    Normalise raw page text into a space-separated string of lowercase lemmas.

    The text is run through the spaCy pipeline ``nlp``. A token is dropped when
    it is a stop word, is punctuation, is purely numeric, is not alphanumeric,
    or is the literal "nan". Each remaining token is replaced by its lowercased,
    stripped lemma. The lemmas are then re-tokenised with
    ``nltk.wordpunct_tokenize``; purely alphabetic pieces are kept only if they
    appear in the NLTK ``words`` corpus, while non-alphabetic pieces are kept
    unchanged.

    Parameters:
        doc (str): raw text to clean.

    Returns:
        str: the surviving tokens joined by single spaces.
    '''
    vocabulary = _english_vocabulary()
    exclusion_list = {"nan"}

    lemmas = []
    for token in nlp(doc):
        if (token.is_stop or token.is_punct or token.text.isnumeric()
                or not token.text.isalnum() or token.text in exclusion_list):
            continue
        lemmas.append(str(token.lemma_.lower().strip()))

    # An empty lemma would leave a double space after joining; collapse those.
    text = re.sub(" +", " ", " ".join(lemmas))
    kept = [w for w in nltk.wordpunct_tokenize(text)
            if w.lower() in vocabulary or not w.isalpha()]
    return " ".join(kept)

In [4]:
def beautiful_predict(URL):
    '''Scrape URL, classify its cleaned text and print the category name.

    The page text from ``scraping`` is passed through ``clean_text`` and then
    to ``pipeline.predict``; the first predicted label is used as an index
    into ``classes``. Nothing is returned; the result is only printed.
    '''
    page = scraping(URL)
    cleaned = clean_text(page["website_text"])
    predicted = pipeline.predict([cleaned])
    label_index = predicted[0]
    print('The website category is:', classes[label_index])

In [5]:
def edu_blocked(URL):
    '''Return 1 if the page at URL is predicted to be in a flagged category, else 0.

    The page is scraped, cleaned and classified exactly as in
    ``beautiful_predict``. The flagged label indices and their names in the
    ``classes`` list are: 0 Adult, 3 E-Commerce, 6 Forums, 7 Games,
    12 Social Networking and Messaging, 14 Streaming Services, 15 Travel.
    NOTE(review): presumably these are the categories to block in an
    educational setting, going by the function name - confirm.
    '''
    flagged_labels = (0, 3, 6, 7, 12, 14, 15)
    page = scraping(URL)
    cleaned = clean_text(page["website_text"])
    label = pipeline.predict([cleaned])[0]
    return 1 if label in flagged_labels else 0

In [6]:
beautiful_predict('https://openai.com/blog/chatgpt/')

The website category is: Computers and Technology


In [7]:
beautiful_predict('https://www.who.int/')

The website category is: Health and Fitness


In [8]:
beautiful_predict('https://www.iau.edu.sa/en')

The website category is: Education


In [9]:
beautiful_predict('https://scikit-learn.org/stable/')

The website category is: Computers and Technology


In [10]:
beautiful_predict('https://github.com/')

The website category is: Computers and Technology


In [11]:
beautiful_predict('https://www.allrecipes.com/recipes/')

The website category is: Food


In [12]:
beautiful_predict('https://www.reddit.com/')

The website category is: Forums


In [6]:
beautiful_predict('https://www.moh.gov.sa/en/Pages/default.aspx')

The website category is: Law and Government
