In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [None]:
# Web scraping: download the article and parse it into a BeautifulSoup tree.
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
# Use the response as a context manager so the HTTP connection is always
# closed, and set a timeout so a stalled server cannot hang the kernel.
with urlopen(url, timeout=30) as response:
    html = response.read()
soup = BeautifulSoup(html, features="html.parser")
print(soup)

<!DOCTYPE html public "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">

<html>
<head>
<title>BBC NEWS | Health | Blondes 'to die out in 200 years'</title>
<meta content="BBC, News, BBC News, news online, world, uk, international, foreign, british, online, service" name="keywords"/>
<meta content="2002/09/27 11:51:55" name="OriginalPublicationDate"/>
<meta content="/1/hi/health/2284783.stm" name="UKFS_URL"/>
<meta content="/2/hi/health/2284783.stm" name="IFS_URL"/>
<meta content="text/html;charset=iso-8859-1" name="HTTP-EQUIV"/>
<meta content="Blondes 'to die out in 200 years'" name="Headline"/>
<meta content="Health" name="Section"/>
<meta content="Natural blondes are an endangered species and will die out by 2202, a study suggests." name="Description"/>
<!-- GENMaps-->
<map name="banner">
<area alt="BBC NEWS" coords="7,9,167,32" href="http://news.bbc.co.uk/1/hi.html" shape="RECT"/>
</map>
<script language="JavaScript" src="/nol/shared/js/livestats_v

In [None]:
# Detach every <script> and <style> element so their contents do not leak
# into the extracted text.
for tag in soup.find_all(["script", "style"]):
    tag.extract()    # remove the element from the tree

# From here on `soup` holds the plain-text content of the page (a str).
soup = soup.get_text()

In [None]:
# Trim leading and trailing whitespace from every line of the page text
lines = map(str.strip, soup.splitlines())
# Lines holding several headlines separate them with a double space;
# split those apart and trim each piece
chunks = (piece.strip() for row in lines for piece in row.split("  "))
# Keep only the non-empty pieces, one per output line
text = '\n'.join(filter(None, chunks))
print(text)

BBC NEWS | Health | Blondes 'to die out in 200 years'
NEWS
SPORT
WEATHER
WORLD SERVICE
A-Z INDEX
SEARCH
You are in: Health
News Front Page
Africa
Americas
Asia-Pacific
Europe
Middle East
South Asia
UK
Business
Entertainment
Science/Nature
Technology
Health
Medical notes
-------------
Talking Point
-------------
Country Profiles
In Depth
-------------
Programmes
-------------
SERVICES
Daily E-mail
News Ticker
Mobile/PDAs
-------------
Text Only
Feedback
Help
EDITIONS
Change to UK
Friday, 27 September, 2002, 11:51 GMT 12:51 UK
Blondes 'to die out in 200 years'
Scientists believe the last blondes will be in Finland
The last natural blondes will die out within 200 years, scientists believe.
A study by experts in Germany suggests people with blonde hair are an endangered species and will become extinct by 2202.
Researchers predict the last truly natural blonde will be born in Finland - the country with the highest proportion of blondes.
The frequency of blondes may drop but they won't disap

In [None]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import CountVectorizer
import spacy

def cleaning(data):
    """Normalize raw text into a de-duplicated list of lemmatized tokens.

    Steps, in order: lowercase, strip HTML markup, drop links and e-mail
    addresses, replace punctuation with spaces, drop ASCII digits and
    non-ASCII characters, spell-correct each word, remove English stop
    words, lemmatize with spaCy, and de-duplicate.

    Uses the NLTK English stop-word list and the spaCy ``en_core_web_sm``
    model, so both must be available in the environment.

    Args:
        data: Text to clean (plain text or HTML markup), as a ``str``.

    Returns:
        list: Unique lemmas as strings, in order of first appearance.
        The list is also printed before it is returned.
    """
    data = data.lower()
    # Remove html tags
    data = BeautifulSoup(data, "html.parser").get_text()
    # Remove links and emails
    data = re.sub(r'http\S+|www\S+|[\w\.-]+@[\w\.-]+', '', data)
    # Replace anything except word characters and whitespace with a space.
    # Deleting punctuation outright would fuse neighbouring words
    # ("science/nature" -> "sciencenature") before spell-checking.
    data = re.sub(r'[^\w\s]', ' ', data)
    # Remove numbers
    data = re.sub(r'[0-9]', '', data)
    # Remove emojis and non ascii characters
    data = re.sub(r'[^\x00-\x7F]+', '', data)

    # Correct words. Each distinct word is looked up exactly once; words
    # for which the checker returns None are dropped.
    spell = SpellChecker()
    corrections = {}
    corrected_words = []
    for word in data.split():
        if word not in corrections:
            corrections[word] = spell.correction(word)
        fixed = corrections[word]
        if fixed is not None:
            corrected_words.append(fixed)
    data = ' '.join(corrected_words)

    # Remove stop words (the, and, ....)
    stop_words = set(stopwords.words("english"))
    data = ' '.join([word for word in data.split() if word not in stop_words])

    # Load the spaCy English model and lemmatize the remaining words
    nlp = spacy.load('en_core_web_sm')
    lemmatized_tokens = [token.lemma_ for token in nlp(data)]
    # Re-join and re-split so a lemma containing whitespace becomes
    # separate tokens
    lemmatized_text = ' '.join(lemmatized_tokens)

    # De-duplicate while keeping first-appearance order. dict.fromkeys is
    # used instead of set() so the result is identical on every run.
    data = list(dict.fromkeys(lemmatized_text.split()))
    print(data)
    return data
# Run the cleaning pipeline on the scraped page text.
# NOTE(review): this rebinds `text` from the page string to the list returned
# by cleaning(), so running this line a second time without rebuilding `text`
# passes a list into cleaning().
text = cleaning(text)

['prof', 'grandparent', 'family', 'change', 'iraq', 'wrinkle', 'man', 'baby', 'mar', 'bola', 'blood', 'disappear', 'choose', 'air', 'stub', 'unlikely', 'campaign', 'suggest', 'confirm', 'foot', 'say', 'next', 'become', 'within', 'sport', 'generation', 'cancer', 'launch', 'polio', 'back', 'fridge', 'news', 'big', 'professor', 'privacy', 'top', 'internet', 'rope', 'proportion', 'university', 'alert', 'vegetable', 'not', 'last', 'study', 'order', 'teach', 'attractive', 'middle', 'recessive', 'unless', 'spot', 'partner', 'site', 'america', 'dye', 'ward', 'dumb', 'sperm', 'fear', 'section', 'tuna', 'profile', 'two', 'front', 'believe', 'risk', 'business', 'story', 'get', 'aid', 'friend', 'stroke', 'scientist', 'entertainment', 'call', 'daily', 'diabetic', 'expert', 'program', 'year', 'congo', 'show', 'help', 'hair', 'country', 'child', 'die', 'reason', 'century', 'east', 'finland', 'label', 'new', 'wriggle', 'search', 'disadvantage', 'also', 'platinum', 'researcher', 'edition', 'talk', 'afr

In [None]:
# Show every token that is at most three characters long, one per line
for token in text:
    if len(token) > 3:
        continue
    print(token)

man
mar
air
say
big
top
not
dye
two
get
aid
die
new
may
see
