# Ahmed Mohamed Ahemd 20200036 
# Mohamed Abd ElGhaffar  20200460
# Mohamed Essam Galal     20200465

# 

## Importing Libraries

In [2]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

## Reading Text from URL

In [1]:
def read_text_from_url(url):
    try:

        response = requests.get(url)
        content = BeautifulSoup(response.content, 'html.parser')
        text = content.get_text()
        
        return text
    
    except Exception as e:
        print("Error:", e)
        return None

## Tokenization | Stemming | Lemmatization

In [3]:
def tokenize(preprocessed_words):
    return nltk.word_tokenize(preprocessed_words)

def stemming(word):
    stemmer = PorterStemmer()
    return stemmer.stem(word)

def lemmatizing(word):
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)


## Preprocessing

In [4]:
def preprocessing(text):
    preprocessed_text = re.sub(r'\s+', ' ', text)                            # Remove All WhiteSpaces
    preprocessed_text = re.sub(r'[^a-zA-Z0-9\s]', '',preprocessed_text)      # Remove All Charchters Excepts Letters and Numbers
    norm_preprocessed_text = preprocessed_text.lower()                       # Lower All Words "Normalization"
    tokens = tokenize(norm_preprocessed_text)                                # Tokenizzing
    
    stop_words = stopwords.words('english')                                  # Stop-Words to be Removed
    
    stem_words = [stemming(word) for word in tokens if word not in stop_words] # Stemming
    
    lemmatize_words = [lemmatizing(word) for word in tokens if word not in stop_words] # Lemmatizing
    
    return norm_preprocessed_text, tokens, stem_words, lemmatize_words
    

## Getting Unique Words

In [5]:
def unique_words(stem_words = None, lemmatize_words = None):
    try:
        return set(lemmatize_words), set(stem_words)
    except Exception as e:
        print("Error:", e)
        return None
    

## Testing

In [6]:
url = 'https://www.wwe.com/'
html_text = read_text_from_url(url)

In [7]:
norm_preprocessed_text, tokens, stem_words, lemmatize_words = preprocessing(html_text)

In [8]:
stem_unique_words, lemmatize_unique_words = unique_words(stem_words, lemmatize_words)

## Output

In [9]:
html_text

"\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nWWE News, Results, Photos & Video - Official Site | WWE\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \nSkip to main content\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\nWWE Network\n\n\nShows\n\n\nSuperstars\n\n\nTickets\n\n\nShop\n\n\n\n\n\nWatch every Premium Live Event and get unlimited access to WWE's premium content - available to you anywhere, anytime, on any device.\nGo to WWE Network\n\n\n\n\n\nSign up for Peacock to watch. Plus, get every WWE Premium Live Event, your favorite shows, new movies, live sports, and more.\nGo to Peacock\n\n\n\n\n\nVisit Sony LIV to sign in or sign up and enjoy WWE's premium content.\nGo to Sony LIV\n\n\n\n\n\nWatch every Premium Live Event and enjoy some of your favorite WWE content on Shahid.\nGo to Shahid\n\n\n\n\n\nWatch every Premium Live Event and enjoy some of your favorite WWE content on Disney + HotStar.\nGo to Disney + Hotstar\n\n

In [10]:
norm_preprocessed_text

' wwe news results photos  video  official site  wwe skip to main content wwe network shows superstars tickets shop watch every premium live event and get unlimited access to wwes premium content  available to you anywhere anytime on any device go to wwe network sign up for peacock to watch plus get every wwe premium live event your favorite shows new movies live sports and more go to peacock visit sony liv to sign in or sign up and enjoy wwes premium content go to sony liv watch every premium live event and enjoy some of your favorite wwe content on shahid go to shahid watch every premium live event and enjoy some of your favorite wwe content on disney  hotstar go to disney  hotstar watch wrestlemania and other wwe premium live events on flow go to flow sign up for binge to watch plus get every wwe premium live event and the worlds best tv and movies get binge nnow wwe network shows superstars tickets shop watch every premium live event and get unlimited access to wwes premium content

In [11]:
tokens

['wwe',
 'news',
 'results',
 'photos',
 'video',
 'official',
 'site',
 'wwe',
 'skip',
 'to',
 'main',
 'content',
 'wwe',
 'network',
 'shows',
 'superstars',
 'tickets',
 'shop',
 'watch',
 'every',
 'premium',
 'live',
 'event',
 'and',
 'get',
 'unlimited',
 'access',
 'to',
 'wwes',
 'premium',
 'content',
 'available',
 'to',
 'you',
 'anywhere',
 'anytime',
 'on',
 'any',
 'device',
 'go',
 'to',
 'wwe',
 'network',
 'sign',
 'up',
 'for',
 'peacock',
 'to',
 'watch',
 'plus',
 'get',
 'every',
 'wwe',
 'premium',
 'live',
 'event',
 'your',
 'favorite',
 'shows',
 'new',
 'movies',
 'live',
 'sports',
 'and',
 'more',
 'go',
 'to',
 'peacock',
 'visit',
 'sony',
 'liv',
 'to',
 'sign',
 'in',
 'or',
 'sign',
 'up',
 'and',
 'enjoy',
 'wwes',
 'premium',
 'content',
 'go',
 'to',
 'sony',
 'liv',
 'watch',
 'every',
 'premium',
 'live',
 'event',
 'and',
 'enjoy',
 'some',
 'of',
 'your',
 'favorite',
 'wwe',
 'content',
 'on',
 'shahid',
 'go',
 'to',
 'shahid',
 'watch',
 'e

In [12]:
stem_words

['wwe',
 'news',
 'result',
 'photo',
 'video',
 'offici',
 'site',
 'wwe',
 'skip',
 'main',
 'content',
 'wwe',
 'network',
 'show',
 'superstar',
 'ticket',
 'shop',
 'watch',
 'everi',
 'premium',
 'live',
 'event',
 'get',
 'unlimit',
 'access',
 'wwe',
 'premium',
 'content',
 'avail',
 'anywher',
 'anytim',
 'devic',
 'go',
 'wwe',
 'network',
 'sign',
 'peacock',
 'watch',
 'plu',
 'get',
 'everi',
 'wwe',
 'premium',
 'live',
 'event',
 'favorit',
 'show',
 'new',
 'movi',
 'live',
 'sport',
 'go',
 'peacock',
 'visit',
 'soni',
 'liv',
 'sign',
 'sign',
 'enjoy',
 'wwe',
 'premium',
 'content',
 'go',
 'soni',
 'liv',
 'watch',
 'everi',
 'premium',
 'live',
 'event',
 'enjoy',
 'favorit',
 'wwe',
 'content',
 'shahid',
 'go',
 'shahid',
 'watch',
 'everi',
 'premium',
 'live',
 'event',
 'enjoy',
 'favorit',
 'wwe',
 'content',
 'disney',
 'hotstar',
 'go',
 'disney',
 'hotstar',
 'watch',
 'wrestlemania',
 'wwe',
 'premium',
 'live',
 'event',
 'flow',
 'go',
 'flow',
 'sig

In [13]:
lemmatize_words

['wwe',
 'news',
 'result',
 'photo',
 'video',
 'official',
 'site',
 'wwe',
 'skip',
 'main',
 'content',
 'wwe',
 'network',
 'show',
 'superstar',
 'ticket',
 'shop',
 'watch',
 'every',
 'premium',
 'live',
 'event',
 'get',
 'unlimited',
 'access',
 'wwes',
 'premium',
 'content',
 'available',
 'anywhere',
 'anytime',
 'device',
 'go',
 'wwe',
 'network',
 'sign',
 'peacock',
 'watch',
 'plus',
 'get',
 'every',
 'wwe',
 'premium',
 'live',
 'event',
 'favorite',
 'show',
 'new',
 'movie',
 'live',
 'sport',
 'go',
 'peacock',
 'visit',
 'sony',
 'liv',
 'sign',
 'sign',
 'enjoy',
 'wwes',
 'premium',
 'content',
 'go',
 'sony',
 'liv',
 'watch',
 'every',
 'premium',
 'live',
 'event',
 'enjoy',
 'favorite',
 'wwe',
 'content',
 'shahid',
 'go',
 'shahid',
 'watch',
 'every',
 'premium',
 'live',
 'event',
 'enjoy',
 'favorite',
 'wwe',
 'content',
 'disney',
 'hotstar',
 'go',
 'disney',
 'hotstar',
 'watch',
 'wrestlemania',
 'wwe',
 'premium',
 'live',
 'event',
 'flow',
 'g

In [14]:
stem_unique_words

{'100',
 '2024',
 '312',
 'abducts',
 'access',
 'anytime',
 'anywhere',
 'arabic',
 'available',
 'best',
 'binge',
 'brook',
 'caption',
 'card',
 'career',
 'carmelo',
 'center',
 'champion',
 'class',
 'cleveland',
 'close',
 'closed',
 'community',
 'condition',
 'contact',
 'content',
 'cookie',
 'copyright',
 'corporate',
 'dangelo',
 'decimates',
 'deliver',
 'demand',
 'device',
 'disney',
 'dragunov',
 'email',
 'enjoy',
 'event',
 'every',
 'facebook',
 'fame',
 'faq',
 'favorite',
 'femi',
 'flow',
 'follow',
 'get',
 'gift',
 'global',
 'glory',
 'go',
 'gunther',
 'hall',
 'hayes',
 'help',
 'highlight',
 'host',
 'hotstar',
 'ilja',
 'impact',
 'inducted',
 'info',
 'instagram',
 'intercontinental',
 'jensen',
 'liv',
 'live',
 'look',
 'main',
 'mark',
 'menuhome',
 'million',
 'movie',
 'music',
 'named',
 'network',
 'new',
 'news',
 'nnow',
 'nxt',
 'oba',
 'official',
 'option',
 'pas',
 'patterson',
 'peacock',
 'perez',
 'photo',
 'pinterest',
 'plus',
 'policy',


In [15]:
lemmatize_unique_words

{'100',
 '2024',
 '312',
 'abduct',
 'access',
 'anytim',
 'anywher',
 'arab',
 'avail',
 'best',
 'bing',
 'brook',
 'caption',
 'card',
 'career',
 'carmelo',
 'center',
 'champion',
 'class',
 'cleveland',
 'close',
 'commun',
 'condit',
 'contact',
 'content',
 'cooki',
 'copyright',
 'corpor',
 'dangelo',
 'decim',
 'deliv',
 'demand',
 'devic',
 'disney',
 'dragunov',
 'email',
 'enjoy',
 'event',
 'everi',
 'facebook',
 'fame',
 'faq',
 'favorit',
 'femi',
 'flow',
 'follow',
 'get',
 'gift',
 'global',
 'glori',
 'go',
 'gunther',
 'hall',
 'hay',
 'help',
 'highlight',
 'host',
 'hotstar',
 'ilja',
 'impact',
 'induct',
 'info',
 'instagram',
 'intercontinent',
 'jensen',
 'liv',
 'live',
 'look',
 'main',
 'mark',
 'menuhom',
 'million',
 'movi',
 'music',
 'name',
 'network',
 'new',
 'news',
 'nnow',
 'nxt',
 'oba',
 'offici',
 'option',
 'pass',
 'patterson',
 'peacock',
 'perez',
 'photo',
 'pinterest',
 'plu',
 'polici',
 'prefer',
 'premium',
 'prioriti',
 'privaci',
 '