# Ahmed Mohamed Ahemd 20200036 
# Mohamed Abd ElGhaffar  20200460
# Mohamed Essam Galal     20200465

# 

## Importing Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import brown
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import random
from sklearn.feature_extraction.text import TfidfVectorizer

## Tokenization | Stemming | Lemmatization

In [2]:
def tokenize(preprocessed_words):
    """Split a text string into tokens with NLTK's ``word_tokenize``.

    Args:
        preprocessed_words: The text to split; it is handed to
            ``nltk.word_tokenize`` unchanged.

    Returns:
        The token sequence produced by ``nltk.word_tokenize``.
    """
    token_list = nltk.word_tokenize(preprocessed_words)
    return token_list

def stemming(word):
    """Return the Porter stem of ``word``.

    A fresh ``PorterStemmer`` is created on every call.
    """
    return PorterStemmer().stem(word)

def lemmatizing(word):
    """Return the WordNet lemma of ``word``.

    A fresh ``WordNetLemmatizer`` is created on every call and
    ``lemmatize`` is invoked with its default arguments.
    """
    return WordNetLemmatizer().lemmatize(word)


## Preprocessing

In [3]:
def preprocessing(text):
    """Clean ``text`` and return its lemmatized, stop-word-free tokens.

    Steps, in order: collapse whitespace runs to a single space, delete
    every character that is not an ASCII letter, digit or whitespace,
    lowercase the result, tokenize it with ``tokenize``, drop NLTK English
    stop words, and lemmatize what remains with ``lemmatizing``.

    Args:
        text: Raw input string.

    Returns:
        list: Lemmatized tokens in their original order.
    """
    # Collapse each run of whitespace into one space (whitespace is kept,
    # not removed).
    collapsed = re.sub(r'\s+', ' ', text)
    # Delete -- rather than space-replace -- everything except letters,
    # digits and whitespace, so hyphenated words fuse ("term-end" ->
    # "termend").
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', collapsed)
    normalized = cleaned.lower()
    tokens = tokenize(normalized)

    # A set makes each stop-word membership test O(1) instead of a list scan.
    stop_words = set(stopwords.words('english'))

    # NOTE(review): apostrophes are stripped before this filter, so a
    # contraction such as "don't" is compared as "dont" -- confirm the
    # stop-word list covers the forms that matter here.
    return [lemmatizing(word) for word in tokens if word not in stop_words]
    

## Getting Unique Words

In [4]:
def unique_words(stem_words=None, lemmatize_words=None):
    """Return the distinct lemmatized and stemmed words.

    Args:
        stem_words: Iterable of stemmed words. ``None`` (the default) is
            treated as an empty collection.
        lemmatize_words: Iterable of lemmatized words. ``None`` (the
            default) is treated as an empty collection.

    Returns:
        tuple[set, set] | None: ``(unique lemmatized words, unique stemmed
        words)``. Note the lemmatized set comes first, the reverse of the
        parameter order. ``None`` is returned, after printing the error,
        when an argument cannot be converted to a set (not iterable, or
        containing unhashable items).
    """
    try:
        # Without the None guard, set(None) raises and the defaults could
        # never produce a result.
        unique_lemmas = set() if lemmatize_words is None else set(lemmatize_words)
        unique_stems = set() if stem_words is None else set(stem_words)
    except TypeError as e:
        print("Error:", e)
        return None
    return unique_lemmas, unique_stems

## Getting Categories "Fields"

In [5]:
def select_random_categories(phrases):
    """Pick up to ``phrases`` distinct Brown corpus categories at random.

    Args:
        phrases: Number of categories wanted. Values above the number of
            known categories are capped; zero or negative values yield an
            empty list.

    Returns:
        list[str]: Randomly ordered, non-repeating category names.
    """
    brown_categories = [
        'news',
        'editorial',
        'reviews',
        'religion',
        'hobbies',
        'lore',
        'belles_lettres',
        'government',
        'learned',
        'fiction',
        'mystery',
        'science_fiction',
        'adventure',
        'romance',
        'humor',
    ]

    # Clamp to [0, len(brown_categories)]: a negative count would otherwise
    # become a negative slice bound and return almost the whole list.
    num_categories = max(0, min(phrases, len(brown_categories)))

    random.shuffle(brown_categories)

    return brown_categories[:num_categories]

## Generate Documents

In [6]:
def generate_documents(phrases):
    """Build one document per randomly selected Brown category.

    Args:
        phrases: Number of categories requested; forwarded to
            ``select_random_categories``.

    Returns:
        list[str]: For each selected category, the tokens of the sentence
        at index 1 of ``brown.sents`` for that category, joined with
        single spaces.
    """
    return [
        ' '.join(brown.sents(categories=[category_name])[1])
        for category_name in select_random_categories(phrases)
    ]

## tfidf_vectorizer

In [7]:
# Use the custom `preprocessing` function as the tokenizer. scikit-learn
# ignores `token_pattern` whenever a tokenizer is supplied, so it is set to
# None explicitly to avoid the "token_pattern will not be used" warning.
tfidf_vectorizer = TfidfVectorizer(tokenizer=preprocessing, token_pattern=None)

## Output

In [8]:
# Build 5 documents, one per randomly chosen Brown category. The category
# list is shuffled on each call, so the documents differ between runs.
documents = generate_documents(5)
documents  # notebook display of the generated list

["The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the City of Atlanta '' for the manner in which the election was conducted .",
 "Just about the most enthralling real-life example of meeting cute is the Charles MacArthur-Helen Hayes saga : reputedly all he did was give her a handful of peanuts , but he said simultaneously , `` I wish they were emeralds '' .",
 'An interne , a nurse and two attendants were in charge of us .',
 'Scotty did not go back to school .',
 'It develops and analyzes the national income , balance of international payments , and many other business indicators .']

In [10]:
# Fit the vectorizer on the documents and compute their TF-IDF weights as a
# sparse matrix (one row per document, one column per vocabulary term).
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
tfidf_matrix  # notebook display of the matrix summary

<5x51 sparse matrix of type '<class 'numpy.float64'>'
	with 53 stored elements in Compressed Sparse Row format>

In [13]:
# Print, for each document, every vocabulary term with a non-zero TF-IDF
# weight. The feature names are identical for all rows, so they are fetched
# once; they are bound to `feature_names` rather than `unique_words` so the
# `unique_words()` function defined earlier in this file is not overwritten.
feature_names = tfidf_vectorizer.get_feature_names_out()
for doc_index, _ in enumerate(documents):
    print(f"Document {doc_index + 1}:")
    # Densify one row at a time so each weight is read from a plain array
    # instead of indexing the sparse matrix element by element.
    row_weights = tfidf_matrix[doc_index].toarray().ravel()
    for word, tfidf in zip(feature_names, row_weights):
        if tfidf > 0:
            print(f"      {word}: {tfidf:.4f}")
    print("")

Document 1:
      atlanta: 0.2167
      charge: 0.1748
      city: 0.4333
      committee: 0.2167
      conducted: 0.2167
      deserves: 0.2167
      election: 0.4333
      executive: 0.2167
      jury: 0.2167
      manner: 0.2167
      overall: 0.2167
      praise: 0.2167
      presentment: 0.2167
      said: 0.1748
      termend: 0.2167
      thanks: 0.2167

Document 2:
      charles: 0.2451
      cute: 0.2451
      emerald: 0.2451
      enthralling: 0.2451
      example: 0.2451
      give: 0.2451
      handful: 0.2451
      hayes: 0.2451
      macarthurhelen: 0.2451
      meeting: 0.2451
      peanut: 0.2451
      reallife: 0.2451
      reputedly: 0.2451
      saga: 0.2451
      said: 0.1977
      simultaneously: 0.2451
      wish: 0.2451

Document 3:
      attendant: 0.4207
      charge: 0.3394
      interne: 0.4207
      nurse: 0.4207
      two: 0.4207
      u: 0.4207

Document 4:
      back: 0.5000
      go: 0.5000
      school: 0.5000
      scotty: 0.5000

Document 5:
      ana