# Ahmed Mohamed Ahemd 20200036
# Mohamed Abd ElGhaffar  20200460
# Mohamed Essam Galal     20200465

#

## Importing Libraries

In [136]:
import numpy as np
from nltk.tokenize import  word_tokenize
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import brown
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline

In [137]:
# Download the NLTK data packages this notebook relies on:
#   - 'punkt'     : presumably the model behind nltk.word_tokenize (confirm for
#                   the installed NLTK version)
#   - 'stopwords' : the English stop-word list read in preprocessing()
#   - 'wordnet'   : presumably the database behind WordNetLemmatizer
#   - 'brown'     : only referenced by commented-out code in the visible cells
nltk.download('brown')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Tokenization | Stemming | Lemmatization

In [138]:
def tokenize(preprocessed_words):
    """Split a text string into word tokens using NLTK's word tokenizer.

    Args:
        preprocessed_words: The text to tokenize, as a single string.

    Returns:
        Whatever ``nltk.word_tokenize`` returns for that string.
    """
    tokens = nltk.word_tokenize(preprocessed_words)
    return tokens

# Built once and shared: stemming() is called per token, so constructing a
# new PorterStemmer on every call is wasted work.
_PORTER_STEMMER = PorterStemmer()


def stemming(word):
    """Return the Porter stem of a single word.

    Args:
        word: The token to stem.

    Returns:
        The result of ``PorterStemmer.stem`` for ``word``.
    """
    return _PORTER_STEMMER.stem(word)

# Built once and shared: lemmatizing() is called per token, so there is no
# reason to construct a new WordNetLemmatizer on every call.
_WORDNET_LEMMATIZER = WordNetLemmatizer()


def lemmatizing(word):
    """Return the WordNet lemma of a single word.

    Args:
        word: The token to lemmatize.

    Returns:
        The result of ``WordNetLemmatizer.lemmatize`` for ``word`` with its
        default part-of-speech setting.
    """
    return _WORDNET_LEMMATIZER.lemmatize(word)


## Preprocessing

In [139]:
def preprocessing(text):
    """Clean a raw text and return its lemmatized, stop-word-free tokens.

    Steps, in order:
      1. collapse every run of whitespace into a single space;
      2. delete every character that is not an ASCII letter, digit or
         whitespace;
      3. lower-case the text;
      4. tokenize it;
      5. drop English stop words and lemmatize what remains.

    Args:
        text: The raw document as a string.

    Returns:
        A list of lemmatized tokens, in their original order.
    """
    # Collapse (not remove) whitespace so words stay separated.
    cleaned = re.sub(r'\s+', ' ', text)
    # Keep only letters, digits and whitespace.
    # NOTE: punctuation is stripped before stop-word filtering, so
    # contractions lose their apostrophe ("they're" -> "theyre") and can
    # survive the filter below.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', cleaned)
    tokens = tokenize(cleaned.lower())

    # A set gives O(1) membership tests instead of scanning a list per token.
    stop_words = set(stopwords.words('english'))

    # Only the lemmatized tokens are returned, so the stemmed variant that was
    # previously computed and discarded is no longer built.
    return [lemmatizing(word) for word in tokens if word not in stop_words]


## Getting Unique Words

In [140]:
def unique_words(stem_words=None, lemmatize_words=None):
    """Return the distinct lemmatized and stemmed words.

    Args:
        stem_words: Iterable of stemmed tokens.
        lemmatize_words: Iterable of lemmatized tokens.

    Returns:
        A ``(lemmatized_set, stemmed_set)`` tuple. Note that the order is the
        reverse of the parameter order. If either argument cannot be turned
        into a set (for example it was left as ``None``), the error is printed
        and ``None`` is returned.
    """
    try:
        return set(lemmatize_words), set(stem_words)
    except TypeError as e:
        # set() raises TypeError for a non-iterable argument (such as the None
        # defaults) or for unhashable items; any other exception is unexpected
        # and is allowed to propagate instead of being hidden.
        print("Error:", e)
        return None

## Getting Categories "Fields"

In [141]:
# def select_random_categories(phrases):
#     brown_categories = [
#         'news',
#         'editorial',
#         'reviews',
#         'religion',
#         'hobbies',
#         'lore',
#         'belles_lettres',
#         'government',
#         'learned',
#         'fiction',
#         'mystery',
#         'science_fiction',
#         'adventure',
#         'romance',
#         'humor'
#     ]

#     num_categories = min(phrases, len(brown_categories))

#     random.shuffle(brown_categories)

#     return brown_categories[:num_categories]

## Generate Documents

In [142]:
# Text-generation pipeline backed by the 'gpt2' model; created once and reused.
generator = pipeline('text-generation', model='gpt2')
# Default prompts: one seed word per field, so each document covers a
# different topic.
categories = ["news","editorial","reviews","religion","hobbies"]


def generate_documents(prompts=None, max_length=100):
    """Generate one text document per prompt with the module-level generator.

    Args:
        prompts: Iterable of prompt strings. Defaults to the module-level
            ``categories`` list.
        max_length: Value forwarded to the pipeline as ``max_length`` for
            every prompt. Defaults to 100.

    Returns:
        A list with the ``'generated_text'`` of the first result the pipeline
        returns for each prompt, in prompt order.
    """
    if prompts is None:
        prompts = categories
    return [
        generator(prompt, max_length=max_length)[0]['generated_text']
        for prompt in prompts
    ]

# def generate_documents(phrases):
#     categories = select_random_categories(phrases)
#     documents = []
#     for category in categories:
#         brown_corpus = brown.sents(categories=[category])
#         documents.append(' '.join(brown_corpus[1]))

#     return documents

## tfidf_vectorizer

In [143]:
# Reference TF-IDF implementation from scikit-learn. Tokenization is delegated
# to preprocessing(), so the hand-written TF-IDF below sees the same tokens.
# token_pattern=None states explicitly that the default regex tokenizer is not
# used, which avoids scikit-learn's "token_pattern will not be used" warning.
tfidf_vectorizer = TfidfVectorizer(tokenizer=preprocessing, token_pattern=None)

## Output

In [144]:
# Generate the corpus: one text per entry in `categories`.
documents = generate_documents()
# Bare expression so the notebook displays the generated texts.
documents

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


["news are a big part of the public perception of what's being done with the NSA. When they're not trying to protect your rights, they're trying to intimidate you to do the right thing. That's how they use you to get what they want.\n\nAnd they're doing it to have you think a bigger deal about what's happening with the NSA, in every other area of public policy. If you ask me, how do you have Congress, the president, on your side,",
 'editorial that had an interest in sex in the 1980s.\n\nKonstantin is now a consultant in the department of political science at the University of Stirling. "I really had an interest in sex when I was in Stirling when I did a book on politics, I went to see it and I was amazed by how little interest that it actually drew," he said.\n\nThe former Liberal MP described it as an important read, even if the book has its downsides',
 'reviews\n\nThis was my first foray into the post-modern world of writing, being an avid fan since I was a little kid. I have come 

In [145]:
# Learn the vocabulary from the generated documents and compute their
# scikit-learn TF-IDF weights (one row per document, one column per term).
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
# Bare expression so the notebook displays the matrix summary.
tfidf_matrix

<5x154 sparse matrix of type '<class 'numpy.float64'>'
	with 165 stored elements in Compressed Sparse Row format>

In [146]:
# Token list (cleaned, stop-word-free, lemmatized) for every generated document.
process_docs = [preprocessing(raw_text) for raw_text in documents]

# Corpus consumed by the hand-written TF-IDF: one token list per document.
allDocuments = list(process_docs)
N_documents = len(allDocuments)

# Vocabulary: every distinct token that occurs anywhere in the corpus.
word_set = {token for token_list in process_docs for token in token_list}

# Column position of each vocabulary word inside a TF-IDF vector. Positions
# follow the iteration order of word_set, so enumerate(word_set) elsewhere
# yields the same numbering as long as the set is not modified.
index_word = {token: position for position, token in enumerate(word_set)}

In [147]:
def TF(document, word):
    """Return the term frequency of ``word`` in ``document``.

    Args:
        document: Sequence of tokens.
        word: The token whose relative frequency is wanted.

    Returns:
        Occurrences of ``word`` divided by the number of tokens in
        ``document``, or ``0.0`` for an empty document.
    """
    # An empty document has no terms; avoid dividing by zero.
    if not document:
        return 0.0
    return document.count(word) / len(document)

In [148]:
def IDF(word, allDocuments):
    """Return the smoothed inverse document frequency of ``word``.

    Computes ``ln((N + 1) / (df + 1)) + 1``, where ``N`` is the number of
    documents in ``allDocuments`` and ``df`` is how many of them contain
    ``word``.

    Args:
        word: The token to score.
        allDocuments: Collection of documents, each supporting ``word in doc``.

    Returns:
        The IDF value as a float.
    """
    # Use the size of the corpus that was passed in, not a module-level
    # counter, so the result is always consistent with ``allDocuments``.
    total_documents = len(allDocuments)
    documents_with_word = sum(1 for doc in allDocuments if word in doc)
    return np.log((total_documents + 1) / (documents_with_word + 1)) + 1

In [149]:
def TF_IDF(document):
    """Build the un-normalized TF-IDF vector of one tokenized document.

    Relies on the module-level ``word_set`` (vocabulary), ``index_word``
    (word -> vector position) and ``allDocuments`` (corpus used for IDF).

    Args:
        document: Sequence of tokens belonging to the corpus vocabulary.

    Returns:
        A 1-D NumPy array of length ``len(word_set)`` holding ``tf * idf`` at
        the position of every word of ``document`` and zero elsewhere.

    Raises:
        KeyError: If ``document`` contains a word missing from ``index_word``.
    """
    TF_IDF_Vec = np.zeros((len(word_set),))
    # Score each distinct word once: repeated occurrences would only recompute
    # and overwrite the same value.
    for word in set(document):
        tf = TF(document, word)
        idf = IDF(word, allDocuments)
        TF_IDF_Vec[index_word[word]] = tf * idf

    return TF_IDF_Vec

In [150]:
# Un-normalized TF-IDF vector for every document of the corpus, in corpus order.
vectors = [TF_IDF(token_list) for token_list in allDocuments]

In [151]:
# Normalization of TF-IDF vectors: scale every row to unit Euclidean length.
row_norms = np.linalg.norm(vectors, axis=1, keepdims=True)
# A document with no terms has a zero norm; divide by 1 instead so its row
# stays all zeros rather than turning into NaN.
row_norms[row_norms == 0] = 1.0
TF_IDF_Vec_Normalize = np.array(vectors) / row_norms

print("\nTF-IDF from Sklearn:")
# The vocabulary is the same for every document, so fetch it once. It is kept
# under its own name so the unique_words() function is not shadowed.
feature_names = tfidf_vectorizer.get_feature_names_out()
for i, _ in enumerate(documents):
    print(f"Document {i+1}:")
    for j, word in enumerate(feature_names):
        tfidf = tfidf_matrix[i, j]
        if tfidf > 0:
            print(f"      {word}: {tfidf:.4f}")
    print("")

print("TF-IDF from Scratch:")
for i, _ in enumerate(allDocuments):
    print(f"Document {i+1}:")
    # Locate each word's column through index_word rather than relying on the
    # iteration order of the set matching the order used to build the vectors.
    arr = [
        [feature, TF_IDF_Vec_Normalize[i][index_word[feature]]]
        for feature in word_set
        if vectors[i][index_word[feature]] > 0.0
    ]
    # Alphabetical order, to line up with the scikit-learn listing above.
    arr.sort(key=lambda x: x[0])
    for item in arr:
        print(f"      {item[0]}: {item[1]:.4f}")
    print("")


TF-IDF from Sklearn:
Document 1:
      area: 0.1396
      ask: 0.1396
      big: 0.1396
      bigger: 0.1396
      congress: 0.1396
      deal: 0.1396
      done: 0.1396
      every: 0.1396
      get: 0.1396
      happening: 0.1396
      intimidate: 0.1396
      news: 0.1396
      nsa: 0.2792
      part: 0.1126
      perception: 0.1396
      policy: 0.1396
      president: 0.1396
      protect: 0.1396
      public: 0.2792
      right: 0.2792
      side: 0.1396
      thats: 0.1396
      theyre: 0.4188
      thing: 0.1126
      think: 0.1396
      trying: 0.2792
      use: 0.1396
      want: 0.1396
      whats: 0.2792

Document 2:
      1980s: 0.1509
      actually: 0.1509
      amazed: 0.1509
      book: 0.2435
      consultant: 0.1509
      department: 0.1509
      described: 0.1509
      downside: 0.1509
      drew: 0.1509
      editorial: 0.1509
      even: 0.1509
      former: 0.1509
      important: 0.1509
      interest: 0.4528
      konstantin: 0.1509
      liberal: 0.1509
     