# Imports

In [1]:
%%capture --no-display
!pip install transformers

In [2]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.stem.isri import ISRIStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import nltk
nltk.download("punkt")
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Doc 1

In [3]:
pipe = pipeline("text-generation","openai-community/gpt2")
query = "Animals are huge and"
answer = pipe(query, max_length = 1024)

In [4]:
content = answer[0]["generated_text"].split(".")

In [5]:
content[:10]

['Animals are huge and sometimes impossible to identify',
 " They're not as hard to recognise as they used to be, because their faces have been painted as big and ugly, to an extent, but there's no sign of them even in the game",
 "\n\nPushing a big animal for control is not a fool's errand",
 ' Sometimes I try to show them by using only their arms and legs',
 ' This is what I think is important: They can do other things, and maybe even walk themselves without going out into the open',
 '\n\n"If an animal is being chased as well and you get caught in a chase I would say no because that gives them a way of hiding',
 " It's almost like someone can pick them up off the grass and run up to you, so they can hide, where they can take you",
 ' If you can tell them they don\'t know what they\'re fighting for and you start running around by them but they don\'t really know what\'s coming they can run away even if they\'ve been pursued already," adds Wojciechowski, director of wildlife and park 

In [6]:
def normalize_english(text):
        text = re.sub(r'[^a-zA-Z\s.]', '', text)
        text = text.replace('\n', ' ')
        return text

def tokenize_english(text):
        tokens = word_tokenize(text)
        return tokens

def stem_english(tokens):
        stemmer = PorterStemmer()
        stems = [stemmer.stem(token) for token in tokens]
        return stems

def remove_stopwords(tokens):
        english_stopwords = set(stopwords.words('english'))
        filtered_tokens = [token for token in tokens if token.lower() not in english_stopwords]
        return filtered_tokens

def process_text(text):

    cleaned_text = normalize_english(text)

    tokens = tokenize_english(cleaned_text)

    filtered_tokens = remove_stopwords(tokens)

    stemmed_tokens = stem_english(filtered_tokens)

    return ' '.join(stemmed_tokens)


def tfidf_vectorizer(data):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(data)
    df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf_vectorizer.get_feature_names_out())
    return df

In [7]:
filtered_tokens = [process_text(i) for i in content]

In [8]:
df = tfidf_vectorizer(filtered_tokens)
print(f"No. of unique words: {df.shape[1]}")
df.head()

No. of unique words: 131


Unnamed: 0,add,aggress,almost,alreadi,also,anim,arm,around,articl,avoid,...,walk,want,way,well,what,wildlif,without,wojciechowski,wont,would
0,0.0,0.0,0.0,0.0,0.0,0.269126,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.241179,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.438028,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.355842,0.0,0.0,0.0,0.0,0.0,0.355842,0.0,0.0,0.0


# Doc 2

In [9]:
query = "Artficial Intelligence will be the future"
answer = pipe(query, max_length = 1024)
content = answer[0]["generated_text"].split(".")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [10]:
content[:10]

['Artficial Intelligence will be the future of information and communications',
 ' The government of the Republic of Korea will continue to provide critical services to the world in our information security system',
 '"\n\nKim said the NSA is the only way to keep information, and to keep the Kim family from divulging information with impunity',
 '\n\n"The DPRK should not turn against North Korea, no matter the cost to themselves," Kim said, according to Reuters',
 "\n\nThe State Department was also seen as the most vocal proponent of North Korea's right to exist as a free and open society, but failed to acknowledge Kim's efforts to change that",
 '\n\n"Today this message was sent from the DPRK: \'If we can trust in ourselves we can trust our partners, too',
 ' The US should never accept any form of \'disclosure,\' and not stop it at the most extreme degree," State spokesperson Victoria Nuland told reporters, during a stop in New York for the UN\'s Interuniversity Association Annual Mee

In [11]:
filtered_tokens = [process_text(i) for i in content]

In [12]:
df = tfidf_vectorizer(filtered_tokens)
print(f"No. of unique words: {df.shape[1]}")
df.head()

No. of unique words: 77


Unnamed: 0,accept,accord,acknowledg,advanc,aggress,also,annual,artfici,associ,chang,...,trust,turn,un,us,victim,victoria,vocal,way,world,york
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.468646,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.31638,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26747,0.0,0.0
3,0.0,0.357616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.357616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.249249,0.0,0.0,0.249249,0.0,0.0,0.0,0.249249,...,0.0,0.0,0.0,0.0,0.0,0.0,0.249249,0.0,0.0,0.0
