In [None]:
import urllib.request
import re
import nltk
import math
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from collections import defaultdict
from nltk.stem import PorterStemmer
from gensim.summarization import summarize as gensim_summarize

def scrape_webpage(url):
    """Download a web page and return the text of all its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.

    Returns:
        A single string made of every paragraph's text, concatenated in
        document order with no separator added between paragraphs.

    Any error raised by ``urllib.request.urlopen`` propagates to the caller.
    """
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(url) as response:
        raw_html = response.read()

    soup = BeautifulSoup(raw_html, 'lxml')
    # join() builds the result in one pass instead of repeated string +=.
    return "".join(paragraph.text for paragraph in soup.find_all('p'))

def preprocess_text(text):
    """Return ``text`` with every run of digits removed.

    Args:
        text: Input string.

    Returns:
        The string with all matches of ``\\d+`` deleted; everything else,
        including surrounding whitespace, is left as is.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def tokenize_text(text):
    """Split ``text`` into word tokens using NLTK's ``word_tokenize``.

    Args:
        text: Input string.

    Returns:
        Whatever ``word_tokenize`` returns for ``text`` (a list of tokens).
    """
    return word_tokenize(text)

def remove_punctuation(tokens):
    """Keep tokens that begin with a word character and lowercase them.

    A token is kept when the pattern ``\\w+`` matches at its start, so a
    token such as ``a-b`` survives while ``,`` or ``'s`` is dropped.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list with the surviving tokens, lowercased, in input order.
    """
    starts_with_word_char = re.compile(r'\w+').match
    kept = []
    for token in tokens:
        if starts_with_word_char(token):
            kept.append(token.lower())
    return kept

def remove_stopwords(tokens):
    """Drop every token that appears in NLTK's English stop-word list.

    The membership test is an exact string comparison, so tokens are
    matched as given (no case folding is done here).

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list containing the tokens that are not stop words, in
        input order.
    """
    english_stopwords = frozenset(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token in english_stopwords:
            continue
        kept.append(token)
    return kept

def lemmatize_tokens(tokens):
    """Lemmatize tokens with WordNet, guided by their part-of-speech tags.

    Each token is tagged with ``pos_tag``; the first letter of the tag picks
    the WordNet part of speech: ``J`` -> adjective, ``V`` -> verb,
    ``R`` -> adverb, anything else -> noun.

    Args:
        tokens: List of token strings.

    Returns:
        A list with one lemma per input token, in input order.
    """
    # Unknown tag initials fall back to NOUN via the default factory.
    pos_by_initial = defaultdict(
        lambda: wn.NOUN,
        {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV},
    )
    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for word, treebank_tag in pos_tag(tokens):
        wordnet_pos = pos_by_initial[treebank_tag[0]]
        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
    return lemmas

def calculate_tf(tokens):
    """Compute the relative frequency of each token.

    Args:
        tokens: Sized sequence of hashable tokens.

    Returns:
        A dict mapping each distinct token to ``count / len(tokens)``, keyed
        in order of first occurrence. An empty input yields an empty dict.
    """
    total_words = len(tokens)

    # Count with integers and divide once per word: summing 1/total_words
    # repeatedly accumulates floating-point rounding error.
    counts = {}
    for word in tokens:
        counts[word] = counts.get(word, 0) + 1

    return {word: count / total_words for word, count in counts.items()}

def calculate_idf(docs):
    """Compute a smoothed inverse document frequency for every term.

    The weight is ``log((1 + N) / (1 + df)) + 1`` where ``N`` is the number
    of documents and ``df`` the number of documents containing the term.
    This form is always positive: a term present in every document gets
    weight 1.0 instead of a negative value, so multiplying by a term
    frequency never inverts the ranking.

    Args:
        docs: Sized sequence of documents, each an iterable of hashable
            tokens.

    Returns:
        A dict mapping each distinct term to its IDF weight. An empty
        ``docs`` yields an empty dict.
    """
    total_docs = len(docs)

    # Document frequency: each document counts a term at most once.
    doc_freq = {}
    for doc in docs:
        for word in set(doc):
            doc_freq[word] = doc_freq.get(word, 0) + 1

    return {
        word: math.log((1 + total_docs) / (1 + freq)) + 1
        for word, freq in doc_freq.items()
    }

def calculate_tfidf(tf, idf):
    """Combine term frequencies with IDF weights.

    Args:
        tf: Mapping of term -> term frequency.
        idf: Mapping of term -> IDF weight.

    Returns:
        A dict with the same keys as ``tf``; each value is the term's
        frequency multiplied by its IDF weight, using 0 as the weight for
        terms missing from ``idf``.
    """
    return {term: frequency * idf.get(term, 0) for term, frequency in tf.items()}

def form_summary(sentences, tfidf_scores, threshold):
    """Concatenate the sentences whose score reaches ``threshold``.

    Args:
        sentences: Sequence of sentence strings.
        tfidf_scores: Sequence of scores, indexed in parallel with
            ``sentences``.
        threshold: Minimum score (inclusive) for a sentence to be kept.

    Returns:
        The kept sentences in original order, each preceded by a single
        space (so a non-empty result starts with a space); an empty string
        when nothing qualifies.
    """
    selected = [
        " " + sentence_text
        for position, sentence_text in enumerate(sentences)
        if tfidf_scores[position] >= threshold
    ]
    return ''.join(selected)

def run_summarization(text):
    """Print a TF-IDF based summary and a gensim summary of ``text``.

    Steps:
      1. Split the text into sentences and reduce each one to lemmatized,
         stop-word-free, lowercased tokens.
      2. Compute term frequency over all tokens and IDF with every sentence
         treated as one document, then score each sentence as the sum of
         its tokens' TF-IDF weights.
      3. Print the highest-scoring sentences (at most 5), the sentences
         scoring at or above the mean, and gensim's summary.

    Args:
        text: The document to summarize.

    Returns:
        None. All results are printed.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        # Without this guard the mean-score threshold divides by zero.
        print("No sentences found; nothing to summarize.")
        return

    # Run the token pipeline once per sentence and reuse the result for
    # both the statistics and the scoring, so the TF-IDF keys and the
    # per-sentence lookups are guaranteed to agree.
    sentence_tokens = [
        lemmatize_tokens(remove_stopwords(remove_punctuation(tokenize_text(sentence))))
        for sentence in sentences
    ]
    all_tokens = [token for tokens in sentence_tokens for token in tokens]

    # TF-IDF Calculation. IDF needs more than one document to carry any
    # information, so each sentence is used as a document.
    tf = calculate_tf(all_tokens)
    idf = calculate_idf(sentence_tokens)
    tfidf = calculate_tfidf(tf, idf)

    # Rank sentences based on TF-IDF scores
    tfidf_scores = [
        sum(tfidf.get(token, 0) for token in tokens)
        for tokens in sentence_tokens
    ]

    # Print the k best-scoring sentences (ties keep document order because
    # sorted() is stable).
    k = min(5, len(sentences))  # Adjust the number of sentences in the summary
    ranked_indices = sorted(
        range(len(sentences)),
        key=lambda index: tfidf_scores[index],
        reverse=True,
    )
    print("Top {} sentences using TF-IDF:".format(k))
    for index in ranked_indices[:k]:
        print(sentences[index])

    # Average score for threshold
    threshold = sum(tfidf_scores) / len(tfidf_scores)

    # Form summary based on threshold using TF-IDF
    summary_tfidf = form_summary(sentences, tfidf_scores, threshold)
    print("\nSummary using TF-IDF:")
    print(summary_tfidf)

    # Gensim Summarization
    summary_gensim = gensim_summarize(text, ratio=0.2)  # You can adjust the ratio
    print("\nSummary using Gensim:")
    print(summary_gensim)

# Script entry point: summarize one hard-coded Wikipedia article.
if __name__ == '__main__':
    url = 'https://en.wikipedia.org/wiki/Natural_language_processing'
    # Fetch the page and keep only the text of its <p> elements.
    text = scrape_webpage(url)

    # Strip digit runs from the text; sentence splitting and tokenization
    # happen inside run_summarization, which prints its results.
    processed_text = preprocess_text(text)
    run_summarization(processed_text)


In [None]:
!pip install gensim==3.8.3 --user

In [None]:
pip show gensim