In [21]:
from nltk import tokenize
from nltk.tree import Tree
from nltk.tokenize import sent_tokenize
from allennlp.predictors.predictor import Predictor
import re
import heapq
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
# from transformers import AutoTokenizer, AutoModelWithLMHead

import nltk
from fuzzywuzzy import fuzz
import streamlit  as st
import spacy
spacy.load('en_core_web_sm')

predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz")


# Tokenizing sentence using nltk sent_tokenize

def tokenize_sentences_tf(text):
    sentences = sent_tokenize(text)
    sentences = [sentence.strip() for sentence in sentences if len(sentence) > 20]
    return sentences[0]

def get_context_tf(quest,unit,grade,title):
    return {
      "unit": unit,
      "grade" : grade,
      "title" : title,
      "type" : "True Or False",
      "quest": quest,
      "length":len(quest),

  }

# Method returns parts of speech tree for given sentence

def pos_tree_from_sentence(text):
    sentence = tokenize_sentences_tf(text)
    sentence = sentence.rstrip('?:!.,;')
    parser_output = predictor.predict(sentence=sentence)
    tree_string = parser_output["trees"]
    tree = Tree.fromstring(tree_string)
    return tree


# split at right most nounphrase or verbphrase

def get_flattened(t):
    sent_str_final = None
    if t is not None:
        sent_str = [" ".join(x.leaves()) for x in list(t)]
        sent_str_final = [" ".join(sent_str)]
        sent_str_final = sent_str_final[0]
    return sent_str_final


def get_right_most_VP_or_NP(parse_tree,last_NP = None,last_VP = None):
    if len(parse_tree.leaves()) == 1:
        return last_NP,last_VP
    last_subtree = parse_tree[-1]
    if last_subtree.label() == "NP":
        last_NP = last_subtree
    elif last_subtree.label() == "VP":
        last_VP = last_subtree
    
    return get_right_most_VP_or_NP(last_subtree,last_NP,last_VP)

def get_termination_portion(main_string, sub_string):
    combined_sub_string = sub_string.replace(" ", "")
    main_string_list = main_string.split()
    last_index = len(main_string_list)
    for i in range(last_index):
        check_string_list = main_string_list[i:]
        check_string = "".join(check_string_list)
        check_string = check_string.replace(" ", "")
        if check_string == combined_sub_string:
            return " ".join(main_string_list[:i])

    return None

def get_np_vp(tree,sentence):
    last_nounphrase, last_verbphrase =  get_right_most_VP_or_NP(tree)
    last_nounphrase_flattened = get_flattened(last_nounphrase)
    last_verbphrase_flattened = get_flattened(last_verbphrase)
    longest_phrase_to_use = ''
    if last_nounphrase is not None and last_verbphrase is not None:
        longest_phrase_to_use = max(last_nounphrase_flattened, last_verbphrase_flattened)      
    elif last_nounphrase is not None:
        longest_phrase_to_use = last_nounphrase_flattened      
    elif last_verbphrase is not None:
        longest_phrase_to_use = last_verbphrase_flattened        
    else:
        return None

    longest_phrase_to_use = re.sub(r"-LRB- ", "(", longest_phrase_to_use)
    longest_phrase_to_use = re.sub(r" -RRB-", ")", longest_phrase_to_use)
    sentence = sentence.rstrip('?:!.,;')
    split_sentence = get_termination_portion(sentence, longest_phrase_to_use)
    return split_sentence

def summarize_text(article_text):
    # Removing Square Brackets and Extra Spaces
    vAR_article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
    vAR_article_text = re.sub(r'\s+', ' ', vAR_article_text)
    # Removing special characters and digits
    vAR_formatted_article_text = re.sub('[^a-zA-Z]', ' ', vAR_article_text )
    vAR_formatted_article_text = re.sub(r'\s+', ' ', vAR_formatted_article_text)
    # Converting Text To Sentences
    vAR_sentence_list = nltk.sent_tokenize(article_text)
    vAR_stopwords = nltk.corpus.stopwords.words('english')
    # Find Weighted Frequency of Occurrence
    vAR_word_frequencies = {}
    for word in nltk.word_tokenize(vAR_formatted_article_text):
        if word not in vAR_stopwords:
            if word not in vAR_word_frequencies.keys():
                vAR_word_frequencies[word] = 1
            else:
                vAR_word_frequencies[word] += 1
    if len(vAR_word_frequencies) >0:
        maximum_frequncy = max(vAR_word_frequencies.values())
    else:
        maximum_frequncy = 1

    for word in vAR_word_frequencies.keys():
        vAR_word_frequencies[word] = (vAR_word_frequencies[word]/maximum_frequncy)
    # Calculating Sentence Scores
    vAR_sentence_scores = {}
    for sent in vAR_sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in vAR_word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in vAR_sentence_scores.keys():
                        vAR_sentence_scores[sent] = vAR_word_frequencies[word]
                    else:
                        vAR_sentence_scores[sent] += vAR_word_frequencies[word]
    vAR_summary_sentences = heapq.nlargest(15, vAR_sentence_scores, key=vAR_sentence_scores.get)
    vAR_summary = ' '.join(vAR_summary_sentences)
    return vAR_summary

def fuzzy_dup_remove(sentences):
    for i,sentence in enumerate(sentences):
        if i<len(sentences)-1:
            score = fuzz.WRatio(sentences[i],sentences[i+1])
            if score > 90:
                sentences.remove(sentences[i+1])
    return sentences

# @st.cache(show_spinner=False)
def alternate_sentences(sentences):
    from datetime import datetime
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
#    print('Before GPT2Tokenizer -',current_time)
    # GPT2tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # GPT2model = AutoModelWithLMHead.from_pretrained("gpt2",pad_token_id=GPT2tokenizer.eos_token_id)
    GPT2tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    GPT2model = TFGPT2LMHeadModel.from_pretrained("gpt2",pad_token_id=GPT2tokenizer.eos_token_id)
    #Below is for GPU seetings
    # GPT2tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
    # GPT2model = TFGPT2LMHeadModel.from_pretrained("distilgpt2",pad_token_id=GPT2tokenizer.eos_token_id)
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
#    print('After GPT2Tokenizer -',current_time)
    alt_sent_list = []
    generated_sentences=[]
    print(len(sentences))
    for i,sentence in enumerate(sentences):
        if i <10:
            pos = pos_tree_from_sentence(sentence)
            # alt_sentence = alternate_sentences(pos,sentence)
            # alt_sent_list.append(alt_sentence)
            # flat_list = [item for sublist in alt_sent_list for item in sublist]
#            print('sentence - ',sentence)
            partial_sentence = get_np_vp(pos,sentence)
            if  partial_sentence is not None:
#                print('partial_sentence - ',partial_sentence)
                input_ids = GPT2tokenizer.encode(partial_sentence,return_tensors='tf')
                maximum_length = len(partial_sentence.split())+40
                # Activate top_k sampling and top_p sampling with only from 90% most likely words
#                print('type of inputid - ',type(input_ids))
#                print('inputids - ',input_ids)
#                print('pad_token_id - ',GPT2tokenizer.eos_token_id)
                sample_outputs = GPT2model.generate(
                    input_ids=input_ids,
                    do_sample=True,
                    max_length=maximum_length, 
                    top_p=1.0, # 0.85 
                    top_k=30,   #30
                    repetition_penalty  = 1.2,num_return_sequences=1)
#                print('sample - ',sample_outputs)
                sentence = sentence.replace("\n","")
#                print('############## - ',sentence)
                for i, sample_output in enumerate(sample_outputs):
                    decoded_sentence = GPT2tokenizer.decode(sample_output, skip_special_tokens=True)
                    # final_sentence = decoded_sentence
                    final_sentence = tokenize.sent_tokenize(decoded_sentence)[0]
                    final_sentence = final_sentence.replace("\r\n","")
                    final_sentence = final_sentence.replace("\n","")
                    final_sentence = final_sentence.replace("\r","")
                    generated_sentences.append(final_sentence)
                sentence = sentence.replace("\r","")
                generated_sentences.append(sentence)
#    print('$$$$$$$$$$$$$$$$ - ',generated_sentences)
    # generated_sentences = sorted(set(generated_sentences))
    return generated_sentences

2022-02-24 09:35:03.083 Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
2022-02-24 09:35:03.212 Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
2022-02-24 09:35:03.245 Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
2022-02-24 09:35:05.968 instantiating registered subclass relu of <class 'allennlp.nn.activations.Activation'>
2022-02-24 09:35:05.976 instantiating registered subclass relu of <class 'allennlp.nn.activations.Activation'>
2022-02-24 09:35:05.983 instantiating registered subclass relu of <class 'allennlp.nn.activations.Activation'>
2022-02-24 09:35:05.990 instantiating registered subclass relu of <class 'allennlp.nn.activations.Activation'>
2022-02-24 09:35:29.180 loading archive file https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz from cache at C:\Users\ds_008\.allennlp\cache\60c14844468543e4329ce7e8d3

In [33]:
from datetime import datetime
now = datetime.now()
starttime = now.strftime("%Y-%m-%d %H:%M:%S")

import streamlit as st
import streamlit.components as stc
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import pathlib
import base64


import requests
import urllib
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession
#import trafilatura
import altair as alt
from PIL import Image
from pathlib import Path
import time
import io
import os
import docx
from docx import Document
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import re

from docx.shared import Inches, Cm
from docx.shared import RGBColor
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.text import WD_UNDERLINE
from docx.enum.table import WD_ALIGN_VERTICAL
from docx.oxml import OxmlElement, ns
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.enum.text import WD_LINE_SPACING
import nltk
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
from nltk.tokenize import sent_tokenize

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
import mock

from docx.oxml.ns import qn
from docx.oxml import OxmlElement

import unicodedata

import selenium
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

import comtypes.client

from rake_nltk import Metric, Rake
from nltk.corpus import stopwords

from summa import summarizer

import random

#import os
#import easyocr
#import cv2
from matplotlib import pyplot as plt
#import numpy as np

from collections import Iterable
import string
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
#stopwords = set(stopwords.words("english"))

#import gensim
#from gensim.summarization import summarize

from lsa_summarizer import LsaSummarizer
import nltk
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)

from nltk.corpus import stopwords

from tqdm import tqdm

def img_to_bytes(img_path):
    img_bytes = Path(img_path).read_bytes()
    encoded = base64.b64encode(img_bytes).decode()
    return encoded

header_html = "<img src='data:image/png;base64,{}' class='img-fluid'>".format(
    img_to_bytes("C:\\Users\\ds_008\\Downloads\\DSLogo.png")
)
st.markdown(
    header_html, unsafe_allow_html=True,
)


st.title("Content Creation for the Given Topic using **_Web Scraping_** and **_NLP_**")
st.sidebar.title("Content Creation for the Given Topic using Web Scraping and NLP")
#st.markdown("This application is to extract URLs and text content related to the given topic:")
st.markdown("***")
st.sidebar.markdown("This application is to extract URLs and text content for the given topic")

def get_source(url):
    """Return the source code for the provided URL. 

    Args: 
        url (string): URL of the page to scrape.

    Returns:
        response (object): HTTP response object from requests_html. 
    """

    try:
        session = HTMLSession()
        response = session.get(url)
        return response

    except requests.exceptions.RequestException as e:
        print(e)
        

url_filter = ["youtube", "pdf", "pptx", "docx", "ashx", "quora", "stackoverflow", "facebook", "stackexchange", "researchgate",
             "www.researchgate.net", "https://arxiv", "arxiv.org", "https://wasp-sweden", 
              "wasp-sweden.org", "indiascienceandtechnology", "www.indiascienceandtechnology.gov.in", "https://u2b", 
              "u2b.com", "https://cis-india", "cis-india.org",
             "shapingtomorrow", "www.shapingtomorrow.com", "frontiersin", "www.frontiersin.org", "sciedupress", "www.sciedupress.com", "http://proceedings", "proceedings.mlr.press", "aurecongroup", "www.aurecongroup.com",
             "clootrack.com", "https://clootrack", "offshore-technology", "birlasoft", "www.birlasoft.com", "informatec",
             "rm.coe.int", "www.ifc.org", "readcube", "consumersinternational", "www.consumersinternational.org", "mdpi", "www.mdpi.com", "www.theconsumergoodsforum.com",
             "shapingtomorrow", "www.shapingtomorrow.com"]
def Extract_URLs_New(Topic):
    query = Topic
#    custom_path = 'C:/Users/Darcey/Documents/NEW_PYTHON'
#    driver = webdriver.Chrome(ChromeDriverManager().install())
#    driver_location = "C:\\Users\\Darcey\\Documents\\NEW_PYTHON\\chromedriver.exe"
    options = webdriver.ChromeOptions()
    options.add_argument('--lang=en,en_US')
    options.add_argument("--disable-notifications")
    options.add_argument('--log-level=3')
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    # options.add_argument('--disable-gpu')
    # options.add_argument('--no-sandbox')
    options.add_argument('Accept=text/html,application/xhtml+xml,application/xml;q=0.9,image/webp')
    # options.add_argument('Accept-Encoding= gzip')
    # options.add_argument('Accept-Language= en-US,en;q=0.9,es;q=0.8')
    # options.add_argument('Upgrade-Insecure-Requests: 1')
    # options.add_argument('image/apng,*/*;q=0.8,application/signed-exchange;v=b3')
    # options.add_argument('user-agent=' + ua['google chrome'])
    # options.add_argument('proxy-server=' + "115.42.65.14:8080")
    # options.add_argument('Referer=' + "https://www.google.com/")
    options.add_argument('--headless')
#    options.add_argument("--disable-extensions")
#    driver = webdriver.Chrome(executable_path=driver_location,options=options)
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

    driver.get("https://www.google.com/search?q={}&oq={}&hl=en&num=11".format(urllib.parse.quote(query),urllib.parse.quote(query)))
    p = driver.find_elements_by_class_name("tF2Cxc")
    titles = driver.find_elements_by_class_name("yuRUbf")
    descriptions = driver.find_elements_by_class_name("IsZvec")
    time.sleep(10)

    link_list = []
    description_list = []
    featured = False
    featured_links = 0
    title_list = []
    featured_max = 0
    featured_num = 0

    df1 = pd.DataFrame(columns=["URLs"])
    for index in range(len(p)):
        p_items = p[index].get_attribute("innerHTML")
    #    print(p_items)
        items_soup = BeautifulSoup(p_items,"html.parser")
        if(featured==False):
            if((len(items_soup.text.split("\n")) != 2)):
#                print(items_soup.text.split("\n"))
    #            df = df.append({'A': items_soup.text.split("\n")
    #            df["B"] = items_soup.text.split("\n")[1]
    #            if ((items_soup.select(".IsZvec") != None)
    #                  and (items_soup.select(".IsZvec")[0].text != "") and (items_soup.select(".IsZvec") != "")):
                a = items_soup.select("a",recursive=False)[0]["href"]
                if not any(y in a.lower().split(".") for y in url_filter) or not any(z in a.lower().split("/") for z in url_filter):
#                    st.write(a)
                    df1 = df1.append({'URLs': a}, ignore_index = True)
#                    print(df1)
                    link_list.append(a)
        title_list.append(titles[index].text)
        description_list.append("descriptions[index].text")
    description_list_new = []
    title_list_new = []
    for index in range(len(description_list)):
    #    if (description_list[index] == ""):
    #        pass
    #    elif (re.findall(r'<\w{1,}\s\w{1,}>',description_list[index]) != []):
    #        pass
    #    else:
        description_list_new.append(description_list[index])
        title_list_new.append(title_list[index])
    description_list = description_list_new
    title_list = title_list_new

    df = pd.DataFrame(columns=["Title", "Description"])
    i=0
    for title in range(len(title_list)):
#        print(title_list[title])
#        print(description_list[title])
#        print("=======================")
        df = df.append({'Title': title_list[title], 'Description': description_list[title]}, ignore_index = True)
    #    df.loc[i].B = description_list[title]
        i+=1

    #print(link_list)
    #print(len(title_list))
    #print(len(link_list))

    #for x in link_list:
    #    print(x)
    df2 = pd.concat([df, df1], axis=1)
#    st.write(df2)
    return df2

##################### To add page number in the footer ###############################################

def create_element(name):
    return OxmlElement(name)


def create_attribute(element, name, value):
    element.set(ns.qn(name), value)


def add_page_number(paragraph):
    paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    page_run = paragraph.add_run()
    t1 = create_element('w:t')
    create_attribute(t1, 'xml:space', 'preserve')
    t1.text = 'Page '
    page_run._r.append(t1)

    page_num_run = paragraph.add_run()

    fldChar1 = create_element('w:fldChar')
    create_attribute(fldChar1, 'w:fldCharType', 'begin')

    instrText = create_element('w:instrText')
    create_attribute(instrText, 'xml:space', 'preserve')
    instrText.text = "PAGE"

    fldChar2 = create_element('w:fldChar')
    create_attribute(fldChar2, 'w:fldCharType', 'end')

    page_num_run._r.append(fldChar1)
    page_num_run._r.append(instrText)
    page_num_run._r.append(fldChar2)

    of_run = paragraph.add_run()
    t2 = create_element('w:t')
    create_attribute(t2, 'xml:space', 'preserve')
    t2.text = ' of '
    of_run._r.append(t2)

    fldChar3 = create_element('w:fldChar')
    create_attribute(fldChar3, 'w:fldCharType', 'begin')

    instrText2 = create_element('w:instrText')
    create_attribute(instrText2, 'xml:space', 'preserve')
    instrText2.text = "NUMPAGES"

    fldChar4 = create_element('w:fldChar')
    create_attribute(fldChar4, 'w:fldCharType', 'end')

    num_pages_run = paragraph.add_run()
    num_pages_run._r.append(fldChar3)
    num_pages_run._r.append(instrText2)
    num_pages_run._r.append(fldChar4)
    
##################### To add page number in the footer ###############################################

def remove_control_characters(s):
    return "".join(ch for ch in s if unicodedata.category(ch)[0]!="C")

wdFormatPDF = 17
def convertFiletoPDF(file):
    in_file = os.path.abspath(file)
    out_file = os.path.abspath(file.replace(".docx", ".pdf"))
    word = comtypes.client.CreateObject('Word.Application')
    word.Visible = True
    time.sleep(3)
    doc = word.Documents.Open(in_file)
    doc.SaveAs(out_file, FileFormat=wdFormatPDF)
    doc.Close()
    word.Quit()


def processed_text(doc, Topic, file):
    stop_words = set(stopwords.words('english'))
    new_words = ["definition","what", "define", "explain", "mean", "detail", "short", "note"]
    unwanted_words = ["course", "courses", "certification", "certifications", "professional", "professionals", 
                      "salary", "salaries", "week", "weeks", "accenture", "ibm"]
    stop_words = list(stop_words.union(new_words))
    document = Document(file)
    lkk=[]
    if any(x in Topic.lower().split() for x in new_words):
#        print("Present")
        test1 = [word for word in Topic.lower().split() if word not in stop_words]
        test2 = " ".join(test1)
#        st.write(test2)
        ind = [i for i, para in enumerate(document.paragraphs) if test2 in para.text.lower()]
#        print("============================", test2, "=================================")
#        print(len(ind))
        if ind:
            for i, para in enumerate(document.paragraphs):
    #            print(para.text.lower())
                if len(para.text.split(' '))>15:
                    if not any(y in para.text.lower().split(" ") for y in unwanted_words):
                        for k in range(len(ind)):
                            if i == ind[k]:
                                lkk.append(para.text)
#                                st.write(para.text)
                            lkk1 = [re.sub("\[.*?\]","", _) for _ in lkk]

                            if len(lkk) ==3:
                                break
#            for k in lkk1:
#                st.write(k)
#            st.write("Raw Text", " ".join(lkk1))
#            st.write("Summarized Text:", summarizer.summarize(" ".join(lkk1), ratio=0.4))
    else:
#        print("Not Present")
        def iter_headings(paragraphs):
            for paragraph in paragraphs:
                if paragraph.style.name.startswith('Heading'):
                    yield paragraph
        heads = []
        for heading in iter_headings(doc.paragraphs):
            if not any(z in heading.text.lower().split(" ") for z in unwanted_words):
        #    print(heading.text)
                heads.append(heading.text)
#        st.write(heads)
        
        lst1 = [re.sub("\[.*?\]","", _) for _ in heads]
        lst2 = [re.sub('[^a-zA-Z0-9\s]+', '', _) for _ in lst1]

        strlst = ",".join(lst2)
        #d(w) Degree of word only.
        r = Rake(ranking_metric=Metric.WORD_DEGREE, min_length=2, max_length=2) # Uses stopwords for english from NLTK, and all puntuation characters.

        r.extract_keywords_from_text(strlst)

        li = r.get_ranked_phrases() # To get keyword phrases ranked highest to lowest.

        li1 = li[:10]
#        st.write(li1)

#        file = "Model Output - " + str(Topic)+".docx"

        document = Document(file)

        lkk = []
        for x in li1:

            # Find the index of the text and store it
            ind = [i for i, para in enumerate(document.paragraphs) if x in para.text.lower()]
#            print("============================", x, "=================================")
#            print(len(ind))
            if ind:
                for i, para in enumerate(document.paragraphs):
                    if len(para.text.split(' '))>15:
                        if not any(y in para.text.lower().split(" ") for y in unwanted_words):
                            for k in range(len(ind)):
                                if i == ind[k]:
                                    lkk.append(para.text)
    #                                st.write(para.text) 
                                lkk1 = [re.sub("\[.*?\]","", _) for _ in lkk]

def img_chk(img1):
    IMAGE_PATH = img1

    reader = easyocr.Reader(['en'])
    result = reader.readtext(IMAGE_PATH,paragraph="False")
    return result

stop_words = set(stopwords.words("english"))
def clean_string(text):
    text = ''.join([word for word in text if word not in string.punctuation])
    text = text.lower()
    text = " ".join([word for word in text.split() if word not in stop_words])
    
    return text

unit2 = ["Introductoin to Machine Learning"]

summarizer = LsaSummarizer()

stopwords = stopwords.words('english')
summarizer.stop_words = stopwords
#course_name = input("Enter the Course Name Here:")

#for x in tqdm(unit2):


Topic = input("Enter the Topic Here: ")
new_words = ["definition","what", "define", "explain", "mean", "detail", "short", "note"]
unwanted_words = ["course", "courses", "certification", "certifications", "professional", "professionals", "salary", 
                  "salaries", "week", "weeks", "accenture", "ibm", "post", "posts", "contribute", "contributed", 
                  "remember", "career", "careers", "hiring", "recruitment", "blogs", "twitter", "resources", 
                  "community", "coursera", "glassdoor", "udacity", "frequently", "offered", "phd", "faq", "faqs", 
                  "reply", "article", "articles", "thank", "thanks", "about", "conclusion", "conclusions", "newsletter", 
                 "newsletters", "subscribe", "partner", "copyright", "copyrights", "journal", "journals", "contribution",
                 "contributions"]

i=1
#if len(Topic)>0:
#    if st.sidebar.button("Extract URLs for the given topic"):
#        with st.spinner("Extracting..."):
df2 = Extract_URLs_New(Topic)
df2_filter = df2[(df2['Title']!= "")]# | (df2['URLs']!= np.nan)]
df2_filter = df2_filter[(df2['URLs'].notnull())].copy()
df2_filter['url_rank'] = np.arange(len(df2_filter)) + 1
df3 = df2_filter[['url_rank', 'URLs']]
df3['URLRank_URLS'] = "URL #" + df3['url_rank'].astype(str) + " - " + df3['URLs']
#            clean_links = Extract_Ranked_urls(links)
print("Below are the top URLs to extract content:")

for x in df3['URLRank_URLS']:
    print(x)
#    st.sidebar.markdown("*******************************")
#    if st.sidebar.button("Download Contents from URLs"):
#        with st.spinner("Downloading..."):
df2 = Extract_URLs_New(Topic)
df2_filter = df2[df2['Title']!= ""]
df2_filter = df2_filter[(df2['URLs'].notnull())].copy()
#            clean_links = Extract_Ranked_urls(links)

############ Converting listof urls to dataframe and then to tuple to create the table in word document##########

#            df = pd.DataFrame(clean_links, columns = ['urls'])
df2_filter['url_rank'] = np.arange(len(df2_filter)) + 1
df3 = df2_filter[['url_rank', 'URLs']]
Search_for_These_values = ['youtube','.pdf','.pptx'] 
pattern = '|'.join(Search_for_These_values)
df4 = df3.loc[~df3['URLs'].str.contains(pattern, case=False)]
#            print (df1)
datat = tuple(df3.itertuples(index=False, name=None))


#            print(datat)
doccc1 = docx.Document()
#docc1.add_page_break()
#            text_split = []
df2 = Extract_URLs_New(Topic)
df2_filter = df2[df2['Title']!= ""]
df2_filter = df2_filter[(df2['URLs'].notnull())].copy()
#            clean_links = Extract_Ranked_urls(links)
ua = UserAgent()
i=1
#            urlheads = []
list3 =[]
for url in df2_filter['URLs']:
    urlheads = []
    with mock.patch.object(requests.cookies.RequestsCookieJar, 'update', lambda *args, **kwargs: 0):
        req = Request(url , headers={'User-Agent': ua.random})
        time.sleep(5)
        req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)')
        try:

            webpage = urlopen(req).read()


        except:
            try:
                chrome_options2 = Options()
                chrome_options2.add_argument("--headless")
                chrome_options2.add_argument('--log-level=3')
                chrome_options2.add_argument("--disable-gpu")
                chrome_options2.add_argument("--disable-notifications")
                chrome_options2.add_experimental_option('excludeSwitches', ['enable-logging'])
                driver2 = webdriver.Chrome(executable_path=ChromeDriverManager().install(), options = chrome_options2)
                driver2.get(url)
                # this is just to ensure that the page is loaded
                time.sleep(5) 
                html = driver2.page_source
                soup2 = BeautifulSoup(html, "html.parser")
                heading_tags = ["h1", "h2", "h3", "h4"]
                list2 = []
                list22=[]
                z=1
                for tags in soup2.find_all([heading_tags, 'p']):
                    if not tags.find([heading_tags, 'p']):
                        try:
                            list1 = tags.name + ' -> ' + tags.text.strip()
                        except:
                            pass

            except:
                pass

        else:
            try:
                page_soup = soup(webpage, "html.parser")
                time.sleep(5)
            except:
                pass
            else:
                heading_tags = ["h1", "h2", "h3", "h4"]
                list2 = []
#                        z=1
                for tags in page_soup.find_all([heading_tags, 'p']):
                    if not tags.find([heading_tags, 'p']):
                        try:
                            list1 = tags.name + ' -> ' + tags.text.strip()
                        except:
                            pass

                        list2.append(list1)


    list3.append("url -> " + url)
    list3.append(list2)


#doccc1 = docx.Document()


##################### To add Title Page ###############################################    

doccc1.add_paragraph("")
doccc1.add_paragraph("")
doccc1.add_paragraph("")
doccc1.add_paragraph("")
doccc1.add_paragraph("")
doccc1.add_paragraph("")
doccc1.add_paragraph("")
doccc1.add_paragraph("")
doccc1.add_paragraph("")
p=doccc1.add_paragraph()
p.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
r=p.add_run(Topic)
r.font.size = Pt(24)
r.bold = True
r.underline = True
r.font.color.rgb = RGBColor(0, 0, 153)
doccc1.add_page_break()

##################### To add table of contents - 3rd Document ###############################################



paragraph = doccc1.add_paragraph()
toc = paragraph.add_run("\t\t\t\t Table of Contents \t")
toc.bold = True
toc.font.size = Pt(15)
toc.font.color.rgb = RGBColor(0, 0, 153)
run = paragraph.add_run()
fldChar = OxmlElement('w:fldChar')  # creates a new element
fldChar.set(qn('w:fldCharType'), 'begin')  # sets attribute on element
instrText = OxmlElement('w:instrText')
instrText.set(qn('xml:space'), 'preserve')  # sets attribute on element
instrText.text = 'TOC \\o "1-2" \\h \\z \\u'   # change 1-3 depending on heading levels you need

fldChar2 = OxmlElement('w:fldChar')
fldChar2.set(qn('w:fldCharType'), 'separate')
fldChar3 = OxmlElement('w:updateFields')
fldChar3.set(qn('w:val'), 'true')
#fldChar3.text = "Right-click to update field."
fldChar2.append(fldChar3)

fldChar4 = OxmlElement('w:fldChar')
fldChar4.set(qn('w:fldCharType'), 'end')

r_element = run._r
r_element.append(fldChar)
r_element.append(instrText)
r_element.append(fldChar2)
r_element.append(fldChar4)
p_element = paragraph._p

doccc1.add_page_break()

##################### To add logo - 3rd Document ###############################################


logo_path = 'C:\\Users\\ds_008\\Downloads\\DSLogo.png'    # Path of the image file
section = doccc1.sections[0]   # Create a section
sec_header = section.header   # Create header 
header_tp = sec_header.add_paragraph()  # Add a paragraph in the header, you can add any anything in the paragraph
header_run = header_tp.add_run()   # Add a run in the paragraph. In the run you can set the values 
header_run.add_picture(logo_path, width=Inches(1.3))  # Add a picture and set width.
#rml_header = "\t Applied Artificial Intelligence for Schools Content \t Generation by Topic \t"
headr = course_name+ ": "+"Content Generated by Topic as a Reference Material"
header_run.add_text("\n                                                                                                        ")
#    header_run.add_text("Applied Artificial Intelligence for Schools Content Generation by Topic")
header_run.add_text(headr)
header_run.add_text("\n_______________________________________________________________________________________")
header_run.font.size =  Pt(13)
header_run.font.color.rgb = RGBColor(0, 0, 0)
header_run.font.bold = True


doccc1.add_paragraph('')

##################### To add footer with page number - 3rd Document ###############################################


section = doccc1.sections[0]
footer = section.footer
footer_para = footer.paragraphs[0]
footer_para.text = "_________________________________________________________________________________ \t \n\n © DeepSphere.AI | Confidential and Proprietary |Not for Distribution \t"
add_page_number(doccc1.sections[0].footer.paragraphs[0])

##################### To add table with url and its ranking - 3rd Document ###############################################

#Add heading for the table - 2nd Document


topic1 = "Extracted URls and its Ranking for the Given Topic: " + Topic
table_heading = doccc1.add_heading(topic1, 1)
doccc1.add_paragraph('')
table_heading.style.font.color.rgb = RGBColor(0, 0, 153)
table_heading.style.font.size = Pt(16)
table_heading.style.font.bold = True
table_heading.style.font.all_caps = True

table = doccc1.add_table(rows=1, cols=2)
row = table.rows[0].cells
row[0].text = 'URL Rank'
row[1].text = 'URLs'
for url_rank, urls in datat:
    row = table.add_row().cells
    row[0].text = str(url_rank) + "                               "
    row[1].text = urls

table.style = 'Colorful List'
table.autofit = False
table.allow_autofit = False
cell = table.rows[0].cells[0]
cell.paragraphs[0].alignment = WD_ALIGN_PARAGRAPH.CENTER
cell.paragraphs[0].vertical_alignment = WD_ALIGN_VERTICAL.CENTER
cell1 = table.rows[0].cells[1]
cell1.paragraphs[0].alignment = WD_ALIGN_PARAGRAPH.CENTER
cell1.paragraphs[0].vertical_alignment = WD_ALIGN_VERTICAL.CENTER
table.columns[0].width = Inches(1.6)
table.columns[1].width = Inches(5)
bolding_columns = [1]
for row in list(range(1, len(table.columns[1].cells))):
    for column in bolding_columns:
        table.rows[row].cells[column].paragraphs[0].runs[0].font.color.rgb = RGBColor(0, 0, 153)


doccc1.add_page_break()

def single_list(list,ignore_types=(str)): 
    for item in list:
        if isinstance(item, Iterable) and not isinstance(item, ignore_types):
            yield from single_list(item,ignore_types=(str))
        else:
            yield item

#Item_list = [10,20,[30,40],[50,'Null',70],100]
items_single=single_list(list3)
i=1
df = pd.DataFrame()
for item in items_single:
    df = df.append({'col1' : item}, ignore_index=True)
df = df.reset_index()
df['index'] = df['index'] +1
df_p = df[df['col1'].str.startswith('p -> ')]
df_notp = df[~df['col1'].str.startswith('p -> ')]
#    df_notp['col1'] = df_notp['col1'].str.replace("""[EDIT]""", "", regex=False)

cleaned = list(map(clean_string, df_p['col1']))
#print(cleaned)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(cleaned)

for x in range(0,X.shape[0]):
    for y in range(x,X.shape[0]):
        if(x!=y):
            #if(cosine_similarity(X[x],X[y])>threshold):
#                print("sent " + str(x) + ": \n",sentences[x])
#                print("sent " + str(y) + ": \n",sentences[y])
#                print("Cosine similarity between :" + "sent " + str(x) + " " + "sent " + str(y), " \n",cosine_similarity(X[x],X[y]))
#                print()
                if cosine_similarity(X[x],X[y]) >0.13:
                    df_p['col1'][y] = ""
dfnew1 = pd.concat([df_p, df_notp])
df_new = dfnew1.sort_values(by = "index")

df_p['col1'] = df_p['col1'].str.replace("p -> ", "")
df_p1 = df_p[df_p['col1'].apply(lambda x: len(x.split(' ')) > 15)]
df_p1 = df_p1[df_p1['col1'].apply(lambda x: len(x.split(' ')) > 50)]
list1 = df_p1['col1'].tolist()
text = "".join(list1)
text = re.sub(r"http\S+", "", text)

def tokenize_sentences(text):
    sentences = sent_tokenize(text)
    sentences = [sentence.strip() for sentence in sentences if len(sentence) > 20]
    return sentences  

if len(text)>=20:
    text = text.replace('\n',' ')

    
# sentences = summarize_text(text)
# sentences = [i for n, i in enumerate(tokenize_sentences(sentences)) if i not in tokenize_sentences(sentences)[:n]]
# sentences = fuzzy_dup_remove(sentences)
# split_list = []
# for i,sentence in enumerate(sentences):
#     if i <5:
#         pos = pos_tree_from_sentence(sentence)
#         split_sentence = get_np_vp(pos,sentence)
#         split_list.append(split_sentence)
#print('split_sentence in app.py- ',split_list)


alt_sent_list = []

sentences = summarize_text(text)
#print("#####################Summarize Text", sentences, "#####################")
sentences = [i for n, i in enumerate(tokenize_sentences(sentences)) if i not in tokenize_sentences(sentences)[:n]]
#print("#####################tokenize_sentences", sentences, "#####################")
sentences = fuzzy_dup_remove(sentences)
#print("#####################fuzzy_dup_remove", sentences, "#####################")
flat_list = alternate_sentences(sentences)

true_sents = flat_list[1::2]
false_sents = flat_list[0::2]
# print("################ True Sentences ################### \n")
# i=0
# for x in true_sents:
#     i+=1
#     print(i, " - ", x, "\n")
# print("\n\n")
# print("################ False Sentences ################### \n")
# j=0
# for y in false_sents:
#     j+=1
#     print(j, " - ", y, "\n")

            # list_write = flat_list
            # random.shuffle(list_write)
            # for i,sentence in enumerate(sentences):
            #     if i <5:
            #         pos = pos_tree_from_sentence(sentence)
            #         alt_sentence = alternate_sentences(pos,sentence)
            #         alt_sent_list.append(alt_sentence)
            #         flat_list = [item for sublist in alt_sent_list for item in sublist]
#                 output_file(text,"Input Text")
#                 output_file(flat_list,quest)
#             li2 = []
#             # for i in flat_list:
#             #     for j in list_write:
#             #         print('i value - ',i)
#             #         print('j value - ',j)
#             #         if i==j:
#             #             li2.append('True')
#             #         else:
#             #             li2.append('False')
#             #         break
#             for i,itm in enumerate(flat_list):
#                 if i%2==0:
#                     li2.append('False')
#                 else:
#                     li2.append('True')
#             doc = DocxTemplate("TrueFalse_Template.docx")
#             quest = list(zip(flat_list,li2))
# #                 print('questttttttttttttttttt - ',quest)
#             context = get_context_tf(quest,unit,grade,title)
#             doc.render(context)
# # #                 doc.save("complex_case_manageme   nt_true_or_false.docx")
#             doc.save("School-Questions/8th/"+"unit"+context["unit"]+"_true_or_false.docx")
#     else:
#         st.error("Please select input file!")
# vm_col1,vm_col2,vm_col3 = st.beta_columns((1,1,2))
# vm_col1.success("Validate Model")
# vm_col2.success("Step 6")
# if vm_col3.button('View Model Outcome'):
#     if text is not None:
#         alt_sent_list = []
#         sentences = summarize_text(text)
#         sentences = [i for n, i in enumerate(tokenize_sentences(sentences)) if i not in tokenize_sentences(sentences)[:n]]
#         sentences = fuzzy_dup_remove(sentences)
#         flat_list = list(alternate_sentences(sentences))

#    nname = Topic + ".csv"
#    df_new.to_csv(nname)

#     i=1
#     for x in range(len(df_new['col1'])):
#     #    i=1
#         try:
#             if df_new['col1'][x].startswith('url -> '):
#                 para1 = doccc1.add_paragraph().add_run("-----------------------------------------------------------------------------------------")
#                 para1.bold = True
#                 para1.font.size = Pt(14)
#                 para1.font.color.rgb = RGBColor(0, 0, 153)
#             #    docu.add_paragraph("")
#                 txt1 = "**Content Set #" + str(i) + "**"
#                 para2 = doccc1.add_paragraph().add_run(txt1)
#                 para2.bold = True
#                 para2.font.size = Pt(14)
#                 para2.font.color.rgb = RGBColor(0, 0, 153)
#             #    docu.add_paragraph("")
#             #    docu.add_paragraph("")
#                 txt2 = "URL #" + str(i) + ":    " + df_new['col1'][x].replace("url -> ", "")
#                 para3 = doccc1.add_paragraph().add_run(txt2)
#                 para3.bold = True
#                 para3.font.size = Pt(14)
#                 para3.font.color.rgb = RGBColor(0, 0, 153)
#             #    docu.add_paragraph("")
#                 para4 = doccc1.add_paragraph().add_run("-----------------------------------------------------------------------------------------")
#                 para4.bold=True
#                 para4.font.size = Pt(14)
#                 para4.font.color.rgb = RGBColor(0, 0, 153)
#                 i+=1
#             elif df_new['col1'][x] == "":
#                 pass
#             elif df_new['col1'][x].startswith("h") and df_new['col1'][x+1].startswith('url -> '):
#                 df_new['col1'][x] = ""
#             elif df_new['col1'][x].startswith("h") and df_new['col1'][x+1].startswith("h"):
#                 df_new['col1'][x] = ""
#             elif df_new['col1'][x].startswith("h") and df_new['col1'][x+1] == "":
#                 df_new['col1'][x] = ""
#             elif df_new['col1'][x].startswith("h") and df_new['col1'][x+1].startswith("p") and len(df_new['col1'][x+1].split()) < 15:
#                 df_new['col1'][x] = ""
#                 df_new['col1'][x+1] = ""
#             elif df_new['col1'][x].startswith("p") and len(df_new['col1'][x].split()) < 15:
#                 df_new['col1'][x] = ""
#             elif df_new['col1'][x].startswith("h") and any(y in df_new['col1'][x].lower().split(" ") for y in unwanted_words):
#                 df_new['col1'][x] = ""
#             elif df_new['col1'][x] == df_new['col1'][x+1]:
#                 df_new['col1'][x] = ""
#             elif df_new['col1'][x].startswith("h1") and df_new['col1'][x+1].startswith("p"):
#                 h1 = doccc1.add_heading(df_new['col1'][x].replace("h1 -> ", ""), 1)
#                 h1.style.font.color.rgb = RGBColor(0, 0, 139)
#                 h1.style.font.size = Pt(14)
#                 h1.style.font.bold = True
#                 h1.style.font.all_caps = True
#             elif df_new['col1'][x].startswith("h2") and df_new['col1'][x+1].startswith("p"):
#                 h2 = doccc1.add_heading(df_new['col1'][x].replace("h2 -> ", ""), 2)
#                 h2.style.font.color.rgb = RGBColor(0, 0, 139)
#                 h2.style.font.size = Pt(14)
#                 h2.style.font.bold = True
#                 h2.style.font.all_caps = True
#             elif df_new['col1'][x].startswith("h3") and df_new['col1'][x+1].startswith("p"):
#                 h3 = doccc1.add_heading(df_new['col1'][x].replace("h3 -> ", ""), 3)
#                 h3.style.font.color.rgb = RGBColor(0, 0, 139)
#                 h3.style.font.size = Pt(14)
#                 h3.style.font.bold = True
#                 h3.style.font.all_caps = True
#             elif df_new['col1'][x].startswith("h4") and df_new['col1'][x+1].startswith("p"):
#                 h4 = doccc1.add_heading(df_new['col1'][x].replace("h4 -> ", ""), 4)
#                 h4.style.font.color.rgb = RGBColor(0, 0, 139)
#                 h4.style.font.size = Pt(14)
#                 h4.style.font.bold = True
#                 h4.style.font.all_caps = True
#             else:
#                 try:
#                     df_new['col1'][x].replace("p -> ", "")
#                 except Exception as ee:
#                     doccc1.add_paragraph(str(ee))
#                 else:
#                     if (df_new['col1'][x].startswith("p")) and ((len(df_new['col1'][x].split(". "))) > 3):
#                         doccc1.add_paragraph(summarizer(remove_control_characters(df_new['col1'][x].replace('\x00','')).replace("p -> ", ""), (len(df_new['col1'][x].split(". ")) +1)/2))
#                     else:
#                         doccc1.add_paragraph(remove_control_characters(df_new['col1'][x].replace('\x00','')).replace("p -> ", ""))
#         except:
#             break
#         else:
#             pass
#     doccc1.save(str(Topic)+".docx")
#     convertFiletoPDF(str(Topic)+".docx")
now = datetime.now()
endtime = now.strftime("%Y-%m-%d %H:%M:%S")

fmt = '%Y-%m-%d %H:%M:%S'
d1 = datetime.strptime(starttime, fmt)
d2 = datetime.strptime(endtime, fmt)

diff = d2 -d1
diff_minutes = (diff.days * 24 * 60) + (diff.seconds/60)

print(starttime)
print(endtime)
print(diff_minutes)
print(flat_list)

Enter the Topic Here: Introduction to Machine Learning




2022-02-24 15:52:03.006 

Current google-chrome version is 98.0.4758
2022-02-24 15:52:06.838 Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
2022-02-24 15:52:06.844 Get LATEST chromedriver version for 98.0.4758 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/98.0.4758.102/chromedriver_win32.zip
2022-02-24 15:52:07.061 Trying to download new driver from https://chromedriver.storage.googleapis.com/98.0.4758.102/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\ds_008\.wdm\drivers\chromedriver\win32\98.0.4758.102]
2022-02-24 15:52:09.215 Driver has been saved in cache [C:\Users\ds_008\.wdm\drivers\chromedriver\win32\98.0.4758.102]


2022-02-24 15:52:31.864 



Below are the top URLs to extract content:
URL #1 - https://www.digitalocean.com/community/tutorials/an-introduction-to-machine-learning#:~:text=Machine%20learning%20is%20a%20subfield%20of%20artificial%20intelligence%20(AI).&text=Because%20of%20this%2C%20machine%20learning,has%20benefitted%20from%20machine%20learning.
URL #2 - https://www.geeksforgeeks.org/introduction-machine-learning/
URL #3 - https://towardsdatascience.com/introduction-to-machine-learning-for-beginners-eed6024fdb08
URL #4 - https://developers.google.com/machine-learning/crash-course/ml-intro
URL #5 - https://www.edureka.co/blog/introduction-to-machine-learning/
URL #6 - https://www.javatpoint.com/machine-learning
URL #7 - https://www.udacity.com/course/intro-to-machine-learning--ud120
URL #8 - https://www.simplilearn.com/tutorials/machine-learning-tutorial/introduction-to-machine-learning
URL #9 - https://www.softwaretestinghelp.com/machine-learning-tutorials/
URL #10 - https://www.coursera.org/learn/machine-learnin

Current google-chrome version is 98.0.4758
2022-02-24 15:52:34.409 Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
2022-02-24 15:52:34.413 Get LATEST chromedriver version for 98.0.4758 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/98.0.4758.102/chromedriver_win32.zip
2022-02-24 15:52:34.542 Trying to download new driver from https://chromedriver.storage.googleapis.com/98.0.4758.102/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\ds_008\.wdm\drivers\chromedriver\win32\98.0.4758.102]
2022-02-24 15:52:35.606 Driver has been saved in cache [C:\Users\ds_008\.wdm\drivers\chromedriver\win32\98.0.4758.102]


2022-02-24 15:52:52.123 

Current google-chrome version is 98.0.4758
2022-02-24 15:52:54.456 Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
2022-02-24 15:52:54.459 Get LATEST chromedriver version for 98.0.4758 google-c

15
2022-02-24 15:51:50
2022-02-24 16:07:12
15.366666666666667


["After learning the initial steps of Reinforcement Learning, we'll see how to use them in some cases.",
 "After learning the initial  steps of Reinforcement Learning, we'll move to Q Learning, as well as Deep Q Learning.",
 'The below block diagram explains the working of Machine Learning algorithm:The need for machine learning is a necessity in all programming applications and also can help us to make our data more accessible.',
 'The below block diagram explains the working of Machine Learning algorithm:The need for machine learning is increasing day by day.',
 "If it's less than 0.5, you won’t be able to.In this section of the introduction to machine learning tutorial, we will discuss some amazing use cases of using a model-learned AI (AI) for various cognitive functions - tasks like classification and decision making, data analysis etc.",
 "If it's less than 0.5, you won’t be able to.In this section of the introduction to machine learning tutorial, we will discuss some amazing use