In [None]:

import spacy
from spacy.lang.en import English
import networkx as nx
import matplotlib.pyplot as plt

def getSentences(text):
    """Split ``text`` into sentences with spaCy's rule-based sentencizer.

    Args:
        text: Raw text to segment.

    Returns:
        A list of sentence strings with surrounding whitespace stripped.
    """
    nlp = English()
    # spaCy v3 adds built-in components by name; v2 expects the component
    # object produced by ``create_pipe``.
    if int(spacy.__version__.split('.')[0]) >= 3:
        nlp.add_pipe('sentencizer')
    else:
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
    document = nlp(text)
    # ``Span.text`` is available in both major versions, whereas
    # ``Span.string`` no longer exists in v3.
    return [sent.text.strip() for sent in document.sents]

def printToken(token):
    """Print a token's text and its dependency label as ``text -> dep``."""
    text, dependency = token.text, token.dep_
    print(text, "->", dependency)

def appendChunk(original, chunk):
    """Return ``original`` and ``chunk`` joined by a single space."""
    return ' '.join((original, chunk))

def isRelationCandidate(token):
    """Return True when the token's dependency label contains a relation marker."""
    for fragment in ("ROOT", "adj", "attr", "agent", "amod"):
        if fragment in token.dep_:
            return True
    return False

def isConstructionCandidate(token):
    """Return True when the token's dependency label contains a construction marker."""
    for fragment in ("compound", "prep", "conj", "mod"):
        if fragment in token.dep_:
            return True
    return False

def processSubjectObjectPairs(tokens):
    """Build one (subject, relation, object) triple from a parsed sentence.

    Every token is printed with its dependency label, punctuation is skipped,
    lemmas of relation candidates are accumulated into the relation, and the
    text of tokens whose label contains "subj" / "obj" is accumulated into
    the subject / object.

    Args:
        tokens: Iterable of tokens exposing ``text``, ``lemma_`` and ``dep_``.

    Returns:
        A tuple ``(subject, relation, object)`` of whitespace-stripped strings.
    """
    subject = ''
    object = ''  # NOTE(review): shadows the built-in ``object`` inside this function.
    relation = ''
    subjectConstruction = ''
    objectConstruction = ''
    for token in tokens:
        printToken(token)
        if "punct" in token.dep_:
            continue
        if isRelationCandidate(token):
            relation = appendChunk(relation, token.lemma_)
        if isConstructionCandidate(token):
            # NOTE(review): both buffers start empty and are only extended when
            # already non-empty, so as written neither branch can ever run and
            # construction tokens are never collected. Confirm the intent.
            if subjectConstruction:
                subjectConstruction = appendChunk(subjectConstruction, token.text)
            if objectConstruction:
                objectConstruction = appendChunk(objectConstruction, token.text)
        if "subj" in token.dep_:
            subject = appendChunk(subject, token.text)
            # Prefix the pending construction buffer, then reset it.
            subject = appendChunk(subjectConstruction, subject)
            subjectConstruction = ''
        if "obj" in token.dep_:
            object = appendChunk(object, token.text)
            # Prefix the pending construction buffer, then reset it.
            object = appendChunk(objectConstruction, object)
            objectConstruction = ''

    print (subject.strip(), ",", relation.strip(), ",", object.strip())
    return (subject.strip(), relation.strip(), object.strip())

def processSentence(sentence):
    """Parse ``sentence`` with the global ``nlp_model`` and extract its triple."""
    parsed = nlp_model(sentence)
    triple = processSubjectObjectPairs(parsed)
    return triple

def printGraph(triples):
    """Draw the triples as an undirected graph.

    Each triple contributes three nodes (its first three items) and two
    edges: first-second and second-third.
    """
    graph = nx.Graph()
    for triple in triples:
        first, second, third = triple[0], triple[1], triple[2]
        for node in (first, second, third):
            graph.add_node(node)
        for edge in ((first, second), (second, third)):
            graph.add_edge(*edge)

    layout = nx.spring_layout(graph, k=0.15, iterations=20)
    # Label every node with its own value.
    node_labels = {}
    for node in graph.nodes():
        node_labels[node] = node

    plt.figure()
    nx.draw(
        graph,
        layout,
        edge_color='black',
        width=1,
        linewidths=1,
        node_size=500,
        node_color='seagreen',
        alpha=0.9,
        labels=node_labels,
    )
    plt.axis('off')
    plt.show()

if __name__ == "__main__":

    # Sample corpus. Each literal ends with a space so that consecutive
    # sentences are separated by whitespace once the literals are
    # concatenated; without it they were glued together ("ghidorah.ten",
    # "hdr10the"), which presumably prevents the sentencizer from seeing a
    # sentence boundary there.
    text = (
        "legendary's only mandate was to include monarch, rodan, mothra, and king ghidorah. "
        "ten writers contributed to building on the treatment. "
        "the script took a year to come together. "
        "dougherty also changed, revised, and improved lines during filming and post-production. "
        "due to this, the film became an ensemble piece. "
        "it can't just look like big dinosaurs. "
        "other actors perform the body. "
        "production designer scott chambliss managed all the art directors. "
        "the single was released on may 13, 2019. "
        "all tracks are written by bear mccreary, except where noted. "
        "the score is also conducted by mccreary. "
        "on december 10, 2018, the film's first teaser poster and ccxp trailer were released. "
        "in april 2019, the main theatrical poster was released online. "
        "the film was originally scheduled to be released on june 8, 2018. "
        "the collectible tickets were offered in two sizes: standard  and godzilla-sized . "
        "the 4k release includes hdr10, hdr10 "
        "the retail exclusives will also include limited special clear files. "
        "such heroes are ready with one-liners, puns, and dry quips. "
        "it was action with a science fiction twist. "
        "currently, action films requiring extensive stunt work and special effects tend to be expensive. "
        "examples include the indiana jones franchise and many superhero films. "
        "themes or elements often prevalent in typical action-horror films "
        "paul blart: mall cop is a recent spoof of this trend ."
    )

    sentences = getSentences(text)
    # processSentence reads this module-level name.
    nlp_model = spacy.load('en_core_web_sm')

    print (text)
    triples = [processSentence(sentence) for sentence in sentences]

    printGraph(triples)






In [None]:
from bs4 import BeautifulSoup
import urllib
import requests
import re,math
import os
from sklearn.feature_extraction.text import CountVectorizer
import codecs
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Downloads one hyperlink and writes the text of its <p> elements to a text file.
def func(i,name):
    """Fetch URL ``i`` and store its paragraph text, one paragraph per line, in ``name``.

    Args:
        i: URL to download.
        name: Path of the text file to create.
    """
    html = requests.get(i).content
    # 1 Recoding: decode as ISO-8859-1, then drop every non-ASCII character.
    unicode_str = html.decode("ISO-8859-1")
    encoded_str = unicode_str.encode("ascii",'ignore')
    news_soup = BeautifulSoup(encoded_str, "html.parser")
    a_text = news_soup.find_all('p')
    # 2 Removing: strip the tags that remain inside each paragraph.
    y=[re.sub(r'<.+?>',r'',str(a)) for a in a_text]

    # The file is closed (and flushed) before it is renamed; the original
    # renamed a handle that was still open.
    with open('test.txt', 'w') as file1:
        for item in y:
            file1.write("%s\n" % item)
    os.rename("test.txt",name)

# Computes the cosine similarity between the query vector and a document vector.
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two term -> count mappings (0.0 if either is empty)."""
    shared_terms = set(vec1.keys()) & set(vec2.keys())
    dot_product = 0
    for term in shared_terms:
        dot_product += vec1[term] * vec2[term]

    norm1 = math.sqrt(sum([count ** 2 for count in vec1.values()]))
    norm2 = math.sqrt(sum([count ** 2 for count in vec2.values()]))
    denominator = norm1 * norm2

    if denominator:
        return float(dot_product) / denominator
    return 0.0

def text_to_vector(text):
    """Return a Counter of the word tokens found in ``text`` by the WORD pattern."""
    return Counter(WORD.findall(text))

# Pattern for one token: a run of word characters.
WORD = re.compile(r'\w+')


#TASK -1
# The Wikipedia page of IIT Delhi is the starting point.
url = "https://en.wikipedia.org/wiki/Indian_Institute_of_Technology_Delhi"
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'lxml')

# 'links' holds the href of every anchor whose target starts with "http://".
http_href = re.compile("^http://")
links = [anchor.get('href') for anchor in soup.findAll('a', attrs={'href': http_href})]
print('\n')
print('The hyperlinks found on the wikiperia page of IIT Delhi: ')
print(links)
print('\n')

#TASK -2
# Every hyperlink is downloaded and its content is stored in its own text file.
# One file name is generated per link (instead of a fixed 50) so that 'name'
# and 'links' always have the same length and stay aligned.
name = ["file" + str(i) + ".txt" for i in range(len(links))]
for link_url, file_name in zip(links, name):
    func(link_url, file_name)

# Each stored document is tokenized and English stop words are removed.
stop_words = set(stopwords.words('english'))

main = []
for item in name:
    with codecs.open(item, encoding='utf-8') as file1:
        word_tokens = word_tokenize(file1.read())
    # The filtered text is rebuilt from scratch for every document; the
    # original kept appending to one string, so each document also contained
    # all previous ones.
    main.append(" ".join(w for w in word_tokens if w not in stop_words))

#TASK-3
# Vectorization of the documents and their terms.
vectorizer = CountVectorizer()
p = vectorizer.fit_transform(main)
print('The matrix after vectorization of the documents :')
print('\n')
print(p.toarray())
print('\n')

#Task-4
# The query is read from the user and its cosine similarity with every
# document is calculated.
print('Enter a query: ')

all_cos = []
text1 = input()  # Python 3 replacement for raw_input()
vector1 = text_to_vector(text1)
for i, file_name in enumerate(name):
    with codecs.open(file_name, encoding='ISO-8859-1') as doc_file:
        text2 = doc_file.read()
    vector2 = text_to_vector(text2)
    cosine = get_cosine(vector1, vector2)
    all_cos.append(cosine)
    print ('Cosine:', i, cosine)

rank = sorted(all_cos, reverse=True)

#TASK-5
# The similarity ranking and the URLs of the top 10 documents are displayed.
print('Rank of documents based on similarity is as follows:')
print('\n')
for i, score in enumerate(rank):
    print ('Rank:', i,': ', score)

# Document positions ordered by descending similarity (ties keep their
# original order). Indexing through these positions keeps every score paired
# with its own URL; removing items from all_cos shifted the later indices.
top_positions = sorted(range(len(all_cos)), key=all_cos.__getitem__, reverse=True)[:10]
for j, maxpos in enumerate(top_positions, start=1):
    s = all_cos[maxpos]
    print('\n')
    print ('Document ',j)
    print('Value of similarity :',s)
    print ('URL for that page is :',links[maxpos])
    print('\n')
    



In [None]:
import re
import argparse
from urllib.parse import urljoin, urlparse
from urllib.request import urlopen
from urllib.error import HTTPError
from collections import Counter, defaultdict
from math import log10
from bs4 import BeautifulSoup
import numpy as np

# Mixing factor used by prob_to_transition: every matrix entry becomes
# entry * (1 - prob) + prob / N.
prob = 0.05
# page_rank stops iterating once the summed absolute change between two
# consecutive rank vectors is at most this value.
target_delta = 0.04

# Terms that get_terms removes from every term list.
stop_words = [
    'a', 'also', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'do',
    'for', 'have', 'is', 'in', 'it', 'of', 'or', 'see', 'so',
    'that', 'the', 'this', 'to', 'we'
]

# Command-line input: the search query followed by the list of seed urls.
# NOTE(review): this definition shadows the built-in input() for the rest of the session.
def input(argv=None):
    """Parse the search query and the seed urls.

    Args:
        argv: Optional list of argument strings. When None, argparse reads
            the process command line.

    Returns:
        An argparse namespace with ``query`` (list of query words) and
        ``url`` (list of seed urls), the attribute names read by ``main``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    # The first argument of add_argument is the attribute name, not a value:
    # it must be 'query' / 'url' for args.query / args.url to exist.
    parser.add_argument(
        'query',
        type=str,
        nargs='+',
        help='Search query string can contain multiple words'
    )
    parser.add_argument(
        'url',
        type=str,
        nargs='+',
        help='At least one seed url for the crawler to start from'
    )
    return parser.parse_args(argv)

def crawler(urls, _frontier=None, _bases=None):
    '''
    Takes a list of urls as argument and crawls them recursively until
    no new url can be found.
    Returns a sorted list of tuples (url, content, links).
    `links` is a list of urls.

    `_frontier` (url -> page tuple) and `_bases` (allowed network locations)
    are internal state shared by the recursive calls. A fresh frontier is
    created per top-level call, so results from an earlier crawl no longer
    leak into the next one through a shared default dict.
    '''
    from urllib.error import URLError

    if _frontier is None:
        _frontier = {}
    if not _bases:
        _bases = [urlparse(u).netloc for u in urls]
    for url in [u.rstrip('/') for u in urls]:
        if url in _frontier:
            continue
        try:
            response = urlopen(url)
        except URLError as e:
            # URLError also covers HTTPError; an unreachable host is skipped
            # like an HTTP error instead of aborting the whole crawl.
            print(e, url)
            continue

        page = parse(response, url, _bases)
        print('crawled %s with %s links' % (url, len(page[2])))
        _frontier[url] = page
        # Recurses once per newly discovered page.
        crawler(page[2], _frontier, _bases)
    return sorted(_frontier.values())


def parse(html, url, bases):
    '''
    Takes an html string and a url as arguments.
    Returns a tuple (url, content, links) parsed from the html.

    `content` is the stripped text of the body ('' when the document has no
    body). `links` are the absolute targets of the anchors that carry an
    href and whose network location is listed in `bases`.
    '''
    data = BeautifulSoup(html, 'lxml')

    # A document without <body> used to raise AttributeError, which left
    # `content` and `links` unbound at the return statement.
    body = data.body
    content = body.get_text().strip() if body is not None else ''

    # Anchors without an href are skipped: urljoin(url, None) returns `url`
    # itself, which turned each of them into a self-link.
    links = [urljoin(url, a['href']) for a in data.findAll('a', href=True)]
    links = [l for l in links if urlparse(l).netloc in bases]

    return url, content, links

def page_rank(pages):
    '''
    Iterates the rank vector until it changes by at most target_delta.

    Returns a list of rank vectors, one per round, starting with the
    uniform vector; each vector has one value per document.
    '''
    page_count = len(pages)
    transitions = create_transition_matrix(pages)
    history = [[1 / page_count] * page_count]
    converged = False
    while not converged:
        previous = history[-1]
        current = previous * transitions
        converged = get_delta(current, previous) <= target_delta
        # Store the new round as a plain array.
        history.append(np.squeeze(np.asarray(current)))
    return history

def create_transition_matrix(pages):
    '''
    Returns a matrix with one row per document (built from that
    document's links) and one column per document url.
    Each cell contains the probability for a document
    to transition to a link.
    '''
    links = get_links(pages)
    # The urls are taken straight from the page tuples: no get_urls helper
    # is defined in this notebook, so calling it raised NameError.
    urls = [url for url, _content, _links in pages]
    N = len(pages)
    m = np.matrix([[weight_link(N, u, l) for u in urls] for l in links])
    return prob_to_transition(N, m)


def weight_link(N, url, links):
    '''
    Weight of the transition to `url` from a document with `links`:
    1 / N when the document has no links, 1 / len(links) when `url` is
    among them, otherwise 0.
    '''
    if not links:
        return 1 / N
    return 1 / len(links) if url in links else 0


def prob_to_transition(N, m):
    '''
    Scales `m` by (1 - prob) and adds prob / N to every entry.
    '''
    scaled = m * (1 - prob)
    uniform_share = prob / N
    return scaled + uniform_share


def get_delta(a, b):
    '''
    Returns the sum of the absolute element-wise differences of a and b.
    '''
    difference = a - b
    return np.abs(difference).sum()




def get_links(pages):
    '''
    Returns the `links` entry of every (url, content, links) page tuple.
    '''
    collected = []
    for _url, _content, links in pages:
        collected.append(links)
    return collected


def best_rank(ranks, pages):
    '''
    Pairs every document url with its value in the last rank vector.
    Returns a dict with document urls as keys and ranks as values.
    '''
    urls = []
    for url, _content, _links in pages:
        urls.append(url)
    final_ranks = ranks[-1]
    return {url: score for url, score in zip(urls, final_ranks)}

# Index

def create_index(pages):
    '''
    Builds the index: a dict mapping each term to a list of
    (url, count) tuples, where count is the number of times the term
    occurs in that document.
    '''
    postings_by_term = defaultdict(list)
    for url, content, links in pages:
        for term, count in count_terms(content).items():
            postings_by_term[term].append((url, count))
    return postings_by_term


def count_terms(content):
    '''
    Takes a text string and returns a Counter mapping each of its
    terms to the number of occurrences.
    '''
    tally = Counter()
    tally.update(get_terms(content))
    return tally


# Matches every run of characters other than lower-case letters and digits.
normalize = re.compile('[^a-z0-9]+')


def get_terms(s):
    '''
    Get a list of terms from a string.
    Terms are lower case and all special characters are removed.
    Stop words are dropped, and so are tokens that consist only of
    special characters: they normalize to '' and would otherwise be
    indexed as an empty term.
    '''
    normalized = (normalize.sub('', t.lower()) for t in s.split())
    return [t for t in normalized if t and t not in stop_words]


def weight_index(index, N):
    '''
    Converts the counts of an index into tf_idf weights.
    `index` maps terms to lists of (url, count); `N` is the total number
    of documents. Returns a new index mapping terms to lists of
    (url, weight).
    '''
    weighted = defaultdict(list)
    for term, postings in index.items():
        document_frequency = len(postings)
        for posting in postings:
            url, count = posting
            weighted[term].append((url, tf_idf(count, N, document_frequency)))
    return weighted


def tf_idf(tf, N, df):
    '''
    Returns the tf-idf weight (1 + log10(tf)) * log10(N / df).

    tf is the number of occurrences of the term in the document, N the
    total number of documents and df the number of documents containing
    the term. The term-frequency factor is parenthesized explicitly: without
    the parentheses the product bound tighter and the result was
    1 + log10(tf) * log10(N / df).
    '''
    term_weight = 1 + log10(tf)
    inverse_document_frequency = log10(N / df)
    return term_weight * inverse_document_frequency

# term-frequency weight:       1 + log10(tf)
# inverse document frequency:  log10(N / df)


def normalize_index(index):
    '''
    Takes an index as argument.
    Returns a new index with normalized weights: every weight is divided
    by the vector length of its document. A document whose vector length
    is zero gets weight 0.0 instead of a division by zero.
    '''
    lengths = doc_lengths(index)
    norm_index = defaultdict(list)
    for term, docs in index.items():
        for url, weight in docs:
            length = lengths[url]
            normalized = weight / length if length else 0.0
            norm_index[term].append((url, normalized))
    return norm_index


def doc_lengths(index):
    '''
    Collects, for every document url, the weights of all its terms and
    returns a dict mapping each url to the norm of that weight vector.
    '''
    weights_by_url = {}
    for postings in index.values():
        for url, weight in postings:
            weights_by_url.setdefault(url, []).append(weight)
    lengths = {}
    for url, weights in weights_by_url.items():
        lengths[url] = np.linalg.norm(weights)
    return lengths


# Search & Scoring

def cosine_similarity(index, N, query):
    '''
    query is a string of terms or a list of such strings.
    Returns a list of tuples (url, score) sorted by descending score.
    Score is the sum, over the query terms found in the index, of the
    document weight times the normalized query weight.
    '''
    scores = defaultdict(int)
    # The query is normalized with get_terms, like the indexed documents.
    # The original took query[0] and iterated over it, i.e. over the single
    # characters of the first query word.
    if isinstance(query, str):
        terms = get_terms(query)
    else:
        terms = [t for part in query for t in get_terms(part)]
    qw = {t: tf_idf(1, N, len(index[t])) for t in terms if t in index}
    query_len = np.linalg.norm(list(qw.values()))
    if not query_len:
        # No query term is indexed, or every query weight is zero.
        return []
    for term in qw:
        query_weight = qw[term] / query_len
        for url, weight in index[term]:
            scores[url] += weight * query_weight
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)


def combined_search(index, N, rank, query):
    '''
    Multiplies every cosine-similarity score by the rank of its document.
    Returns a list of tuples (url, score) sorted by descending score.
    '''
    combined = []
    for doc, score in cosine_similarity(index, N, query):
        combined.append((doc, score * rank[doc]))
    combined.sort(key=lambda pair: pair[1], reverse=True)
    return combined


def print_combined_search(index, N, rank, query):
    '''
    Prints a header with the query, then one "score  url" line per result
    of combined_search.
    '''
    print('Search results for "%s":' % (query))
    results = combined_search(index, N, rank, query)
    for result in results:
        url, score = result
        print('%.6f  %s' % (score, url))


def main():
    '''
    Reads the arguments, crawls the seed urls, computes PageRank and the
    normalized tf-idf index, then prints summary figures and the search
    results for the query.
    '''
    args = input()

    # Computing
    pages = crawler(args.url)
    ranks = page_rank(pages)
    rank = best_rank(ranks, pages)
    N = len(pages)
    index = create_index(pages)
    norm_index = normalize_index(weight_index(index, N))

    # Print results
    summary = (
        ('Number of pages:', len(pages)),
        ('Terms in index:', len(index)),
        ('Iterations for PageRank:', len(ranks)),
    )
    print()
    for label, value in summary:
        print(label, value)
    print()
    print_combined_search(norm_index, N, rank, args.query)


# Run the search only when this code is executed as the main module.
if __name__ == "__main__":
    main()


In [None]:
from bs4 import BeautifulSoup
import urllib2
import requests
import re,math
import os
from sklearn.feature_extraction.text import CountVectorizer
import codecs
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Downloads one hyperlink and writes the text of its <p> elements to a text file.
def func(i,name):
    """Fetch URL ``i`` and store its paragraph text, one paragraph per line, in ``name``.

    Args:
        i: URL to download.
        name: Path of the text file to create.
    """
    html = requests.get(i).content
    # 1 Recoding: decode as ISO-8859-1, then drop every non-ASCII character.
    unicode_str = html.decode("ISO-8859-1")
    encoded_str = unicode_str.encode("ascii",'ignore')
    news_soup = BeautifulSoup(encoded_str, "html.parser")
    a_text = news_soup.find_all('p')
    # 2 Removing: strip the tags that remain inside each paragraph.
    y=[re.sub(r'<.+?>',r'',str(a)) for a in a_text]

    # The file is closed (and flushed) before it is renamed; the original
    # renamed a handle that was still open.
    with open('test.txt', 'w') as file1:
        for item in y:
            file1.write("%s\n" % item)
    os.rename("test.txt",name)

# Computes the cosine similarity between the query vector and a document vector.
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two term -> count mappings (0.0 if either is empty)."""
    shared_terms = set(vec1.keys()) & set(vec2.keys())
    dot_product = 0
    for term in shared_terms:
        dot_product += vec1[term] * vec2[term]

    norm1 = math.sqrt(sum([count ** 2 for count in vec1.values()]))
    norm2 = math.sqrt(sum([count ** 2 for count in vec2.values()]))
    denominator = norm1 * norm2

    if denominator:
        return float(dot_product) / denominator
    return 0.0

def text_to_vector(text):
    """Return a Counter of the word tokens found in ``text`` by the WORD pattern."""
    return Counter(WORD.findall(text))

# Pattern for one token: a run of word characters.
WORD = re.compile(r'\w+')


# NOTE(review): this cell is written for Python 2 (print statements,
# raw_input) and repeats the logic of the earlier crawl/cosine cell.

#TASK -1
# The URL of the Wikipedia page of IIT Delhi is taken.
url = "https://en.wikipedia.org/wiki/Indian_Institute_of_Technology_Delhi"
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'lxml')
links = []

# A list named 'links' is created; it stores the href of every anchor on the
# page whose target starts with "http://".
for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
    links.append(link.get('href'))
print('\n')
print('The hyperlinks found on the wikiperia page of IIT Delhi: ')
print(links)
print('\n')

#TASK -2
# All the hyperlinks are opened one by one and the content is stored in a text file.
# NOTE(review): exactly 50 file names are generated, so more than 50 links
# overruns name[x] and fewer than 50 leaves names without a freshly written file.
name=[]
for i in range(50):
    s = "file"+ str(i) +".txt" 
    name.append(s)
x=0
for i in links:
    func(i,name[x])
    x=x+1

# The stop words are removed and the content of each text file is tokenized.
stop_words = set(stopwords.words('english'))
filtered_sentence = []

main =[]
i=0
for item in name:
    file1 = codecs.open(item, encoding='utf-8')
    word_tokens = word_tokenize(file1.read())
    for w in word_tokens:
        if w not in stop_words:
            # NOTE(review): s is never reset. It still holds the last generated
            # file name when this loop starts and keeps growing across files,
            # so every entry of main also contains the text of all earlier files.
            s = s + " "+w
    main.append(s)

#TASK-3 
# Vectorization of the documents and their terms takes place.
vectorizer = CountVectorizer()
p = vectorizer.fit_transform(main)
print('The matrix after vectorization of the documents :')
print('\n')
print(p.toarray())
print('\n')

#Task-4
# The query is taken from the user and the cosine similarity between the
# query and every document is calculated.
print('Enter a query: ')

all_cos=[]
rank=[]
text1 = raw_input()
vector1 = text_to_vector(text1)
for i in range(50):
    text2 = codecs.open(name[i], encoding='ISO-8859-1').read()
    vector2 = text_to_vector(text2)
    cosine = get_cosine(vector1, vector2)
    all_cos.append(cosine)
    rank.append(cosine)
    print 'Cosine:', i, cosine

rank.sort(reverse=True)

#TASK-5
# The rank of the documents based on similarity and the URLs of the top 10
# documents are displayed.
print('Rank of documents based on similarity is as follows:')
print('\n')
for i in range(50):
    print 'Rank:', i,': ', rank[i]

j = 1
while j < 11:
    maxpos= all_cos.index(max(all_cos)) 
    s = all_cos[maxpos]
    print('\n')
    print 'Document ',j
    print'Value of similarity :',s
    print 'URL for that page is :',links[maxpos]
    print('\n')
    # NOTE(review): removing the score shortens all_cos, so in later
    # iterations maxpos no longer lines up with the positions in links.
    all_cos.remove(s)
    j += 1
    



In [None]:
"""import pprint
import nest_asyncio
nest_asyncio.apply()

from search_engine_parser.core.engines.yahoo import Search as YahooSearch
search_args = ('preaching to the choir', 1)
ysearch = YahooSearch()
yresults = ysearch.search(*search_args)
  # print 10th link from yahoo search
print(yresults["links"][4])"""





# googlesearch is optional; without it the search below is skipped instead of
# failing later with a NameError on `search`.
try:
    from googlesearch import search
except ImportError:
    search = None
    print("No module named 'google' found")

# to search
query = "phishing"
tab = []
if search is not None:
    for j in search(query, tld="com", num=2, stop=2, pause=10):
        tab.append(j)
    print(tab)
    # Store one result URL per line.
    with open('tes.txt', 'w') as f:
        for item in tab:
            f.write("%s\n" % item)



 





In [None]:
import requests
from bs4 import BeautifulSoup

def get_links_recursive(base, path, visited, max_depth=3, depth=0):
    """Print the links reachable from ``base`` + ``path``, depth first.

    Args:
        base: URL that ``path`` is resolved against.
        path: Relative or absolute link to fetch; "" fetches ``base`` itself.
        visited: Set of hrefs already reported; updated in place.
        max_depth: Pages at this depth or deeper are not fetched.
        depth: Depth of the current page (0 for the start page).
    """
    from urllib.parse import urljoin

    if depth >= max_depth:
        return

    # urljoin resolves relative links against the page they were found on
    # and returns absolute links unchanged; plain string concatenation
    # produced URLs such as "http://toscrape.compage.html".
    page_url = urljoin(base, path)
    try:
        response = requests.get(page_url)
    except requests.RequestException:
        # Best effort: pages that cannot be fetched are skipped.
        return

    soup = BeautifulSoup(response.text, "html.parser")
    for link in soup.find_all("a"):
        href = link.get("href")

        # Anchors without an href used to raise inside a bare except, which
        # silently dropped every remaining link of the page.
        if href is None or href in visited:
            continue
        visited.add(href)
        print(f"at depth {depth}: {href}")
        get_links_recursive(page_url, href, visited, max_depth, depth + 1)


# Start at the site root with the root already marked as visited.
get_links_recursive("http://toscrape.com", "", set(["http://toscrape.com"]))








In [None]:
import requests
from bs4 import BeautifulSoup
from collections import deque

from urllib.parse import urljoin

# Breadth-first variant of the link crawl: the queue holds
# [base, path, depth] entries and `visited` the hrefs already reported.
visited = set(["http://toscrape.com"])
dq = deque([["http://toscrape.com", "", 0]])
max_depth = 3

while dq:
    base, path, depth = dq.popleft()
    #                         ^^^^ removing "left" makes this a DFS (stack)

    if depth >= max_depth:
        continue

    # urljoin resolves relative links against the page they were found on
    # and returns absolute links unchanged; plain string concatenation
    # produced URLs such as "http://toscrape.compage.html".
    page_url = urljoin(base, path)
    try:
        response = requests.get(page_url)
    except requests.RequestException:
        # Best effort: pages that cannot be fetched are skipped.
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    for link in soup.find_all("a"):
        href = link.get("href")

        # Anchors without an href used to raise inside a bare except, which
        # silently dropped every remaining link of the page.
        if href is None or href in visited:
            continue
        visited.add(href)
        print("  " * depth + f"at depth {depth}: {href}")
        dq.append([page_url, href, depth + 1])

In [23]:
from collections import deque
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.request import urlopen
import sys
import urllib
import urllib.robotparser
# Seed page of the crawl (hard-coded here).
url = "https://www.sharecare.com/health/wellness-healthy-living/what-is-health-and-wellness"

banner = "==================="
print (banner)
print ("Page to be crawled:", url)
print (banner)
print()

def robot_parser(url):
    """Return 1 when robots.txt allows any user agent ("*") to fetch ``url``, else 0."""
    from urllib.parse import urljoin

    rp = urllib.robotparser.RobotFileParser()
    # robots.txt is looked up at the root of the site; appending it to the
    # page url asked for e.g. ".../health/allergy/robots.txt".
    rp.set_url(urljoin(url, '/robots.txt'))
    rp.read()
    return 1 if rp.can_fetch("*", url) else 0



# Create queue: crawl() appends (url, position) tuples to it.
queue = []

# Maintains list of visited pages: crawl() appends (url, len(queue)) tuples.
visited_list = []


# Crawl the page and populate the queue with newly found URLs
# Crawl the page and populate the queue with newly found URLs
def crawl(url):
    """Queue the links found on ``url`` until the queue holds more than 5 entries.

    ``url`` is recorded in the global ``visited_list``. Every link that is
    not queued yet, not visited yet and allowed by ``robot_parser`` is
    printed and appended to the global ``queue`` as ``(url, position)``.
    When the page is exhausted before the queue is full, the smallest queue
    entry is removed and crawled next.
    """
    visited_list.append((url, len(queue)))
    if len(queue) > 5:
        return

    with urlopen(url) as urlf:
        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed.
        soup = BeautifulSoup(urlf.read(), "html.parser")

    for anchor in soup.findAll("a", href=True):
        # Complete relative URLs and strip trailing slash
        complete_url = urljoin(url, anchor["href"]).rstrip('/')

        # Queue and visited entries are (url, position) tuples, so only their
        # url part is compared; comparing a whole tuple with the url string
        # never matched.
        if any(queued_url == complete_url for queued_url, _ in queue):
            continue
        if len(queue) > 5:
            return
        already_visited = any(seen_url == complete_url for seen_url, _ in visited_list)
        if not already_visited and robot_parser(complete_url) == 1:
            print(complete_url)
            queue.append((complete_url, len(queue)))

    if not queue:
        # Nothing left to crawl.
        return
    # Take the smallest queue entry and crawl its url (not the tuple).
    queue.sort()
    current_url, _ = queue.pop(0)
    # Recursive call to crawl until the queue holds more than 5 URLs
    crawl(current_url)

# Crawl from the seed page, then show the (url, position) tuples that were queued.
crawl(url)
print(queue)





Page to be crawled: https://www.sharecare.com/health/wellness-healthy-living/what-is-health-and-wellness

https://www.sharecare.com
https://www.sharecare.com/health/allergy
https://www.sharecare.com/health/cancer
https://www.sharecare.com/covid19
https://www.sharecare.com/health/type-2-diabetes
https://www.sharecare.com/health/heart-disease
[('https://www.sharecare.com', 0), ('https://www.sharecare.com/health/allergy', 1), ('https://www.sharecare.com/health/cancer', 2), ('https://www.sharecare.com/covid19', 3), ('https://www.sharecare.com/health/type-2-diabetes', 4), ('https://www.sharecare.com/health/heart-disease', 5)]


In [2]:
# Small dictionary lookup demo.
tel = dict(jack=4098, sape=4139)
print(tel['sape'])

4139


In [16]:
# Sort (priority, name) tuples in descending order and pop the last one,
# i.e. the entry with the lowest priority value.
customers = [(2, "Harry"), (3, "charles"), (1, "riya")]
customers.sort(reverse=True)
lowest = customers.pop()
print( lowest[1])
#i = 5 + tup()[0]

riya


In [20]:
import urllib.robotparser
def robot_parser(url):
    """Print 'bon' when robots.txt allows any user agent ("*") to fetch ``url``, else 'error'."""
    from urllib.parse import urljoin

    rp = urllib.robotparser.RobotFileParser()
    # robots.txt is looked up at the root of the site, so page urls are
    # handled as well as the bare site url.
    rp.set_url(urljoin(url, '/robots.txt'))
    rp.read()
    if not rp.can_fetch("*", url):
        print('error')
    else :
        print('bon')
# Check the site root against its robots.txt.
robot_parser('https://www.sharecare.com')

bon
