In [73]:
import wikipedia as wiki
from collections import defaultdict, Counter 
import re
import heapq
import time

from sentence_transformers import SentenceTransformer
from transformers import BertForSequenceClassification, BertTokenizer
import torch
import numpy as np

from tqdm import tqdm
import urllib

from requests_html import HTMLSession
import requests
from bs4 import BeautifulSoup
import html_text

import nltk
from os.path import isfile, join

from sklearn.metrics.pairwise import cosine_similarity

import networkx as nx

from transformers import pipeline
from nltk.tokenize import word_tokenize

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords 

import pickle

In [61]:
# Removes extraneous s from the end of a title
def clean_title(title):
    return title[:-1].lower() if title[-1] == 's' else title.lower()

In [166]:
TITLE = clean_title("computer architecture")
SUBSECTIONS = 4
RELATED_TITLES = wiki.search(TITLE)

In [167]:
RELATED_TITLES

['Computer architecture',
 'Word (computer architecture)',
 'Multithreading (computer architecture)',
 'Hazard (computer architecture)',
 'Von Neumann architecture',
 'Predication (computer architecture)',
 'Computer',
 'Microarchitecture',
 'Computer science',
 'Computer architecture simulator']

## Dynamic subheading generation

In [190]:
def NounExtractor(text):
    nouns = []
    result = []
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        words = [word for word in words if word not in set(stopwords.words('english'))]
        tagged = nltk.pos_tag(words)
        cleaned_sentence = []
        for (word, tag) in tagged:
            if tag != 'NN' and tag != 'NNP' and tag != 'NNS' and tag != 'NNPS': # If the word is a proper noun  
                if word.isalnum():
                    cleaned_sentence.append(word)
            else:
                if word.isalnum() and len(word) > 2:
                    nouns.append(word)
        result.append(' '.join(cleaned_sentence))
    return nouns, ' '.join(result)

In [191]:
with open('clustering_subtopics/clusters.pickle', 'rb') as handle:
    clusters = pickle.load(handle)
clustered_subtopics = list(clusters.values())
clustered_subtopics_flat = [j for sub in clustered_subtopics for j in sub]
clustered_subtopics_flat_clean = []
for word in clustered_subtopics_flat:
    word = word.lower()
    word = re.sub(r'_', ' ', word)
    clustered_subtopics_flat_clean.append(word)
clustered_subtopics_flat_clean = set(clustered_subtopics_flat_clean)

In [192]:
# Get filtered or raw (dependent on raw argument) sub sections for an article's content
def get_subsections(data, raw=False):
    subsections = re.findall('\n== ([a-zA-z ]+) ==', data)
    if raw:
        return [subsection.lower() for subsection in subsections]
    
    subsections = [clean_title(subsection) for subsection in subsections]
    blacklisted_articles = ["reference", "see also", "external link", "note", "further reading"]
    subsections = [subsection.lower() for subsection in subsections if subsection not in blacklisted_articles]
    return subsections

In [198]:
# extracts subsections and content from related pages (cleans formatting)
def get_important_subsections_and_content(related_titles):
    topics = []
    related_paper_section_content = defaultdict(list)
    for related_title in tqdm(related_titles):
        # Get a WikipediaPage for every string title
        try:
            related_page = wiki.WikipediaPage(title=related_title)
        except wiki.DisambiguationError as e:
            continue

        content = (related_page.content).lower()
        subsections = get_subsections(content, raw=True)
        topics.extend(get_subsections(content))
        delimiters = ''
        for subsection in subsections:
            delimiters += '== ' + str(subsection) + ' ==|'
        delimiters = delimiters[:-1]
        words = re.split(delimiters, content)
        words = [word.replace('\n', '') for word in words]
        words = str(words[0])
        nouns, words = NounExtractor(words)
        important_nouns.extend(nouns)
        related_paper_section_content['intro'].append(str(words[0]))
        for i, subsection in enumerate(subsections):
            related_paper_section_content[subsection].append(str(words[i+1]))

    
    topics_set = set(topics).intersection(clustered_subtopics_flat_clean)
    common_subsections = Counter(list(topics_set))
    important_subsections = heapq.nlargest(SUBSECTIONS, common_subsections, key=common_subsections.__getitem__)
    return important_subsections, related_paper_section_content

In [199]:
# run related_paper_section_content separately <- makes it faster
important_nouns = []
important_subsections, related_paper_section_content = get_important_subsections_and_content(RELATED_TITLES)
# important_subsections.insert(0, 'intro')
print("The subheadings are: ", important_subsections)

100%|███████████████████████████████████████████| 10/10 [00:08<00:00,  1.17it/s]

The subheadings are:  ['hardware', 'overview', 'philosophy', 'background']





In [200]:
#important_subsections = ['history', 'source', 'type', 'overview', 'intoduction', 'examples', 'applications', 'syntax']

In [201]:
# Use the clusters to prevent this step from taking subtopics which are in the same "cluster" so they will be
# quite different

In [203]:
nouns_counter = Counter(important_nouns)
words_related_to_topic = heapq.nlargest(10, nouns_counter, key=nouns_counter.__getitem__)

In [204]:
words_related_to_topic

['computer',
 'word',
 'architecture',
 'computers',
 'instruction',
 'design',
 'data',
 'devices',
 'systems',
 'unit']

## Subheading encoding

In [180]:
start = time.time()
model = SentenceTransformer('sentence-transformers/all-roberta-large-v1')
print("Model initializaiton: ", time.time() - start)

Model initializaiton:  5.173375844955444


In [181]:
topic_embeddings = defaultdict(list)
# Creates word embeddings for subsection headings
features = []
for subsections in tqdm(important_subsections):
    paras = related_paper_section_content[subsections]
    topic_emb = np.average(model.encode(paras), 0)
    features.append(list(topic_emb))
features_tensor = torch.tensor(features)
print("Subheading embeddings generated")

100%|█████████████████████████████████████████████| 4/4 [00:00<00:00,  9.42it/s]

Subheading embeddings generated





In [220]:
features_tensor.shape

torch.Size([4, 1024])

## Format of output data

In [182]:
# {
#     website_1: { intro: [para1, para7] -> summarize into 1-2 sentences
#                  history: [...],   
#                  ...
#     }
#     website_2: ...
#     .
#     .
#     .
# }



# {
#     intro: { website_1 : []
        
#     }
    
    
# }

## Web scraping

In [183]:
def get_source(url):
    try:
        session = HTMLSession()
        response = session.get(url)
        return response
    except requests.exceptions.RequestException as e:
        print(e)
        
def google_search(query):
    query = urllib.parse.quote_plus(query)
    response = get_source("https://www.google.com/search?q=" + query)
    links = list(set(response.html.absolute_links))
    # Get rid of these from the domains that are used
    google_domains = ('https://www.google.'
                      'https://google.',
                      'https://www.google.com/search?',
                      'https://webcache.googleusercontent.', 
                      'http://webcache.googleusercontent.', 
                      'https://policies.google.',
                      'https://support.google.',
                      'https://maps.google.',
                      'https://www.coursera.org',
                      'https://www.youtube.com',
                     'https://online.umich.edu/',
                      'https://docs.oracle.com/',
                      'https://www.cise.ufl.edu/~mssz/CompOrg/CDA-lang.html',
                      'https://study.com/academy',
                      'https://www.redhat.com',
                      'https://www.oreilly.com',
                      'https://scholar.google.com',
                      'https://machinelearningknowledge',
                      'https://interestingengineering.com',
                      'https://www.nature.com/',
                      'https://machinelearningmastery.com',
                      'https://www.thelancet.com/',
                      'https://m.youtube.com',
                      'https://www.mathworks.com',
                      'https://www.deeplearningbook',
                      'https://u.today',
                      'https://docplayer.net',
                      'https://translate.google.com',
                      'http://51.91.248.81',
                      'https://twitter'
                     )
    
    for url in links[:]:
        url_check = url.split('#')[0]
        if url_check in urls_visited or url.startswith(google_domains):
            links.remove(url)
        if url_check not in urls_visited:
            urls_visited.add(url_check)
        if url[-3:] == 'pdf':
            links.remove(url)
    return links

# deprecated
def collect_data_from_url(results):
    data = []
    for url in results:
        print(url)
        # Specially collect data from wikipedia
        if url.startswith('http://en.wikipedia.org/wiki/') or url.startswith('https://en.wikipedia.org/wiki/'):
            search_term = url.replace('http://en.wikipedia.org/wiki/', '').replace('https://en.wikipedia.org/wiki/', '').replace('_', ' ').replace('%E2%80%93', '-').replace('%27', "'")
            sentences = wiki.WikipediaPage(title=search_term).content
            text_info = ''
            for sent in sentences.split('.'):
                if sent == '' or len(sent) > 500 or len(sent) < 10:
                    continue
                sent_emb = torch.from_numpy(model.encode(sent))
                if float(sent_emb @ topic_emb) < 0.3:
                    continue
                text_info += (sent + '. ')

            item = {
                'title': search_term,
                'link': url,
                'text': text_info,
                'emb': model.encode(text_info)
            }
            data.append(item) 
        else:
            try: 
                page = requests.get(url, timeout=(5, 10))
            except requests.exceptions.Timeout as err: 
                #print("here")
                continue
            #print(page)
            soup = BeautifulSoup(page.content, "html.parser",from_encoding="iso-8859-1")
            p = soup.find_all('p')
            paragraphs = []
            for x in p:
                paragraphs.append(str(x))
            if len(paragraphs) == 0:
                continue
            text_info = ''
            for para in paragraphs:
                if para == '':
                    continue
                sentences = html_text.extract_text(para, guess_layout=False)
                for sent in sentences.split('.'):
                    if sent == '' or len(sent) > 500 or len(sent) < 10:
                        continue
                    sent_emb = torch.from_numpy(model.encode(sent))
                    if float(sent_emb @ topic_emb) < 0.3:
                        continue
                    text_info += (sent + '. ')
            if len(text_info) < 100 or len(text_info) > 10000: 
                continue
            item = {
                'title': "<UNK>",
                'link': url,
                'text': text_info,
                'emb': model.encode(text_info)
            }
            data.append(item)
    return data

In [184]:
urls_visited = set()
results = []
for text in tqdm(important_subsections):
    if text == 'intro':
        results.extend(google_search("what is " + TITLE))
    else:
        results.extend(google_search(TITLE + " " + text.lower()))

print("We have selected ", len(results), " webpages.")

100%|█████████████████████████████████████████████| 4/4 [00:03<00:00,  1.01it/s]

We have selected  31  webpages.





In [110]:
# headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
# raw_dataset = defaultdict(list)
# count_of_paras = 0
# # Get website urls and the paragraph tags in them
# for result in tqdm(results):
#     temp_dataset = ''
#     paragraphs = []
#     temp_cleaned_para = []
#     try:
#         page = requests.get(result, timeout=(5, 10), headers=headers)
#     except:
#         continue
#     soup = BeautifulSoup(page.content, "html.parser")
#     p = soup.find_all('p')
    
#     for x in p:
#         paragraphs.append(str(x))
#     for i, para in enumerate(paragraphs):
#         if para != '':
#             temp_cleaned_para.append(html_text.extract_text(para, guess_layout=False))

#     for i, para in enumerate(temp_cleaned_para):
#         if len(nltk.word_tokenize(para)) > 30 and len(nltk.word_tokenize(para)) < 150:
#             para = re.sub('[\[].*?[\]]', '', para)
#             raw_dataset[result].append(para)
#             count_of_paras += 1
# print("We have scraped ", count_of_paras, " paragraphs.")

100%|███████████████████████████████████████████| 30/30 [00:17<00:00,  1.69it/s]

We have scraped  409  paragraphs.





In [185]:
def run(result):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    raw_dataset = defaultdict(list)
    temp_dataset = ''
    paragraphs = []
    temp_cleaned_para = []
    try:
        page = requests.get(result, timeout=(5, 10), headers=headers)
    except:
        print("sad")
    soup = BeautifulSoup(page.content, "html.parser")
    p = soup.find_all('p')

    for x in p:
        paragraphs.append(str(x))
    for i, para in enumerate(paragraphs):
        if para != '':
            temp_cleaned_para.append(html_text.extract_text(para, guess_layout=False))

    for i, para in enumerate(temp_cleaned_para):
        if len(nltk.word_tokenize(para)) > 30 and len(nltk.word_tokenize(para)) < 150:
            para = re.sub('[\[].*?[\]]', '', para)
            raw_dataset[result].append(para)
    return raw_dataset
                

In [186]:
import time
from joblib import Parallel, delayed
start = time.time()
results = Parallel(n_jobs=10)(delayed(run)(result) for result in results)
print(time.time() - start)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [208]:
raw_dataset = defaultdict(list)
for dic in results:
    raw_dataset.update(dic)

In [209]:
len(raw_dataset)

20

In [223]:
raw_dataset

defaultdict(list,
            {'https://eng.libretexts.org/Bookshelves/Computer_Science/Programming_Languages/Book%3A_Python_for_Everybody_(Severance)/01%3A_Introduction/1.03%3A_Computer_Hardware_Architecture': ['Before we start learning the language we speak to give instructions to computers to develop software, we need to learn a small amount about how computers are built. If you were to take apart your computer or cell phone and look deep inside, you would find the following parts:',
              'While most of the detail of how these components work is best left to computer builders, it helps to have some terminology so we can talk about these different parts as we write our programs.',
              'As a programmer, your job is to use and orchestrate each of these resources to solve the problem that you need to solve and analyze the data you get from the solution. As a programmer you will mostly be "talking" to the CPU and telling it what to do next. Sometimes you will tell the 

## Subheading classification and dataset generation

In [229]:
topic_subheading_dataset = defaultdict(lambda: defaultdict(list))
for webpage, data in tqdm(raw_dataset.items()):
    data_noun, data_for_model = [], []
    for d in data:
        output = NounExtractor(d)
        data_noun.append(output[0])
        data_for_model.append(output[1])
    paragraph_embedding = torch.tensor(model.encode(data_for_model))
    labels = torch.argmax((features_tensor @ paragraph_embedding.T), axis=0)
    i = 0
    for paragraph, label in zip(data, labels):
        if len(set(words_related_to_topic).intersection(set(data_noun[i]))) > 2:
            if len(topic_subheading_dataset[important_subsections[label]]) > 5:
                continue
            topic_subheading_dataset[important_subsections[label]][webpage].append(paragraph)
        i += 1
    else:
        continue

100%|███████████████████████████████████████████| 20/20 [00:22<00:00,  1.13s/it]


In [230]:
for subheading, websites in topic_subheading_dataset.items():
    print(subheading, ': ', len(websites))

philosophy :  6
hardware :  6
overview :  5
background :  6


## Summarization

In [232]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)
summarizer = pipeline('summarization', model="sshleifer/distilbart-cnn-12-6" )
import spacy
nlp = spacy.load('en_core_web_sm')
import language_tool_python

In [233]:
summarized_dataset = defaultdict(lambda: defaultdict(str))

In [234]:
for subheading, websites in tqdm(topic_subheading_dataset.items()):
    for url, paragraphs in (websites.items()):
        data = ''.join(paragraphs)
        if len(tokenizer([data])['input_ids'][0]) > 1023:
                count = 0
                data_nlp = nlp(data)
                sentences = list(data_nlp.sents)
                #print(sentences)
                data = ""
                for sentence in sentences:
                    sentence = str(sentence)
                    #print(type(sentence))
                    count += (2 + len(word_tokenize(sentence)))
                    if count < 924:
                        data += sentence
        summary_text = summarizer(data, max_length=len(word_tokenize(data))//2\
                                  , min_length = len(word_tokenize(data))//4)[0]['summary_text']
#         tool = language_tool_python.LanguageTool('en-US') 
#         summary_text = tool.correct(summary_text)
#         tool.close()
        summarized_dataset[subheading][url] = summary_text

  0%|                                                     | 0/4 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (763 > 512). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████| 4/4 [01:02<00:00, 15.52s/it]


In [235]:
summarized_dataset

defaultdict(<function __main__.<lambda>()>,
            {'philosophy': defaultdict(str,
                         {'http://www2.latech.edu/~choi/Bens/Teaching/Csc364/index.htm': ' Topics include processor, control unit and microprogramming, computer arithmetic',
                          'https://en.wikipedia.org/wiki/Computer_architecture': " The first documented computer architecture was in the correspondence between Charles Babbage and Ada Lovelace, describing the analytical engine . Konrad Zuse described in two patent applications for his future projects that machine instructions could be stored in the same storage used for data, i.e., the stored-program concept . The term “architecture” in computer literature can be traced to the work of Lyle R. Johnson and Frederick P. Brooks, Jr., members of the Machine Organization department in IBM's main research center in 1959 . Later, computer users came to use the term in many less explicit ways . Computers do not understand high-level prog

In [None]:
# 'personal life': defaultdict(str,
#                          {'https://www.beursschouwburg.be/en/events/we-object/we-object-with-joelle-sambi-nzeba-babs-gons/': 
#                           " 'JOËLLE SAMBI NZEBA was born in Belgium, but spent part of her childhood in 
#                           Kinshasa before returning to Brussels where she now lives and works. Alongside 
#                           her professional activities which are carried out within the context of a feminist 
#                           movement, she is a writer. She graduated from the Université Libre de Bruxelles with a 
#                           degree in information and communication (journalism) and is the author of several 
#                           prize-winning works of fiction (Je ne sais pas rêver, 2002 and Le monde est gueule 
#                           de chèvre, 2007). Through her activism (Merhaba, Festival Massimadi Bruxelles) and 
#                           writing, Joëlle Sambi Nzeba tries to question situations of powerlessness. She gets 
#                           people talking about identity, the norm and belonging.'",
                          
#                           'https://bela.be/auteur/joelle-sambi': 
#                           " Noelle Samba is co-présidente de l'Euro Central Asian Lesbian Community. She is also a member of the Belgian Network For Black Lives. Samba dissolve de provenance et travail d’scripture, LE Congo, son history et la Belgium contemporize sent enjoins presents en filigree.",
                          
#                           'https://www.laicite.be/magazine-article/joelle-sambi-nzeba-portrait-pluriel/': 
#                           ' Un true entire LES Chou de Belles à la sauce cocktail days un bar poussiéreux, parfait applique de son cousin kinds, at Mating. Un non-sens quo. C’est CA la Belgium en moi. Noelle Samba Zebra.',


In [None]:
{'overview': defaultdict(str,
                         {'https://online.princeton.edu/computer-architecture': ' Building on a computer organization base, this course explores techniques that go into designing a modern microprocessor. Fundamental understanding of computer architecture is key not only for students interested in hardware and processor design. This course will explore how the computer architect can utilize the increasing number of transistors available to improve the performance of a processor. Focus will be given to architectures that can exploit different forms of parallelism, whether they be implicit or explicit.',
                          'https://www.educba.com/types-of-computer-architecture/': ' Each memory has multiple locations and each location has a unique address. We can address the contents of memory by its location irrespective of what type of data and instructions are present in the memory. Microarchitecture performs in a certain way. It reads the instruction and decodes it, will find parallel data to process the instruction. It is used in microprocessors, microcontrollers.',
                          'https://geteducationskills.com/computer-architecture/': ' This chapter provides a first examination of the principal forms of supercomputer architecture and the underlying concepts that govern their performance. It is here, at the structural and logical levels, that parallelism of operation in its many forms and size is first presented. This chapter introduces the basic foundations of computer architecture in general and for high-performance computer systems in particular. The chapter provides an overview of all computer cores, from those few in the smallest mobile phones to potentially millions making up the world’',
                          'https://en.wikipedia.org/wiki/Computer_architecture': ' Computer architecture is concerned with balancing the performance, efficiency, cost, and reliability of a computer system. Longer and more complex instructions take longer for the processor to decode and can be more costly to implement effectively. Memory organization defines how instructions interact with the memory, and how memory interacts with itself. Computers that control machinery usually need low interrupt latencies. Multimedia projects may need very rapid data access, while virtual machines may need fast interrupts.',
                          'https://learn.saylor.org/course/CS301': ' In this unit, we will discuss various components of MIPS processor architecture. This unit will ask you to apply the information you learned in units 2, 3, and 4 to create a simple processor architecture. We will also discuss a technique known as pipe lining, which is used to improve processor performance. The unit will conclude with a look at some programming techniques used in the context of parallel machines.'}),
'source': defaultdict(str,
                         {'https://online.princeton.edu/computer-architecture': ' "Thank you for making this excellent course available! It was very insightful, the explanations were great -- it really helped to understand a lot of the behind-the-scenes magic that I\'ve been taking for granted in 20 years as a software engineer. Thanks a lot!" says the author of the book "thank you a lot" and the course was "very insightful"',
                          'https://www.educba.com/types-of-computer-architecture/': ' The name defines itself, the design will satisfy user requirements such as architecture, module, interfaces and data for a system, and it is connected to product development. Modular systems are made by standardizing hardware and software. It is the process of taking marketing information and creating product design to be manufacture. Modular design is a process of standardizing software and hardware to meet user requirements.',
                          'https://geteducationskills.com/computer-architecture/': " Designing a computer is about designing a machine that holds and manipulates data. This book is about how to make specialized brains. It's all about processing information, processing information. Designing computers is about making computers that hold and manipulate data, rather than computers that run software and run software. The book is published by Simon Tisdale, a British-based publisher, and is available on Amazon.com.",
                          'https://learn.saylor.org/course/CS301': ' We will begin this unit with an overview of digital components, identifying the building blocks of digital logic. We will build on that foundation by writing truth tables and learning about more complicated sequential digital systems with memory. This unit serves as background information for the processor design techniques we learn in later units. To receive a free Course Completion Certificate, you will need a grade of 70% or higher on this final exam.'}),
'type': defaultdict(str,
                         {'https://online.princeton.edu/computer-architecture': ' "I am a VLSI Design professional working in the field of CPU/SoC architecture and Design. This course helped me to reinforce the basics and also to find more interesting topics to explore and research. The course content was very good covering the essential concepts," says one of the course\'s students. The course was very successful, says the author of the book.',
                          'https://www.educba.com/types-of-computer-architecture/': " Data and instructions are stored in a single read/write memory within the computer system. Harvard's architecture is used when data and code is present in different memory blocks. A separate memory block is needed for data and instruction. Data can be accessed by one memory location and instruction can be. Accessed by a different. Location in different types of computer architecture in different computer architectures.",
                          'https://www.codecademy.com/learn/computer-architecture': " In this course, you’ll learn about what the main physical components of a computer are, why 0 and 1 are such important numbers within computing, how instruction set architecture (ISA) establishes communication between the hardware and software components. The course will teach you computer architecture with a combination of lessons, articles, quizzes, problem sets, and projects. At the end of the course you'll be prompted to create your own CPU simulator in Python.",
                          'https://geteducationskills.com/computer-architecture/': ' Computer engineering is a science or a set of rules stating how brain software and hardware are joined together and interact to make a computer work. It not only determines how the brain works but also of which technologies the computer is capable. The best programs for aspiring computer architects are computer-based fields because they offer students the most hands-on experience in database design or network security.',
                          'https://en.wikipedia.org/wiki/Computer_architecture': ' Computers do not understand high-level programming languages such as Java, C++, or most programming languages used. A processor only understands instructions encoded in some numerical fashion, usually as binary numbers. Software tools, such as compilers, translate those high level languages into instructions that the processor can understand. The ISA defines items in the computer that are available to a program.',
                          'https://learn.saylor.org/course/CS301': ' The purpose of this course is to cultivate an understanding of modern computing technology through an in-depth study of the interface between hardware and software. The course will conclude with a look at the recent switch from sequential processing to parallel processing by looking at the parallel computing models and their programming implications. You will learn about modern computer architecture and the Von Neumann architecture, pipe lining, memory management, storage, and other input/output topics.'}),
'history': defaultdict(str,
                         {'https://www.educba.com/types-of-computer-architecture/': ' Computer architecture consists of rules and methods or procedures which describe the implementation, functionality of the computer systems. Architecture is built as per the user’s needs by taking care of the economic and financial constraints. The computer system has the processor, memory, I/O devices and communication channels that connect to it. It has digital signal processors that will execute small or highly audio or video algorithms, and it is reproducible.',
                          'https://www.codecademy.com/learn/computer-architecture': ' Create a simple calculator application in Python that uses a 32-bit Instruction Set Architecture that the student designs to read and execute binary instructions. It simulates the basic CPU function in the computer hierarchy. Students can go in knowing zero, nothing, and just get a grasp on everything as you go and start building right away. I know from first-hand experience that you can go into knowing zero.',
                          'https://geteducationskills.com/computer-architecture/': ' Computer Architecture: In computer manufacturing, computer engineering is a set of rules and methods that describe the functionality, organization, and utilization of computer systems. Computer architects are expected to see an employment growth of 6% between 2016 and 2026 as reported by the U.S. Bureau of Labor Statistics. While cloud computing has decreased the need for computer architects somewhat, they will continue to be in demand as businesses continue to increase their technology needs.',
                          'https://en.wikipedia.org/wiki/Computer_architecture': ' In computer engineering, computer architecture is a set of rules and methods that describe the functionality, organization, and implementation of computer systems. The first documented computer architecture was in correspondence between Charles Babbage and Ada Lovelace, describing the analytical engine. Konrad Zuse described in two patent applications for his future projects that machine instructions could be stored in the same storage used for data, i.e., the stored-program concept.',
                          'https://learn.saylor.org/course/CS301': ' In this unit, we will discuss some advances in technology that led to the development of modern computers. We will discuss the importance of computing power and how it motivated the switch from a single-core to a multicore processor. In previous units, you learned about how computer memory stores information, how numbers are represented in a computer memory word (typically, 32 or 64 bits) We will also discuss the designs of adders, multipliers, and dividers.'})})