<a href="https://colab.research.google.com/github/Karthick47v2/mcq-generator/blob/main/summarize-extract-genQ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip3 install transformers==4.15.0    # only this ver works with FastT5
!pip3 install SentencePiece
!pip3 install git+https://github.com/boudinfl/pke.git
!pip3 install fastt5

!pip install keybert
!pip3 install keyphrase-vectorizers
############################################################################################

In [None]:
## IMPORT LIBS
# natural language toolkit for helping utilities
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# text preprocessing
import string
import re

# meaningful keyword extraction
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer

# onnx model inference
from fastT5 import get_onnx_model, get_onnx_runtime_sessions, OnnxT5 

# working with transformers
from transformers import AutoTokenizer

# helper
import numpy as np

# file dir helper
from pathlib import Path
import os

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive; the model
# directories referenced later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')
############################################################################################

In [None]:
## INITIALIZE ALL REQUIRED MODELS
# Initialize the keyword extraction model (KeyBERT) and the keyphrase
# vectorizer that is handed to it when extracting meaningful keywords.
kw_model = KeyBERT()
vectorizer = KeyphraseCountVectorizer()

# Initialize the summarization model.
# The three ONNX files (encoder, decoder, init-decoder) are looked up inside
# `model_path` under the names "<model_name>-<part>-quantized.onnx".
model_path = '/content/gdrive/MyDrive/mcq-gen/t5-summarize/t5-base'
model_name = "t5-base"
encoder_path = os.path.join(model_path, f"{model_name}-encoder-quantized.onnx")
decoder_path = os.path.join(model_path, f"{model_name}-decoder-quantized.onnx")
init_decoder_path = os.path.join(model_path, f"{model_name}-init-decoder-quantized.onnx")

# The paths are passed as one tuple in (encoder, decoder, init-decoder) order.
model_sessions = get_onnx_runtime_sessions((encoder_path,decoder_path,init_decoder_path))
sum_model = OnnxT5(model_path, model_sessions)
# The tokenizer is loaded from the same directory as the model files.
sum_tokenizer = AutoTokenizer.from_pretrained(model_path)

# Initialize the question generation model.
# NOTE: model_path, model_name, the *_path variables and model_sessions are
# rebound below, so after this cell they describe the question model only.
model_path = '/content/gdrive/MyDrive/mcq-gen/t5-question'
model_name = 't5_squad_v1'
encoder_path = os.path.join(model_path, f"{model_name}-encoder-quantized.onnx")
decoder_path = os.path.join(model_path, f"{model_name}-decoder-quantized.onnx")
init_decoder_path = os.path.join(model_path, f"{model_name}-init-decoder-quantized.onnx")

# Same (encoder, decoder, init-decoder) order as for the summarizer above.
model_sessions = get_onnx_runtime_sessions((encoder_path,decoder_path,init_decoder_path))
q_model = OnnxT5(model_path, model_sessions)
q_tokenizer = AutoTokenizer.from_pretrained(model_path)

In [9]:
## HELPER FUNCTIONS FOR SUMMARIZATION
# Clean raw input text before it is split and summarized.
def preprocess_bulk_text(text):
  """Normalize dashes, blank out unsupported characters and squeeze spaces.

  Steps, in order:
    1. Strip leading/trailing whitespace.
    2. Replace the Unicode dash characters U+2010..U+2013 with '-'.
    3. Replace every character that is not an ASCII letter, a digit or one
       of  . , - / : ; < = > ? % & * ( )  with a single space.
    4. Collapse runs of two or more spaces into one space.

  Args:
    text: Raw input string.

  Returns:
    The cleaned string. Because stripping happens before step 3, an
    unsupported character at either end leaves one space at that end.
  """
  text = text.strip()
  text = re.sub("[\u2010-\u2013]", "-", text)
  # Raw string: the previous pattern wrote "\." inside a normal string
  # literal, which is an invalid escape sequence (a warning today, an error
  # in a future Python). Its character class also contained ",-?", which the
  # regex engine reads as the range ',' .. '?', so '/', ':', ';', '<', '='
  # and '>' were kept as well. That accepted set is spelled out explicitly
  # here, so the output is unchanged.
  text = re.sub(r"[^a-zA-Z0-9.,\-/:;<=>?%&*()]", " ", text)
  text = re.sub(" {2,}", " ", text)
  return text

# Split the bulk input text into chunks sized for the summarizer model.
def split_text(text, range=300):
  """Clean `text` and cut it into chunks that each end on a full stop.

  Args:
    text: Raw input text; it is cleaned with preprocess_bulk_text() first.
    range: Minimum chunk length in characters. Each chunk runs up to and
      including the first '.' found at index `range` or later. (The name
      shadows the built-in `range` inside this function; it is kept because
      callers may pass it by keyword.)

  Returns:
    List of chunks in document order. Every chunk except possibly the last
    is longer than `range` characters; the last chunk is whatever text is
    left over. A leftover consisting only of whitespace is omitted.
  """
  remaining = preprocess_bulk_text(text)
  chunks = []
  while len(remaining) > range:
    stop = remaining.find('.', range)
    # No full stop at or after `range`: the rest of the text is one chunk.
    cut = len(remaining) if stop == -1 else stop + 1
    chunks.append(remaining[:cut])
    # Slice the chunk off the front. str.replace() would also delete any
    # later repetition of the same chunk from the remaining text.
    remaining = remaining[cut:]
  # Keep the final, shorter piece. It used to be discarded, which meant the
  # end of every document (and all of a short document) was never processed.
  if remaining.strip():
    chunks.append(remaining)
  return chunks

# Build the tokenized summarizer input for one chunk of text.
def preprocess_splitted_text(text):
  """Tokenize `text` in the "summarize: <text>" format used by the model.

  Args:
    text: One chunk of cleaned text.

  Returns:
    Tuple (input_ids, attention_mask) taken from the tokenizer output,
    requested as PyTorch tensors (return_tensors='pt'), truncated and not
    padded.
  """
  prompt = "summarize: " + text
  encoded = sum_tokenizer.encode_plus(
      prompt,
      return_tensors='pt',
      pad_to_max_length=False,
      truncation=True,
  )
  return encoded["input_ids"], encoded["attention_mask"]

# Summarize one chunk of text with the summarizer model.
def summarize(text):
  """Return the post-processed summary of `text`.

  The text is tokenized by preprocess_splitted_text(), passed through
  sum_model.generate(), and the first returned sequence is decoded and
  handed to postprocess_summary().
  """
  ids, mask = preprocess_splitted_text(text)

  generation_options = dict(
      num_beams=3,               # number of beams used while generating
      num_return_sequences=1,    # only one output sequence is needed
      no_repeat_ngram_size=2,    # do not repeat any 2-gram
      max_length=512,            # cap on the generated length
      early_stopping=True,
  )
  generated = sum_model.generate(input_ids=ids,
                                 attention_mask=mask,
                                 **generation_options)

  # decode the first (and only) generated sequence back to text
  summary_text = sum_tokenizer.decode(generated[0],
                                      skip_special_tokens=True,
                                      clean_up_tokenization_spaces=True)
  return postprocess_summary(summary_text)

# Post-process the summarizer output into readable text.
def postprocess_summary(text):
  """Upper-case the first letter of every sentence and join with spaces.

  Args:
    text: Decoded summary text.

  Returns:
    The sentences found by sent_tokenize(), each with its first character
    upper-cased and the rest left as is, joined by single spaces. Input
    without any sentence gives "".
  """
  # Upper-case only the first character. str.capitalize() also lower-cases
  # the rest of the sentence, which mangled acronyms and proper nouns
  # (e.g. "HSS" became "Hss").
  sentences = (s[:1].upper() + s[1:] for s in sent_tokenize(text))
  # join() puts spaces between sentences only; the earlier "+=" loop also
  # left a stray leading space on the result.
  return " ".join(sentences)

In [10]:
## HELPER FUNCTIONS FOR KEYWORD EXTRACTION
# Extract keywords using KeyBERT.
def extract_keywords(text, kw_pop):
  """Return up to `kw_pop` keywords/keyphrases found in `text`.

  Args:
    text: Document to extract keywords from.
    kw_pop: Maximum number of keywords to return; forwarded to KeyBERT as
      `top_n`.

  Returns:
    List of keyword strings in the order KeyBERT returned them; the
    confidence scores are dropped.
  """
  # kw_pop used to be accepted but ignored, so KeyBERT always fell back to
  # its own default count regardless of what the caller asked for.
  scored_keywords = kw_model.extract_keywords(text,
                                              vectorizer=vectorizer,
                                              top_n=kw_pop)
  # each entry: index 0 -> keyword, index 1 -> confidence / probability
  return [entry[0] for entry in scored_keywords]

# Keep only the keywords that are found in both the original text and its
# summary (extra validation). kw_pop (default 5) is the keyword limit passed
# to each extraction, which keeps the number of generated questions small.
def filter_keywords(original, summarized, kw_pop=5):
  """Return the keywords common to `original` and `summarized`.

  Args:
    original: The source text chunk.
    summarized: The summary produced for that chunk.
    kw_pop: Keyword limit passed to extract_keywords() for each text.

  Returns:
    List of distinct keywords present in both extractions, in the order
    they were returned for `original`.
  """
  original_keywords = extract_keywords(original, kw_pop)
  summary_keywords = set(extract_keywords(summarized, kw_pop))
  # dict.fromkeys() removes duplicates while keeping first-seen order. The
  # previous list(set.intersection(...)) held the same keywords but in
  # arbitrary set order, so the question order could differ between runs.
  common = dict.fromkeys(
      kw for kw in original_keywords if kw in summary_keywords)
  return list(common)

In [11]:
## HELPER FUNCTIONS FOR QUESTION GENERATION
# Build the tokenized input for the question generation model.
def preprocess_summary(context, answer):
  """Tokenize a context/answer pair for question generation.

  The model input format is "context: <context> answer: <answer>". The
  encoding is truncated to at most 382 tokens and is not padded.

  Args:
    context: Text the question should be about (the summary).
    answer: The keyword the generated question should be answered by.

  Returns:
    Tuple (input_ids, attention_mask) taken from the tokenizer output,
    requested as PyTorch tensors (return_tensors='pt').
  """
  prompt = f"context: {context} answer: {answer}"
  encoded = q_tokenizer.encode_plus(
      prompt,
      return_tensors='pt',
      max_length=382,            # limit chosen for meaningful context-question pairs
      pad_to_max_length=False,
      truncation=True,
  )
  return encoded["input_ids"], encoded["attention_mask"]

# Generate one question from a context-answer pair.
def gen_question(context, answer):
  """Return the question generated for `answer` within `context`.

  The pair is tokenized by preprocess_summary(), passed through
  q_model.generate(), and the first returned sequence is decoded. Every
  "question: " marker is removed from the decoded text and surrounding
  whitespace is stripped.
  """
  ids, mask = preprocess_summary(context, answer)

  generated = q_model.generate(
      input_ids=ids,
      attention_mask=mask,
      num_beams=5,               # 5 gave good results
      no_repeat_ngram_size=2,    # do not repeat any 2-gram
      max_length=72,             # arbitrary cap, enough for a single question
      early_stopping=True,
  )

  # decode the first generated sequence and tidy it up before returning
  decoded = q_tokenizer.decode(generated[0],
                               skip_special_tokens=True,
                               clean_up_tokenization_spaces=True)
  return decoded.replace("question: ", "").strip()


In [12]:
full_text = """
Manufacturing processes are the steps through which raw materials are transformed into a final product. The manufacturing process begins with the use of the materials and then modified through manufacturing processes to become the required part. The process involves use of machinery, tools, power and labour. During the process, it adds greater valve to the final product. Therefore, manufacturing is a value added process.
Machinery Tools Power Labour
Raw materials Product
Raw materials: Raw materials are often natural resources such as crude oil, iron ore and wood. They are harvested from the earth. Processed materials are materials refined by humans.eg steel, petrol, paper, glass Iron ore: Iron ore is available on earth in the form of rocks rich in iron oxide. It also contains other impurities such as Sulphur. Iron is extracted from iron ore by heating them (above 1250 deg C) with coke (Coke is a refined form of coal) in a furnace called blast furnace. Oxides and other impurities are removed from the iron ore to leave the iron behind which is called ‘Pig iron’.
Pig iron ingots Pig iron contains higher percentage of carbon (3.5 to 4.5%). Therefore, it is very hard and brittle and it cannot be used as an engineering material. To become steel, it must be melted again and reprocessed to reduce the carbon to the correct amount. Steel is iron and up to 1.5% carbon. Elements such as chromium, nickel, tungsten, vanadium can also be added to steel, to get different properties. When added, they are called alloy steels. After preparation of the correct material, the liquid steel is allowed to solidify in the form of a billet, slab or an ingot. These are again processed to make final products.eg. Sheets, tubes, rods This includes production of iron, copper, aluminum and other metals from their ores.
The basic properties of materials
1. Physical properties:
These can be considered to include density, specific gravity and melting point.
2. Electrical properties:
Electrical properties are resistivity and conductivity.
3. Thermal properties:
These are displayed when there is heat input to a material and include expansion, thermal conductivity and specific heat.
4. Chemical properties:
These are, for example, corrosion
5. Mechanical properties:
The mechanical properties of materials defined the behavior of materials under the action of external forces called loads or stresses. There is a measure of strength and durability of a material in service. These properties are great importance in the design of machines and structures.
The most important mechanical properties
1. Strength
The strength of material is its capacity to withstand failure under the action of external
loads. The stronger the material the greater the load it can withstand before failure.
Types of Stresses: -
(i). Tensile Stress: the force acts to pull materials apart
(ii). Compressive Stress: the force squeezes material
(iii). Shear Stress: the force causes one part to slide on another part
2. Elasticity
The elasticity of a metal is its power of coming back to its original shape after deformation when the loads are removed.
3. Plasticity
The plasticity of a metal is the ability to change its shape without destruction under the application of loads, and to retain its shape, when the loads are withdrawn.
4. Toughness
This ability of a material to resist hammering or impact loads without fracturing. Toughness is a high desirable quality for structural and machine parts which have to withstand shocks and vibrations.
5. Brittleness
The brittleness of material is the property of braking without much permanent distortion. These materials break into pieces due to impact.
6. Malleability
The malleability of metal is its ability to change shape by external force (hammering) without breaking.
7. Ductility
The ductility of a metal is the property which enables a metal to be drawn into wires without breaking. Copper is a ductile material. The ductility of a material increases with the temperature.
8. Hardness
This is ability of a material to resist wear. (eg. knife, file). Hard materials can be used to cut soft materials.
Classification of engineering materials
Following chart gives a classification of engineering materials that are commonly used in a
Workshop
Plain carbon steel: Carbon steel is the most widely used kind of steel. The properties of carbon steel depend primarily on the amount of carbon it contains. There are three types of plain carbon steels. (a) Mild steel (up to 0.3% C) (b) Medium carbon steel (0.3 to 0.8%) (c) High carbon steel (0.8 to 1.5%) The following graph shows the variation of (i) strength (toughness), (ii) ductility and malleability, (iii) hardness & brittleness with increase in carbon percentage.
Cast iron
Cast iron is made by melting pig iron with cast iron scrap and steel scrap in cupola furnaces and poured into molds to make castings. Cast Iron is generally defined as an alloy of iron with 2.5 % to 3 5 % Carbon, and usually with small amount of Silicon and Manganese. Due to its high carbon percentage that exists in the form of graphite, it has self lubricating property.
Cast iron is comparatively weak and brittle in tension and has a high hardness. It can take high compression loads. It has lubricating properties and easily cast. (casting is a manufacturing process)
Cast iron is used to make automotive parts such as engine blocks, cylinder liners, cylinder heads, machine parts, industrial components, pump housings and motor housings. Applications of mild steel, medium carbon steel, high carbon steel and cast iron are given below.
Alloy steels: Steel has Iron and Carbon. Alloy steel has one or more alloying elements other than iron and carbon such as chromium, vanadium, nickel, tungsten etc. The basic properties of steel given above in this chapter can be changed by adding various alloying elements. Applications of alloy steels:- (a) Stainless steel: - Good quality stainless steel has about 18 to 20% Chromium as the alloying element. Stainless steel does not readily corrode, rust or stain with water as ordinary steel does. It has the ability to resist oxidation.
Stainless steel sheet Cutlery setWrist watch strap
Medical equipment Kitchen items Tubes and sections
(b) High speed steel (HSS):-
High Speed Steel contains about 18% Tungsten as the alloying element. It performs a high hardness and a high wear resistance. Therefore, this material is used to make cutting tools (tools to cut other materials of less hardness).
(c) Vanadium steel:-
Vanadium has the ability to improve the strength and toughness of steel. Therefore, it is used to make tools which need high strength.
Chromium – Vanadium steel spanner
Non ferrous metals:-
Non-ferrous metals do not contain iron (Fe) in appreciable amounts. They are more expensive than ferrous metals. They have exceptional engineering properties.
Example of some non ferrous metals used in the industry: • Aluminum has low weight and resistance to corrosion. • Copper has high electrical and thermal conductivity and resistance to corrosion. Electrical wires (conductors) are made of copper • Lead is a good conductor of electricity. Lead and Tin are mixed 1 : 1 ratio to make solder. • Tin is applied on ferrous metals (inside cans) to prevent corrosion • Zinc has high ability to resist corrosion. Melted zinc is applied on ferrous metals to prevent corrosion. The process is called galvanizing.
Non ferrous alloys:-
(a) Aluminum that is used in the industry is not pure aluminum, but they are alloys. Alloying elements are copper, silicon, tin, zinc etc. Their applications are automobile bodies, aircrafts, cans, alloy wheels, tubes and sections and sheets.
(b) Applications of Copper alloys:
Brass and bronze are copper alloys. Brass (copper and zinc) and bronze (copper and tin).
Brass and bronze are used to make water and gas taps, pipes, bushes, door hinges and
door locks, household items, and statues.

"""

In [None]:
## MAIN FUNCTIONS
# Pipeline: split the text, summarize every chunk, keep the keywords shared
# by a chunk and its summary, then generate one question per keyword.
splitted_text = split_text(full_text)

summary = []
filtered_kw = []
questions_w_ans = []
all_answers = []

# summarize each chunk and collect its validated keywords
for chunk in splitted_text:
  chunk_summary = summarize(chunk)
  summary.append(chunk_summary)
  filtered_kw.append(filter_keywords(chunk, chunk_summary))

# one [question, answer] pair per keyword, asked against the chunk's summary
for chunk_summary, keywords in zip(summary, filtered_kw):
  for keyword in keywords:
    questions_w_ans.append([gen_question(chunk_summary, keyword), keyword])

In [21]:
questions_w_ans[0]

['Manufacturing is the step through which raw materials are transformed into a final product?',
 'processes']