# Advanced Certification in AIML
## A Program by IIIT-H and TalentSprint

In [None]:
#@title Experiment Walkthrough
from IPython.display import HTML

# Embed the experiment walkthrough video as an HTML5 <video> player in the cell output
HTML("""<video width="320" height="240" controls>
  <source src="https://cdn.iiith.talentsprint.com/aiml/Experiment_related_data/Walkthrough/Aptitude_classification.mp4">
</video>
""")

### Learning Objectives:

At the end of the experiment, you will be able to:

*  generate vectors using Word2Vec model

## Dataset
Classifying questions automatically is a challenging task in natural language processing. The dataset is taken from the TalentSprint aptitude questions, which contain more than 20K questions.

## Description
This dataset has the following columns:
1. **Category:** Gives the high-level categorization of the question
2. **Sub-Category:** Determines the type of questions
3. **Article:** Gives the article name of the question
4. **Questions:** Questions are listed
5. **Answers:** Contains answers



The dataset used in this experiment has been partially pre-processed using BeautifulSoup, with punctuation and HTML tags removed.


### Setup Steps

In [None]:
#@title Please enter your registration id to start: (e.g. P181900101) { run: "auto", display-mode: "form" }
Id = "" #@param {type:"string"}


In [None]:
#@title Please enter your password (normally your phone number) to continue: { run: "auto", display-mode: "form" }
password = "" #@param {type:"string"}


In [None]:
#@title Run this cell to complete the setup for this Notebook
from IPython import get_ipython

# Handle to the running IPython shell; the functions below use it to invoke line magics
ipython = get_ipython()
  
# Base name of this notebook; ".ipynb" is appended when the notebook is exported and uploaded
notebook= "U2W7_02_Aptitude_Classification_B" #name of the notebook

def setup():
    """Download the experiment's data files and report completion.

    Fetches the cleaned aptitude-classification CSV and the archived
    pre-trained word-vector file with ``wget``, extracts the archive with
    ``unrar``, emits a ``<script>`` tag built from the trainee id and the
    current ``submission_id``, and prints a confirmation message.

    Relies on the module-level ``ipython`` shell, ``getId()`` and
    ``submission_id``. Returns None.
    """
    from IPython.display import HTML, display

    # InteractiveShell.magic() is deprecated; run_line_magic() is the
    # supported way to invoke a line magic from code.
    shell_commands = (
        "wget https://cdn.iiith.talentsprint.com/aiml/Experiment_related_data/Cleaned_Aptitude_Classification.csv",
        "wget https://cdn.talentsprint.com/talentsprint1/archives/sc/aiml/experiment_related_data/AIML_DS_GOOGLENEWS-VECTORS-NEGATIVE-300_STD.rar",
        "unrar e /content/AIML_DS_GOOGLENEWS-VECTORS-NEGATIVE-300_STD.rar",
    )
    for command in shell_commands:
        ipython.run_line_magic("sx", command)

    display(HTML('<script src="https://dashboard.talentsprint.com/aiml/record_ip.html?traineeId={0}&recordId={1}"></script>'.format(getId(),submission_id)))
    print("Setup completed successfully")

def submit_notebook():
    """Register a notebook attempt, or submit the notebook for grading.

    The notebook is first exported to ``<notebook>.ipynb`` via the
    ``%notebook -e`` line magic. Then, depending on the module-level
    ``submission_id``:

    * falsy  -> the attempt is registered by posting the id, notebook name
      and password; the server's ``record_id`` is returned on success.
    * truthy and every answer/feedback field is filled in -> the exported
      file is base64-encoded and posted together with the answers.
    * truthy but some field is missing -> nothing is posted (the getters
      have already printed which answers are missing).

    Returns:
        The record id returned by the server when registering, the existing
        ``submission_id`` after a successful submission or when answers are
        still missing, or None when the server reports an error.
    """
    # InteractiveShell.magic() is deprecated; run_line_magic() is the
    # supported way to invoke a line magic from code.
    ipython.run_line_magic("notebook", "-e " + notebook + ".ipynb")

    import requests, json, base64

    url = "https://dashboard.talentsprint.com/xp/app/save_notebook_attempts"

    if not submission_id:
        # First call (from the setup cell): register the attempt.
        data = {"id" : getId(), "notebook" : notebook, "mobile" : getPassword()}
        r = requests.post(url, data = data)
        r = json.loads(r.text)

        # .get() so an error-only response reaches the "err" branch instead
        # of raising KeyError on a missing "status" key.
        if r.get("status") == "Success":
            return r["record_id"]
        elif "err" in r:
            print(r["err"])
            return None
        else:
            print ("Something is wrong, the notebook will not be submitted for grading")
            return None

    elif getAnswer() and getComplexity() and getAdditional() and getConcepts() and getWalkthrough() and getComments() and getMentorSupport():
        # Read the exported notebook and close the handle deterministically.
        with open(notebook + ".ipynb", "rb") as f:
            file_hash = base64.b64encode(f.read())

        data = {"complexity" : Complexity, "additional" :Additional,
                "concepts" : Concepts, "record_id" : submission_id,
                "answer" : Answer, "id" : Id, "file_hash" : file_hash,
                "notebook" : notebook, "feedback_walkthrough":Walkthrough ,
                "feedback_experiments_input" : Comments,
                "feedback_mentor_support": Mentor_support}

        r = requests.post(url, data = data)
        r = json.loads(r.text)
        if "err" in r:
            print(r["err"])
            return None
        else:
            print("Your submission is successful.")
            print("Ref Id:", submission_id)
            print("Date of submission: ", r["date"])
            print("Time of submission: ", r["time"])
            print("View your submissions: https://aiml.iiith.talentsprint.com/notebook_submissions")
            return submission_id
    else:
        # Some answers are still missing; keep the existing record id.
        return submission_id
    

def getAdditional():
  """Return the `Additional` feedback answer.

  Prints a reminder and returns None when `Additional` is undefined or empty.
  """
  try:
    answer = Additional
  except NameError:
    answer = None
  if answer:
    return answer
  print ("Please answer Additional Question")
  return None

def getComplexity():
  """Return the `Complexity` feedback answer.

  Prints a reminder and returns None when `Complexity` is undefined or empty.
  """
  try:
    answer = Complexity
  except NameError:
    answer = None
  if answer:
    return answer
  print ("Please answer Complexity Question")
  return None
  
def getConcepts():
  """Return the `Concepts` feedback answer.

  Prints a reminder and returns None when `Concepts` is undefined or empty.
  """
  try:
    answer = Concepts
  except NameError:
    answer = None
  if answer:
    return answer
  print ("Please answer Concepts Question")
  return None
  
  
def getWalkthrough():
  """Return the `Walkthrough` feedback answer.

  Prints a reminder and returns None when `Walkthrough` is undefined or empty.
  """
  try:
    answer = Walkthrough
  except NameError:
    answer = None
  if answer:
    return answer
  print ("Please answer Walkthrough Question")
  return None
  
def getComments():
  """Return the `Comments` feedback answer.

  Prints a reminder and returns None when `Comments` is undefined or empty.
  """
  try:
    answer = Comments
  except NameError:
    answer = None
  if answer:
    return answer
  print ("Please answer Comments Question")
  return None
  

def getMentorSupport():
  """Return the `Mentor_support` feedback answer.

  Prints a reminder and returns None when `Mentor_support` is undefined or empty.
  """
  try:
    answer = Mentor_support
  except NameError:
    answer = None
  if answer:
    return answer
  print ("Please answer Mentor support Question")
  return None

def getAnswer():
  """Return the `Answer` given to the experiment question.

  Prints a reminder and returns None when `Answer` is undefined or empty.
  """
  try:
    answer = Answer
  except NameError:
    answer = None
  if answer:
    return answer
  print ("Please answer Question")
  return None
  

def getId():
  """Return the registration `Id`, or None when it is undefined or empty."""
  try:
    current = Id
  except NameError:
    return None
  if current:
    return current
  return None

def getPassword():
  """Return the entered `password`, or None when it is undefined or empty."""
  try:
    current = password
  except NameError:
    return None
  if current:
    return current
  return None

# No record id exists until the attempt has been registered
submission_id = None

### Setup
# Register the attempt first; download the data only once a record id was obtained
if not (getPassword() and getId()):
  print ("Please complete Id and Password cells before running setup")
else:
  submission_id = submit_notebook()
  if submission_id:
    setup()



#### Importing required packages

In [None]:
# Importing and downloading required packages
import nltk
import gensim
import pandas as pd
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK resources used below: tokenizer models, stopword lists and WordNet data.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' in word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download("stopwords")
nltk.download('wordnet')

### Data loading and preparation

Loading the aptitude classification dataset containing all the aptitude questions of various sub-categories

Selecting the two sub-categories (Misspell words, Finding Errors) from the loaded data

In [None]:
# Read the CSV downloaded by the setup cell (assumes the working directory is /content)
# and show its (rows, columns) shape
data = pd.read_csv("/content/Cleaned_Aptitude_Classification.csv")
data.shape

In [None]:
# Preview the first few rows to inspect the available columns
data.head()

Out of the 15 sub-categories in the data, two are chosen for this experiment

In [None]:
# Pull the question text of the two chosen sub-categories as arrays
category1_Que = data.loc[data['Sub-Category'] == 'Misspell words', 'Questions'].values
category2_Que = data.loc[data['Sub-Category'] == 'Finding Errors', 'Questions'].values

In [None]:
# Printing the sample question from first chosen Sub-Category
category1_Que[0]

#### Pre-processing and tokenization

Pre-processing the text and applying tokenization to get vocabulary words of both chosen sub-categories

In [None]:
# Initializing the NLTK helpers used for pre-processing:
# a WordNet lemmatizer and the set of English stopwords
lemmatizer = WordNetLemmatizer()
stoplist = set(stopwords.words('english')) 

In [None]:
# Tokenize the sentence and get vocab words
def Tokenize(AllQuestions):
  """Build the vocabulary of a collection of questions.

  Exercise stub: the loop body must fill ``pre_processed_words`` with the
  tokenized, pre-processed words of every question.

  Args:
    AllQuestions: iterable of questions (the callers pass the ``Questions``
      values of one sub-category).

  Returns:
    A list of the distinct collected words that are not in ``stoplist``.
    The order is not guaranteed because the words pass through a ``set``.
  """
  pre_processed_words = []
  for each in AllQuestions:
    # YOUR CODE HERE to tokenize and pre-process the words
    
  # Drop duplicate words
  pre_processed_words = set(pre_processed_words)

  # Remove stopwords
  pre_processed_words = [word for word in pre_processed_words if word not in stoplist]
  return pre_processed_words

In [None]:
# Calling the above Tokenize function to get vocab words of both sub-categories
category1_words = Tokenize(category1_Que)
category2_words = Tokenize(category2_Que)

# Combining the words of two sub-categories.
# A word present in both vocabularies appears twice in the combined list.
all_words = category1_words + category2_words
print("Number of valid words after pre-processing:", len(all_words))


### Loading the word2vec model

Load Gensim pretrained model

  * Gensim is an open source Python library for natural language processing. It is developed and is maintained by the Czech natural language processing researcher Radim Řehůřek and his company RaRe Technologies. 

  * Use gensim to load a word2vec model, pretrained on google news, covering approximately 3 million words and phrases. The vector length is 300 features.

  * Load the Google News bin file, reading at most 500000 word vectors (`limit=500000`). If **binary = True**, the file is read as binary word2vec format; otherwise it is read as plain text.


In [None]:
# Load the pre-trained vectors directly from the file. As the model is in .bin extension, we need binary=True;
# limit=500000 caps how many word vectors are read from the file.
model = gensim.models.KeyedVectors.load_word2vec_format('AIML_DS_GOOGLENEWS-VECTORS-NEGATIVE-300_STD.bin', binary=True, limit=500000)

In [None]:
# Pre-trained model gives representation of 300 size vector
print("Dimension of the word 'tree': ", len(model['tree']))

### Generate vectors for each word

Words that appear in both sub-categories would have the same vector representation but different labels, which may reduce classification accuracy. Therefore, words common to both chosen sub-categories are ignored.

In [None]:
# Get vector representation using model for the all the extraced words of two sub-categories
vectors, labels = [], []
for word in all_words:
  try:
    # Ignoring the words that appear in both sub-categories
    if ~(word in category1_words and word in category2_words):
      # YOUR CODE HERE to generate vectors and append label
  except:
    pass
print("Number of words:", len(labels))
print("Number of dimensions in each vector:", len(vectors[0]))

### Split the Data into train and test

In [None]:
# YOUR CODE HERE to split the data

### Fit the model and calculate the accuracy

In [None]:
# YOUR CODE HERE to classify

### Ungraded Exercise: 

Take any other two sub-categories and get vector representation using word2vec

In [None]:
# YOUR CODE HERE

## Please answer the questions below to complete the experiment:

In [None]:
#@title Word embeddings capture multiple dimensions of data and are represented as vectors { run: "auto", form-width: "500px", display-mode: "form" }
Answer = "" #@param ["","True","False"]


In [None]:
#@title How was the experiment? { run: "auto", form-width: "500px", display-mode: "form" }
Complexity = "" #@param ["","Too Simple, I am wasting time", "Good, But Not Challenging for me", "Good and Challenging for me", "Was Tough, but I did it", "Too Difficult for me"]


In [None]:
#@title If it was too easy, what more would you have liked to be added? If it was very difficult, what would you have liked to have been removed? { run: "auto", display-mode: "form" }
Additional = "" #@param {type:"string"}


In [None]:
#@title Can you identify the concepts from the lecture which this experiment covered? { run: "auto", vertical-output: true, display-mode: "form" }
Concepts = "" #@param ["","Yes", "No"]


In [None]:
#@title  Experiment walkthrough video? { run: "auto", vertical-output: true, display-mode: "form" }
Walkthrough = "" #@param ["","Very Useful", "Somewhat Useful", "Not Useful", "Didn't use"]


In [None]:
#@title  Text and image description/explanation and code comments within the experiment: { run: "auto", vertical-output: true, display-mode: "form" }
Comments = "" #@param ["","Very Useful", "Somewhat Useful", "Not Useful", "Didn't use"]


In [None]:
#@title Mentor Support: { run: "auto", vertical-output: true, display-mode: "form" }
Mentor_support = "" #@param ["","Very Useful", "Somewhat Useful", "Not Useful", "Didn't use"]


In [None]:
#@title Run this cell to submit your notebook for grading { vertical-output: true }
# Submit for grading only when the setup cell has already produced a submission id
try:
  if not submission_id:
    print("Please complete the setup first.")
  else:
    return_id = submit_notebook()
    # Keep the existing id unless the call returned a new truthy one
    if return_id:
      submission_id = return_id
except NameError:
  # submission_id is undefined when the setup cell was never run
  print ("Please complete the setup first.")