In [1]:
# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#configure

#import nltk
import nltk
from nltk.corpus import stopwords

#preprocessing
from nltk.corpus import stopwords  #stopwords
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet
from nltk.tokenize import word_tokenize  # tokenizer used by clean_text

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

# English stop-word list from the NLTK corpus, kept as a set for fast membership tests
stop_words = set(stopwords.words('english'))

In [2]:
# Load the dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs where this file exists.
df = pd.read_csv(filepath_or_buffer='G:/CS Year 4/1st Semester/Graduation/Test3/Dataset.csv')

In [3]:
# preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,studentAnswer,referenceAnswer,ref_1,ref_2,accuracy
0,0,By letting it sit in a dish for a day.,"The water was evaporated, leaving the salt.",We evaporated the salt from the water because ...,I put some water in a dish and let the water e...,incorrect
1,1,Let the water evaporate and the salt is left b...,"The water was evaporated, leaving the salt.",We evaporated the salt from the water because ...,I put some water in a dish and let the water e...,correct
2,2,The water evaporated and left salt crystals.,"The water was evaporated, leaving the salt.",We evaporated the salt from the water because ...,I put some water in a dish and let the water e...,correct
3,3,I saw a pinkish grayish color that was blockin...,"The water was evaporated, leaving the salt.",We evaporated the salt from the water because ...,I put some water in a dish and let the water e...,incorrect
4,4,You have to slowly tip the vial for only the w...,"The water was evaporated, leaving the salt.",We evaporated the salt from the water because ...,I put some water in a dish and let the water e...,incorrect


# DATA CLEANING & PRE-PROCESSING using Lemmatizer

In [4]:
def clean_text(headline):
  """Tokenize `headline`, drop stop words and short tokens, and lemmatize the rest.

  Parameters
  ----------
  headline : str
      Raw text to clean.

  Returns
  -------
  str
      The surviving tokens, lemmatized with WordNetLemmatizer and joined by
      single spaces. Empty string when no token survives the filters.

  Notes
  -----
  Reads the module-level `stop_words` set and calls `word_tokenize`
  (nltk.tokenize), which is imported in the notebook's import cell.
  """
  le=WordNetLemmatizer()
  word_tokens=word_tokenize(headline)
  # Compare case-insensitively: NLTK's English stop words are lower-case, so a
  # capitalised stop word such as "This" or "There" would otherwise be kept.
  # Tokens of three characters or fewer are discarded as well.
  tokens=[le.lemmatize(w) for w in word_tokens if w.lower() not in stop_words and len(w)>3]
  return " ".join(tokens)

In [5]:
# TF-IDF document-term matrix over the student answers, capped at the 1000 most frequent terms.
# scikit-learn (>= 1.2) validates `stop_words` and accepts only 'english', a list or None,
# so the stop-word set is converted to a (sorted, hence reproducible) list before being passed in.
vect =TfidfVectorizer(stop_words=sorted(stop_words),max_features=1000) # to play with. min_df,max_df,max_features etc...
vect_text=vect.fit_transform(df['studentAnswer'])

In [6]:
# matrix dimensions first, then the sparse (row, column) -> weight entries
print(vect_text.shape, vect_text, sep="\n")

(4925, 1000)
  (0, 198)	0.4688001590192522
  (0, 224)	0.46496666479704374
  (0, 765)	0.526297034675938
  (0, 459)	0.5357647271166253
  (1, 65)	0.5135175067271285
  (1, 452)	0.412879040429809
  (1, 708)	0.33628185623130286
  (1, 265)	0.44536577453626647
  (1, 963)	0.25501185101989426
  (1, 456)	0.43515912507870236
  (2, 189)	0.42097114600507246
  (2, 266)	0.5865554299389977
  (2, 452)	0.4838560193369118
  (2, 708)	0.3940912092847385
  (2, 963)	0.29885028552133597
  (3, 75)	0.6075247506354002
  (3, 148)	0.45221552281229876
  (3, 716)	0.579132556094528
  (3, 963)	0.3016956757469561
  (4, 347)	0.40783233763618426
  (4, 944)	0.6156543672477247
  (4, 773)	0.5931106185423215
  (4, 963)	0.3207214970022335
  (5, 881)	0.39459954700616756
  (5, 708)	0.3328028517631009
  :	:
  (4917, 278)	0.5410981452801741
  (4917, 154)	0.5610504890380187
  (4917, 580)	0.3397920135260598
  (4918, 154)	0.5562283127941136
  (4918, 233)	0.7071402099613585
  (4918, 177)	0.4365349785550061
  (4919, 258)	0.605405579004

In [7]:
from sklearn.decomposition import TruncatedSVD

# Latent Semantic Analysis: truncated SVD of the TF-IDF matrix down to 10 components.
# random_state is fixed so the randomized solver gives the same result on every run.
lsa_model = TruncatedSVD(
    n_components=10,
    algorithm='randomized',
    n_iter=10,
    random_state=42,
)

# one row per document, one column per component
lsa_top = lsa_model.fit_transform(vect_text)

# We can now see the most frequent and the rarest words in the students' answers, based on their IDF score. The lower the value, the more common the word is in the students' answers.

In [8]:
# the reduced matrix, followed by its shape
print(lsa_top, lsa_top.shape, sep="\n")  # (no_of_doc*no_of_topics)

[[ 0.01409174 -0.01137884 -0.01540341 ... -0.00039546  0.01060031
   0.00041955]
 [ 0.15231533 -0.15867724 -0.22247695 ... -0.09520784  0.09716631
   0.00505616]
 [ 0.18065488 -0.19972986 -0.26795988 ... -0.16045746  0.28025506
   0.15343015]
 ...
 [ 0.03129362 -0.02742252  0.03947328 ...  0.0131192   0.00598386
  -0.0010587 ]
 [ 0.02421383 -0.02486601  0.03506129 ...  0.01118825  0.01032264
  -0.00382459]
 [ 0.05429792 -0.02582328  0.03419618 ... -0.0307217  -0.02342217
   0.02496368]]
(4925, 10)


In [9]:
# Weights of the first document (row 0 of lsa_top) on each LSA component, scaled by 100.
l=lsa_top[0]
print("Document 0 :")
for i in range(len(l)):
  answer=l[i]
  print("Answer ",i," : ",answer*100)

Document 0 :
Answer  0  :  1.4091741424342528
Answer  1  :  -1.1378838636842505
Answer  2  :  -1.5403406119532357
Answer  3  :  -0.8190379919692595
Answer  4  :  -0.17717802469807512
Answer  5  :  0.015038967584978035
Answer  6  :  -0.27221548599013196
Answer  7  :  -0.039546056740992507
Answer  8  :  1.060031283023906
Answer  9  :  0.041954622736524665


# Now we can list the most important words for each of the 10 LSA components (labelled "Answer 0" to "Answer 9" in the output below), computed from the "Student Answer" column. For simplicity, only the top 10 words are shown for each one.

In [10]:
# most important words for each topic
# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out()` and fall back to the old name on older releases.
if hasattr(vect, "get_feature_names_out"):
    vocab = vect.get_feature_names_out()
else:
    vocab = vect.get_feature_names()

for i, comp in enumerate(lsa_model.components_):
    # pair every vocabulary term with its weight in this component
    vocab_comp = zip(vocab, comp)
    # keep the 10 terms with the largest weights
    sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:10]
    print("Answer "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

Answer 0: 
would one water pitch different sound make see string higher 

Answer 1: 
pitch would string higher lower sound high low make short 

Answer 2: 
one light motor run bulb circuit time work goes switch 

Answer 3: 
one sugar scratch see variable experiment pitch pure harder fat 

Answer 4: 
different sammy sound wood make sizes metal yes marbles nathaniel 

Answer 5: 
would sugar see pure cookies food know bubbles cream fruity 

Answer 6: 
sugar food string pure cookies sound cream fruity indicates make 

Answer 7: 
heat light sun time black goes sugar faster speed dark 

Answer 8: 
crystals salt light like string look different pitch goes time 

Answer 9: 
heat like crystals salt black look yes circuit make white 

