In [1]:
import fitz
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from textblob import TextBlob

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def questions(question):
    clean_text_1 = word_tokenize(str.lower(question))
    
    clean_text_2 = []
    for words in clean_text_1:
        res1 = re.sub(r'[^\w\s]',"",words)
        res = re.sub(r'[^\D]',"",res1)
        if res != "":
            clean_text_2.append(res)
            
    clean_text_3 = {i for i in clean_text_2 if i not in stopwords.words('english') and len(i)>1 and len(i)!=53}
    
    redundant_words = ['mcqs','question','set','questions','mcq','__________','next','page']
    clean_text_4 = [i for i in clean_text_3 if i not in redundant_words]
    
    sbs = SnowballStemmer("english")
    stemmed_tokens = [sbs.stem(tokens) for tokens in clean_text_4]
    clean_text_5 = " ".join(stemmed_tokens)
    
    return clean_text_5

In [3]:
def extract_text_sci(n):
    path = r".\Questions\Science and Tech\qna-scte-"+ str(n) + ".pdf"
    with fitz.open(path) as doc:
        text = ""
        for page in doc:
            text += page.getText()
    return text

In [4]:
ques_sci = [[sentence for index, sentence in enumerate(extract_text_sci(i).split("Question ")) 
         if type(int(sentence[0]))== int and index!=0] for i in range(1,10)]
q_sci=[]
for set in ques_sci:
    for s in set:
        q_sci.append(questions(s))

In [5]:
def extract_text_hi(n):
    path = r".\Questions\History\anhi-dca-"+ str(n) + ".pdf"
    with fitz.open(path) as doc:
        text = ""
        for page in doc:
            text += page.getText()
    return text

In [6]:
ques_hi = [[sentence for index, sentence in enumerate(extract_text_hi(i).split("Download Now")) 
         if index==1][0].split("Show Answer")[:-1] for i in range(1,10)]

r = 'Ancient Indian History Quiz Multiple Choice Questions \
( - GKToday\nhttps://www.gktoday.in/quizbase/ancient-indian-history-multiple-choice-questions'

q_hi=[]
for set in ques_hi:
    for s in set:
        if r in s:
            s = s.replace(r,'')
        q_hi.append(questions(s))

q_hi = [i for i in q_hi if len(i)>1 ]

In [7]:
q = q_sci + q_hi

In [8]:
l = list(np.ones(len(q_sci)))+list(np.zeros(len(q_hi)))

In [9]:
df = pd.DataFrame(pd.Series(q),columns=['Question'])
df['subject'] = l
df.subject = df.subject.apply(lambda x: int(x))
df.head()

Unnamed: 0,Question,subject
0,found decad american electr institut aiee engin,1
1,databas file one inform part record report typ...,1
2,optic usual sowar mean order open sensor os ab...,1
3,radio decad type transatlant occur technolog b...,1
4,file document usual imag audio oic subtop refe...,1


## TF-IDR

In [10]:
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [12]:
X_train, X_val, y_train, y_val = train_test_split(df["Question"],df.subject,test_size=0.2,shuffle=True)

In [13]:
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_val_vectors_tfidf = tfidf_vectorizer.transform(X_val)

## ML

In [14]:
model_params = {
    'SVM': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'Random Forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'Logistic Regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    },
    'Naive Bayes': {
        'model': MultinomialNB(),
        'params':{}
    },
    'SGD':{
        'model': SGDClassifier(),
        'params':{}
    }
}


In [15]:
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train_vectors_tfidf, y_train)
    scores.append({
        'Model': model_name,
        'Best Score': clf.best_score_,
        'Best Params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['Model','Best Score','Best Params'])
df

Unnamed: 0,Model,Best Score,Best Params
0,SVM,0.879654,"{'C': 10, 'kernel': 'linear'}"
1,Random Forest,0.842857,{'n_estimators': 5}
2,Logistic Regression,0.814286,{'C': 5}
3,Naive Bayes,0.82381,{}
4,SGD,0.972294,{}


## Test

In [16]:
print(clf)

GridSearchCV(cv=5, estimator=SGDClassifier(), param_grid={})


In [17]:
clf.score(X_val_vectors_tfidf,y_val)

0.8888888888888888

In [18]:
t = ['''The term `kayotsarga’ is related to
a) A way of following rules in Buddhism
b) A yogic posture of Jain meditation
c) Ashoka’s principles to follow dhamma
d) Vedic literature influenced by Dasas''',
     
'''In the context of recent advances in human reproductive technology, ‘Pronuclear Transfer” is used for
(a) Fertilization of egg in vitro by the donor sperm
(b) Genetic modification of sperm producing cells
(c) Development of stem cells into functional embryos
(d) Prevention of mitochondrial diseases in offspring''']

k = [questions(i) for i in t]

In [19]:
clf.predict(tfidf_vectorizer.transform(k))

array([0, 1], dtype=int64)