In [1]:
%matplotlib inline
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from joblib import dump, load
import pickle

# loading machine learning modules
import sklearn
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

from dataset_collector_saver_class import LoadDataset
#directory,*,tar_names=None,Type=None,rmChar=False,_dict=False,folder_name=False

In [2]:
# Collecting the dataset
maths_path = '/home/ngoni97/Documents/MATHEMATICS'

maths = LoadDataset(maths_path, 
                      tar_names=['ADVANCED',
                                 'ORDINARY AND PARTIAL DIFFERENTIAL EQUATIONS'], 
                      Type='documents', 
                      rmChar=True, _dict=True, 
                     folder_name=True)

maths_dataset = maths.returnDataset()
maths_data_dict = maths.returnDataDict()

# Tokenizing

In [3]:
# create a customized stop_words list on top of the standard 'english'
stop_words = nltk.corpus.stopwords.words('english')
stop_words.extend(
    ['1st','2nd','3rd','4th','5th','6th','7th','8th','9th','10th','ed','','pdfdrive']) # is a list, so I can append or expand with my own list
# e.g., ['pdfdrive', e.t.c ]

wpt = nltk.WordPunctTokenizer()

def Normalize(doc):
    """ remove unwanted characters and stopwords """
    # lower case and remove special characters/whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]','', doc, re.I)
    doc = doc.lower()
    doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalise_data = np.vectorize(Normalize)

In [4]:
data = list(maths_data_dict.keys())
labels = list(maths_data_dict.values())

norm_data = normalise_data(data)
norm_labels = normalise_data(labels)

print('norm_data:\n', norm_data)
print('\n\nnorm_labels:', norm_labels)
#print(list(data))

norm_data:
 ['vasile pop ovidiu furdui auth square matrices order 2 theory'
 'essential mathematical methods physical sciences'
 'comprehensive introduction differential geometry vol 3'
 '536 puzzles curious problems' 'princeton companion mathematics'
 'introduction mathematical modelling'
 'mathematical modeling models analysis applications'
 'manifolds tensors forms' '1300 math formulas'
 'vector calculus linear algebra differential forms'
 'multivariable calculus'
 'robert sobot engineering mathematics example vol ii calculus'
 'mathematical problems puzzles polish mathematical olympiads straszewicz 1965'
 'handbook linear algebra'
 'modern geometry methods applications part ii geometry topology manifolds'
 'comprehensive introduction differential geometry vol 2'
 'moderne mathematische methoden der physik band 1'
 'mathematical proofs 2' 'mathematical techniques engineers scientists'
 'special functions mathematics engineers second edition'
 'advanced engineering mathematics k stro

In [5]:
df = pd.DataFrame({'data':norm_data, 'labels':norm_labels })
display(df)

Unnamed: 0,data,labels
0,vasile pop ovidiu furdui auth square matrices ...,advanced
1,essential mathematical methods physical sciences,advanced
2,comprehensive introduction differential geomet...,advanced
3,536 puzzles curious problems,advanced
4,princeton companion mathematics,advanced
...,...,...
458,schaums outline differential equations 2,ordinary partial differential equations
459,partial differential equations calculus variat...,ordinary partial differential equations
460,differential equations handbook,ordinary partial differential equations
461,differential equations linear nonlinear ordina...,ordinary partial differential equations


In [36]:
tfidf_vectoriser = TfidfVectorizer(min_df=0., max_df=7, max_features=200)

x = tfidf_vectoriser.fit_transform(norm_data)
X = x.toarray()
feature_names = tfidf_vectoriser.get_feature_names_out()
#encoder = OneHotEncoder()
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(norm_labels)

#print(X)
#print(y)
print(feature_names)
print(len(feature_names))

['1933' '1992' '29' '2e' '5e' 'advances' 'agarwal' 'algebras' 'algorithms'
 'alpha' 'analytic' 'analytical' 'andreescu' 'anthony' 'approach' 'auth'
 'band' 'basic' 'bernhard' 'boundary' 'butcher' 'classics' 'colorado'
 'combinatorial' 'combinatorics' 'companion' 'comprehensive' 'compress'
 'computation' 'computer' 'concepts' 'contemporary' 'continued'
 'continuous' 'continuum' 'curves' 'david' 'der' 'development'
 'difference' 'dr' 'dynamical' 'early' 'elements' 'essential' 'euclidean'
 'euler' 'exact' 'example' 'excursion' 'exercises' 'fast' 'first' 'forms'
 'formulas' 'fourier' 'fr' 'fractions' 'frontiers' 'function'
 'fundamentals' 'genius' 'geometric' 'gerrish' 'graduate' 'graph' 'greens'
 'group' 'groups' 'grundlagen' 'hardiman' 'hardy' 'higher' 'hilbert'
 'hilberts' 'hopf' 'iii' 'inference' 'integral' 'integrals' 'integration'
 'international' 'introductory' 'intuitive' 'invitation' 'john' 'knapp'
 'kreyszig' 'kuliah' 'lectures' 'leonhard' 'library' 'lie' 'limits'
 'logic' 'manif

In [37]:
Df = pd.DataFrame(X, columns=feature_names)
display(Df)

Unnamed: 0,1933,1992,29,2e,5e,advances,agarwal,algebras,algorithms,alpha,...,value,variable,variables,variational,variations,view,visual,wiley,yaglom,zeta
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0
460,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
461,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [38]:
# data coupled with the encoded labels
encoded_labels = pd.DataFrame(y, columns=['encoded labels'])
new_df = pd.concat([df, encoded_labels], axis=1)
display(new_df)

Unnamed: 0,data,labels,encoded labels
0,vasile pop ovidiu furdui auth square matrices ...,advanced,0
1,essential mathematical methods physical sciences,advanced,0
2,comprehensive introduction differential geomet...,advanced,0
3,536 puzzles curious problems,advanced,0
4,princeton companion mathematics,advanced,0
...,...,...,...
458,schaums outline differential equations 2,ordinary partial differential equations,1
459,partial differential equations calculus variat...,ordinary partial differential equations,1
460,differential equations handbook,ordinary partial differential equations,1
461,differential equations linear nonlinear ordina...,ordinary partial differential equations,1


# splitting the data into train set and test set

In [39]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.40, random_state=33)

# fitting the multinomialNB

In [40]:
model = MultinomialNB(alpha=.5)
model.fit(X_train, y_train)

# predicting

In [41]:
y_pred = model.predict(X_test)

# accuracy

In [42]:
print('accuracy score: {:.4f}'.format(accuracy_score(y_test, y_pred)))

accuracy score: 0.9409


# testing

In [43]:
#new_data = ['Natural Language Processing With spaCy in Python – Real Python']
new_data = ['How To Use Steghide And StegoSuite Steganography Tools In Kali Linux - GeeksforGeeks']
new = normalise_data(new_data)
print(new)

custom_word_vec = tfidf_vectoriser.transform(new)
prediction = model.predict(custom_word_vec)
print(prediction)
if prediction[0] == 0:
    print("prediction succcessful, it belongs to 'advanced'")
elif prediction[0] == 1:
    print("prediction succcessful, it belongs to 'ordinary and partial differential equations'")
else:
    print("doesn't belong to any of the two classes")

['use steghide stegosuite steganography tools kali linux geeksforgeeks']
[0]
prediction succcessful, it belongs to 'advanced'


# Error!!!!!
model not working effectively
it's not predicting a False for something it did not train or test on,
I need more data for training

which means I need to add an extra caveat for viewing pdf contents and training on that
as well