In [13]:
%matplotlib inline
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from joblib import dump, load
import pickle

# loading machine learning modules
import sklearn
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

from dataset_collector_saver_class import LoadDataset
#directory,*,tar_names=None,Type=None,rmChar=False,_dict=False,folder_name=False


In [14]:
# physics
physics_path = '/home/ngoni97/Documents/PHYSICS'

physics = LoadDataset(physics_path, 
                      tar_names=['ADVANCED'], 
                      Type='documents', 
                      rmChar=True, _dict=True, 
                     folder_name=True)
physics_dataset = physics.returnDataset()
print('Physics:\n\n', np.array(physics_dataset[0]))

Physics:

 ['the quantum theory of fields volume 1  pdfdrive '
 'solid state physics  lecturenotes2012'
 'quantum field theory and the standard model  pdfdrive '
 'pisello gravitationelectromagnetismquantisedcharge'
 'i v savelyev fundametals of theoretical physics vol 1'
 'hydraulic machines textbook'
 'compendium of theoretical physics,springer2005,529p,0387257993'
 'introduction to theoretical particle physics'
 'statistical physics  pdfdrive '
 '50 physics ideas you really need to know  joanne baker'
 'quantum mechanics the theoretical minimum leonard susskind and art friedman'
 '1 mod 1 atomic nuclear physics' 'han ohanian 2'
 'slaterfrank introductiontotheoreticalphysics'
 'thermodynamics  statistical mechanics by stowe' 'han ohanian 1'
 'statistical field theory  an introduction to exactly solved models in statistical physics   pdfdrive '
 'introduction to solid state physics  pdfdrive ' 'wilsonchap'
 'lecture notes on mathematical methods of theoretical physics '
 "3,000 solved

In [9]:
maths_path = '/home/ngoni97/Documents/MATHEMATICS'

maths = LoadDataset(maths_path, 
                      tar_names=['ADVANCED', 'ORDINARY AND PARTIAL DIFFERENTIAL EQUATIONS'], 
                      Type='documents', 
                      rmChar=True, _dict=True, 
                     folder_name=True)
maths_dataset = maths.returnDataset()
maths_data_dict = maths.returnDataDict()
print('Maths features:\n', np.array(maths_dataset))
print('\n\nMaths feature_labels dictionary:\n', maths_data_dict)

Maths features:
 ['jordansmith' "schaum's outline differential equations"
 'differential and difference equations with applications  contributions from the international conference on differential  difference equations and applications  pdfdrive '
 'differential equations  an introduction to modern methods and applications  pdfdrive '
 'textbook on ordinary differential equations a theoretical approach'
 'differential equations   zill'
 'introduction to partial differential equations  pdfdrive '
 'ordinary differential equations  mathematical tools for physicists  pdfdrive '
 'elementary differential equations and boundary value problems'
 'butcher   numerical methods for ordinary differential equations 2e wiley, 2008bn'
 'an introduction to ordinary differential equations' 'de complete 4'
 'handbook of ordinary differential equations  exact solutions, methods, and problems  pdfdrive '
 'ordinary differential equations'
 'butcher   numerical methods for ordinary differential equations 

In [15]:
features = maths_data_dict.keys()
labels = maths_data_dict.values()

print(features)
print(labels)

odict_keys(['jordansmith', "schaum's outline differential equations", 'differential and difference equations with applications  contributions from the international conference on differential  difference equations and applications  pdfdrive ', 'differential equations  an introduction to modern methods and applications  pdfdrive ', 'textbook on ordinary differential equations a theoretical approach', 'differential equations   zill', 'introduction to partial differential equations  pdfdrive ', 'ordinary differential equations  mathematical tools for physicists  pdfdrive ', 'elementary differential equations and boundary value problems', 'butcher   numerical methods for ordinary differential equations 2e wiley, 2008bn', 'an introduction to ordinary differential equations', 'de complete 4', 'handbook of ordinary differential equations  exact solutions, methods, and problems  pdfdrive ', 'ordinary differential equations', 'butcher   numerical methods for ordinary differential equations 2e w

# Tokenizing

In [16]:
# create a customized stop_words list on top of the standard 'english'
stop_words = nltk.corpus.stopwords.words('english')
stop_words.extend(
    ['1st','2nd','3rd','4th','5th','6th','7th','8th','9th','10th','ed','','pdfdrive']) # is a list, so I can append or expand with my own list
# e.g., ['pdfdrive', e.t.c ]

wpt = nltk.WordPunctTokenizer()

def Normalize(doc):
    # lower case and remove special characters/whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]','', doc, re.I)
    doc = doc.lower()
    doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalise_data = np.vectorize(Normalize)

Maths_Norm = normalise_data(maths_dataset)
Physics_Norm = normalise_data(physics_dataset[0])
print('maths_norm:\n')
print(Maths_Norm)
print('\n\nphysics_norm:\n')
print(Physics_Norm)

maths_norm:

['jordansmith' 'schaums outline differential equations'
 'differential difference equations applications contributions international conference differential difference equations applications'
 'differential equations introduction modern methods applications'
 'textbook ordinary differential equations theoretical approach'
 'differential equations zill'
 'introduction partial differential equations'
 'ordinary differential equations mathematical tools physicists'
 'elementary differential equations boundary value problems'
 'butcher numerical methods ordinary differential equations 2e wiley 2008bn'
 'introduction ordinary differential equations' 'de complete 4'
 'handbook ordinary differential equations exact solutions methods problems'
 'ordinary differential equations'
 'butcher numerical methods ordinary differential equations 2e wiley 2008'
 'coddington e' 'mathematics elementary differential equations'
 'ordinary partial differential equations agarwal regan'
 'invitati

In [17]:
# vectorisation
maths_vectoriser = TfidfVectorizer(min_df=0., max_df=3, use_idf=True)

# maths
maths_vec_matrix = maths_vectoriser.fit_transform(Maths_Norm)
maths_vec_matrix_array = maths_vec_matrix.toarray()

# displaying
# maths
maths_feature_names = maths_vectoriser.get_feature_names_out()
maths_df = pd.DataFrame(maths_vec_matrix_array, columns=maths_feature_names)
display(maths_df)

# saving to a pickle file
with open('maths_vectoriser.pkl', 'wb') as file:
    pickle.dump(maths_vectoriser, file)

physics_vectoriser = TfidfVectorizer(min_df=0., max_df=3, use_idf=True)
# physics
physics_vec_matrix = physics_vectoriser.fit_transform(Physics_Norm)
physics_vec_matrix_array = physics_vec_matrix.toarray()

# displaying
# physics
physics_feature_names = physics_vectoriser.get_feature_names_out()
physics_df = pd.DataFrame(physics_vec_matrix_array, columns=physics_feature_names)
display(physics_df)

# saving to a pickle file
with open('physics_vectoriser.pkl', 'wb') as file:
    pickle.dump(physics_vectoriser, file)

Unnamed: 0,0000unse,0071383417,100,102,11,1138063471,1138063479,11th,1300,15,...,yu,zakon,zeitz,zeros,zill,zimmerman,zorich1,zuckerman,zuming,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
464,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,0387257993,05,1000,12,13th,1612,1962t656s,1978,1989,200,...,wolfgang,wordpress,worked,world,yan,young,zemanskyrichard,zemanskys,zhang,zheng
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Word Embeddings

In [None]:
from gensim.models import word2vec

wpt = nltk.WordPunctTokenizer()


In [None]:
tokenized_maths_data = [wpt.tokenize(sent) for sent in Maths_Norm]
tokenized_physics_data = [wpt.tokenize(sent) for sent in Physics_Norm]

In [None]:
# set values for various parameters
feature_size = 10 # word vector dimensionality
window_context = 10 # context window size
min_word_count = 1 # minimum word count
sample = 1e-3 # downsample setting for frequent words

# maths
w2v_maths_model = word2vec.Word2Vec(tokenized_maths_data, vector_size=feature_size,
                              window=window_context, min_count=min_word_count,
                              sample=sample)
# physics
w2v_physics_model = word2vec.Word2Vec(tokenized_physics_data, vector_size=feature_size,
                              window=window_context, min_count=min_word_count,
                              sample=sample)

In [None]:
w2v_maths_model.wv['mathematical']

In [None]:
w2v_physics_model.wv['theoretical']

# Multinomial Naive Bayes Model