# Doc Vectorization

In [1]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [2]:
corpus_directory = '/home/meeka/Desktop/NU/453/assn2/philosophy/corpus'

In [3]:
from os import listdir
import re
import string
from nltk.corpus import stopwords
from collections import Counter
import numpy as np
import pandas as pd

# Create Classes to label corpus

In [4]:
#create vocabulary sets for class divisions

logic=['logic', 'logical', 'logics', 'syllogism', 'syllogisms', 'model']
mathematics=['mathematics','mathematical', 'number', 'set', 'sets', 'probability','probabilities', 'proof']
language=['language', 'linguistic', 'sentences', 'sentence', 'proposition', 'propositions', 'verb', 'verbs',
         'discourse', 'word', 'words']
mind=['cognition', 'cognitive', 'consciousness', 'thought', 'thoughts','knowledge', 'know', 'mental', 
        'perception', 'neural', 'brain', 'mind', 'selfknowledge']
ontology=['objects', 'object', 'truth', 'abstract', 'abstraction', 'phenomenal', 'phenomenology',
             'representation', 'representational', 'representations', 'experience', 'experiences']
ethics=['ethics', 'ethical', 'moral', 'morality', 'religion']

top_vocab=[]

def merge_list(group):
    for word in group:
        top_vocab.append(word)

merge_list(logic)
merge_list(mathematics)
merge_list(language)
merge_list(mind)
merge_list(ontology)
merge_list(ethics)

len(top_vocab)

55

In [5]:
#Create dict of entire corpus

def load_doc(filename):
    file=open(filename, 'r')
    text=file.read()
    file.close()
    return text    

def clean_doc(doc):
    tokens=doc.split()
    tokens=[word.lower() for word in tokens]
    re_punc=re.compile('[%s]'% re.escape(string.punctuation))
    tokens=[re_punc.sub('',w) for w in tokens]
    tokens=[word for word in tokens if word.isalpha()]
    stop_words=set(stopwords.words('english'))
    tokens=[word for word in tokens if not word in stop_words]
    tokens=[word for word in tokens if len(word)>2]
    return tokens

def process_docs(directory):
    for filename in listdir(directory):
        path=directory+'/'+filename
        doc=load_doc(path)
        tokens=clean_doc(doc)
        #process lists, counters, dicts:
        vocab.update(tokens)
        wordcount=Counter(tokens)
        corpus_dict_top5[filename]=wordcount.most_common(5)
        line= ' '.join(tokens)
        corpus_dict_sent[filename]=line
        vocab_tokens=[word for word in tokens if word in top_vocab]
        vocabcount=Counter(vocab_tokens)
        corpus_vdict[filename]=vocabcount

def save_list(lines, filename):
    data='\n'.join(lines)
    file=open(filename, 'w')
    file.write(data)
    file.close()

vocab=Counter()

#Top 5 words in each document for categorization
corpus_dict_top5={}
#Dict with top wordcounts for top vocab 6 group categorization list
corpus_vdict={}
#Dict with full length sentences
corpus_dict_sent={}

process_docs(corpus_directory)

min_occurrence=50
vocab=[k for k,c in vocab.items() if c >= min_occurrence]
save_list(vocab, 'vocab.txt')

In [6]:
#create subset for class labels, merge with main corpus to align docs with classes

def group_dfs(group):
    group_set={}
    for k,v in corpus_dict_top5.items():
        for name, count in v:
            if name in group:
                group_set[k]=v
    dkeys=[]
    dvals=[]
    for x,y in group_set.items():
        dkeys.append(x)
        dv=[]
        for item in y:
            dv.append(str(item[0]+'-'+str(item[1])))
        dvals.append(dv)
    headers=[]
    for x in range(1,6):
        label=str('word'+str(x))
        headers.append(label)
    newdf=pd.DataFrame(dvals, columns=headers, index=dkeys)
    return newdf

logic_df=group_dfs(logic)
mathematics_df=group_dfs(mathematics)
language_df=group_dfs(language)
mind_df=group_dfs(mind)
ontology_df=group_dfs(ontology)
ethics_df=group_dfs(ethics)

logic_df['class']='logic'
mathematics_df['class']='mathematics'
language_df['class']='language'
mind_df['class']='mind'
ontology_df['class']='ontology'
ethics_df['class']='ethics'

frames=[logic_df, mathematics_df, language_df, mind_df, ontology_df, ethics_df]
full_df=pd.concat(frames)
full_df=full_df.reset_index()
full_df=full_df.rename(columns={"index":"document"})
full_df.to_csv('full_concat.csv')
    
full_df['class'].value_counts()

mind           189
logic          139
ontology       135
language       128
mathematics     95
ethics          55
Name: class, dtype: int64

In [7]:
#Identify Dups, separate datasets:

dups=full_df.duplicated(subset=['document'])
df_dup=pd.concat([full_df['document'], dups],axis=1, join='inner')
df_dup=df_dup.rename(columns={0:'duplicate'})
df_dup=df_dup[df_dup.duplicate]

duplist=df_dup.document.to_list()
uniquedocs=full_df[~full_df.document.isin(duplist)]
dupdocs=full_df[full_df.document.isin(duplist)]

print('full df shape: ', full_df.shape)
print('unique docs shape: ', uniquedocs.shape)
print('dups docs shape: ', dupdocs.shape)
print('\nValue counts of unique docs:')
print(uniquedocs['class'].value_counts())

full df shape:  (741, 7)
unique docs shape:  (482, 7)
dups docs shape:  (259, 7)

Value counts of unique docs:
mind           141
logic           84
ontology        75
language        72
mathematics     66
ethics          44
Name: class, dtype: int64


In [10]:
#Clean duplicate docs based on first word assignment to category

docfilter=dupdocs.drop_duplicates(subset='document').copy(deep=True)
docfilter2=docfilter.copy(deep=True)
docfilter2[['word1w','word1c']]=docfilter2.word1.str.split("-",expand=True)

keepcols=['document','word1w']
primaryword=docfilter2.filter(items=keepcols, axis=1).copy(deep=True)

NaN=np.nan
primaryword['class']=NaN

def firstword(group, name):
    for i in range(len(primaryword)):
        for word in group:
            if primaryword.iloc[i,1]==word:
                primaryword.iloc[i,2]=name

firstword(logic, 'logic')
firstword(mathematics, 'math')
firstword(language, 'language')
firstword(mind, 'mental')
firstword(ontology, 'ontology')
firstword(ethics, 'ethics')

primaryword.dropna(subset=['class'], inplace=True)
docfilter=docfilter.drop(columns=['class'])
uniquefiltered=pd.concat([docfilter, primaryword['class']],axis=1, join='inner')
uniquefiltered.shape

frames=[uniquedocs, uniquefiltered]
final_corpus_df=pd.concat(frames)
print('final corpus shape: ', final_corpus_df.shape)
print('final corpus counts: \n')
print(uniquedocs['class'].value_counts())

final corpus shape:  (546, 7)
final corpus counts: 

mind           141
logic           84
ontology        75
language        72
mathematics     66
ethics          44
Name: class, dtype: int64


# Analysis Part 1:  Analyst Judgement vs TfIdf

## Approach 1: Analyst Judgment

In [11]:
final_texts=final_corpus_df['document']
final_texts=final_texts.to_list()

final_vcorpus={}

for k,v in corpus_vdict.items():
    for word in final_texts:
        if k == word:
            final_vcorpus[k]=v

vocab_matrix=pd.DataFrame.from_dict(final_vcorpus, orient='index')
vocab_matrix.to_csv('vocab_matrix_analyst.csv')
vocab_matrix.iloc[:10,:10]

Unnamed: 0,selfknowledge,knowledge,thought,cognitive,mental,know,thoughts,consciousness,phenomenal,experience
Self Knowledge_1 The Distinctiveness of SelfKnowledge.txt,18.0,5.0,2.0,1.0,19.0,3.0,6.0,1.0,2.0,2.0
Folk Psychology as Mental Simulation_7 Conclusion.txt,1.0,1.0,,3.0,2.0,,,,,
Gertrude Elizabeth Margaret Anscombe_3 Metaphysics.txt,1.0,,1.0,,,,,,,
Consciousness and Intentionality_8 Consciousness in Mind.txt,1.0,,10.0,5.0,6.0,1.0,1.0,27.0,5.0,2.0
Thomas Reid_6 Moral Philosophy.txt,1.0,4.0,2.0,,2.0,1.0,,,,1.0
Folk Psychology as Mental Simulation_6 Simulation Theory Pros and Cons.txt,2.0,6.0,,4.0,25.0,2.0,,,,1.0
Narrow Mental Content_3 Arguments for Narrow Content.txt,1.0,1.0,5.0,,10.0,,10.0,,23.0,5.0
Externalism About Mental Content_6 Externalism and Selfknowledge.txt,15.0,11.0,7.0,,1.0,9.0,14.0,,,
Self Knowledge_2 Doubts about the distinctiveness of selfknowledge.txt,10.0,5.0,4.0,1.0,8.0,7.0,,,,1.0
Self Knowledge_3 Accounts of SelfKnowledge.txt,73.0,31.0,4.0,6.0,49.0,15.0,8.0,4.0,12.0,11.0


## Approach 2: TfIdf

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

final_corpus=[]
final_labels=[]

for k,v in corpus_dict_sent.items():
    for word in final_texts:
        if k == word:
            final_corpus.append(v)
            final_labels.append(k)

vectorizer=TfidfVectorizer(vocabulary=vocab)
X=vectorizer.fit_transform(final_corpus)
print(X.shape)

feature_names=vectorizer.get_feature_names()
corpus_index=[n for n in final_corpus]
Tfidf_df_matrix=pd.DataFrame(X.todense(), index=final_labels, columns=feature_names)
Tfidf_df_matrix.T.to_csv('vocab_matrix_tfidf.csv')

Tfidf_df_matrix_topVocab=Tfidf_df_matrix[top_vocab]
Tfidf_df_matrix_topVocab.iloc[:10,:10]
Tfidf_df_matrix_topVocab.to_csv('Tfidf_df_matrix_topVocab.csv')

(546, 2901)


# Analysis 2: Test Train Split & Modeling

In [13]:
#extract data from overall corpus and split into train test for modelling

from sklearn.model_selection import train_test_split

doc_class_df=final_corpus_df[['document','class']]
final_texts=doc_class_df.values.tolist()

final_corpus_dict={}
final_labels_dict={}
final_analysis_dict={}

for k,v in corpus_dict_sent.items():
    for item in final_texts:
        if k == item[0]:
            final_corpus_dict[k]=v
            final_labels_dict[k]=item[1]
            final_analysis_dict[v]=item[1]

X=list(final_analysis_dict.keys())
y=list(final_analysis_dict.values())
print('items in X set: %d and items in y set: %d' %(len(X), len(y)))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=100, random_state=42)


items in X set: 546 and items in y set: 546


In [14]:
#Analysis 1: Judgment

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#Analyst Dataframe
analyst_vectorizer=CountVectorizer(vocabulary=top_vocab)
X_train_analyst=analyst_vectorizer.fit_transform(X_train)
X_train_analyst_df=pd.DataFrame(X_train_analyst.todense(), columns=analyst_vectorizer.get_feature_names(), index=y_train)
X_test_analyst=analyst_vectorizer.fit_transform(X_test)
X_train_analyst_df.head(5)

Unnamed: 0,logic,logical,logics,syllogism,syllogisms,model,mathematics,mathematical,number,set,...,representation,representational,representations,experience,experiences,ethics,ethical,moral,morality,religion
mathematics,0,0,0,0,0,0,0,1,2,6,...,0,0,0,0,0,0,0,0,0,0
mental,0,1,0,0,0,1,0,0,3,0,...,17,1,28,0,0,0,0,2,0,0
mathematics,2,0,0,0,0,0,1,0,0,16,...,0,0,0,0,0,0,0,0,0,0
mind,1,0,0,0,0,0,1,0,0,0,...,0,0,0,2,1,0,0,0,0,0
ontology,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
#Analysis 2: Tf-Idf Dataframe

#tfidf_vectorizer=TfidfVectorizer(max_features=1000)
tfidf_vectorizer=TfidfVectorizer(max_features=3000)
X_train_tfidf=tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf=tfidf_vectorizer.fit_transform(X_test)

X_train_tfidf_df=pd.DataFrame(X_train_tfidf.todense(), columns=tfidf_vectorizer.get_feature_names(), index=y_train)
X_train_tfidf_df.head(5)

Unnamed: 0,abandoned,abelard,abilities,ability,able,absence,absolute,absolutely,abstract,abstraction,...,xyz,year,years,yellow,yet,yield,yields,zalta,zero,zfc
mathematics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mental,0.016249,0.0,0.218344,0.026257,0.010218,0.0,0.0,0.0,0.087336,0.0,...,0.0,0.0,0.0,0.0,0.016893,0.0,0.0,0.018481,0.0,0.0
mathematics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mind,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.016821,0.0,0.037315,0.0,0.0,0.0,0.0,0.0
ontology,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.354342,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Analysis 3:  NN Embeddings (Doc2Vec)

In [16]:
#Analysis 3: Doc2Vec

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile

def tokenize_docs(X):
    word_tokens=[]
    for doc in X:
        tokens=doc.split()
        word_tokens.append(tokens)
    return word_tokens
    
X_train_tokens=tokenize_docs(X_train)
X_test_tokens=tokenize_docs(X_test)

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_train_tokens)]
model_50dim = Doc2Vec(documents, vector_size=50, window=4, min_count=2, epochs=50)
model_50dim.train(documents, total_examples = model_50dim.corpus_count, epochs = model_50dim.epochs)

#Vectorize Training Set:
doc2vec_50_vectors = np.zeros((len(X_train_tokens), 50)) 
for i in range(0, len(X_train_tokens)):
    doc2vec_50_vectors[i,] = model_50dim.infer_vector(X_train_tokens[i]).transpose()
print(doc2vec_50_vectors.shape)

#Vectorize Test Set:
doc2vec_50_vectors_test = np.zeros((len(X_test_tokens), 50))
for i in range(0, len(X_test_tokens)):
    doc2vec_50_vectors_test[i,] = model_50dim.infer_vector(X_test_tokens[i]).transpose()
print(doc2vec_50_vectors_test.shape)

(446, 50)
(100, 50)


# Modelling

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics


count_clf = RandomForestClassifier(n_estimators = 100, max_depth = 10, random_state = 42)
count_clf.fit(X_train_analyst, y_train)
count_pred = count_clf.predict(X_test_analyst)
print('\nCount/Random forest F1 classification performance in test set:',
    round(metrics.f1_score(y_test, count_pred, average='macro'), 3))

#Tf-Idf
tfidf_clf = RandomForestClassifier(n_estimators = 100, max_depth = 10, random_state = 42)
tfidf_clf.fit(X_train_tfidf, y_train)
tfidf_pred = tfidf_clf.predict(X_test_tfidf)
print('\nTF-IDF/Random forest F1 classification performance in test set:',
    round(metrics.f1_score(y_test, tfidf_pred, average='macro'), 3))

#Doc2Vec
doc2vec_50_clf = RandomForestClassifier(n_estimators = 100, max_depth = 10, random_state = 42)
doc2vec_50_clf.fit(doc2vec_50_vectors, y_train)
doc2vec_50_pred = doc2vec_50_clf.predict(doc2vec_50_vectors_test)
print('\nDoc2Vec_50/Random forest F1 classification performance in test set:',
    round(metrics.f1_score(y_test, doc2vec_50_pred, average='macro'), 3)) 


Count/Random forest F1 classification performance in test set: 0.65

TF-IDF/Random forest F1 classification performance in test set: 0.168

Doc2Vec_50/Random forest F1 classification performance in test set: 0.506
