In [None]:
from abstract_model import AbstractModel
import re
import string
import pandas as pd
import numpy as np

from nltk import FreqDist
from nltk.stem import WordNetLemmatizer

In [None]:
### Preprocessing: accept string text, return a list of lemmatized words
def _preprocess(text):
    """Normalize raw text and return its words as a list of lemmas.

    The text is lowercased, non-ASCII characters are dropped, punctuation is
    removed, the result is split on whitespace and every token is lemmatized
    as a verb (``pos="v"``).

    Args:
        text: Raw input string.

    Returns:
        List of lemmatized tokens in document order. Empty tokens are never
        produced, so whitespace-only input yields an empty list.
    """
    # Turn every whitespace run (including non-ASCII whitespace) into a single
    # space *before* the ASCII filter below, so such characters still separate
    # words instead of being silently dropped.
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    text = text.encode('ascii', 'ignore').decode()
    text_clean = ''.join(c for c in text if c not in string.punctuation)

    # split() without an argument discards empty strings. split(' ') would
    # emit '' for leading/trailing/repeated spaces, and those empty tokens
    # would then be counted as a word and inflate the token total.
    li = text_clean.split()
    lemm = WordNetLemmatizer()
    wdlist = [lemm.lemmatize(wd, pos="v") for wd in li]

    return wdlist

In [None]:
### Build the normalized word-frequency profile of one text as a one-row DataFrame
def _makeProfile(self, auth, text):
    """Return the relative frequencies of the most common words in ``text``.

    Args:
        self: Object providing ``topN``, the number of most frequent words kept.
        auth: Label used as the row name of the returned frame.
        text: Raw text to profile; it is tokenized with ``_preprocess``.

    Returns:
        A DataFrame with a single row labelled ``auth`` and one column per
        retained word. Each value is that word's count divided by the total
        number of tokens in ``text``.
    """
    tokens = _preprocess(text)
    token_total = len(tokens)

    # Raw counts of the self.topN most frequent tokens, keyed by token.
    top_counts = dict(FreqDist(tokens).most_common(self.topN))

    # Words become the index and `auth` the only column ...
    counts_by_word = pd.DataFrame.from_dict(top_counts, orient='index', columns=[auth])
    relative_freqs = counts_by_word / token_total  # normalize

    # ... then flip so the profile is a row and the words are columns.
    return relative_freqs.transpose()

In [None]:
### Convert text to features
def _makeFeatureMatrix(self, training_data):
    """Stack the per-author profiles into one feature matrix.

    Args:
        self: Object forwarded to ``_makeProfile`` (which reads ``self.topN``).
        training_data: Mapping of author label -> text for that author.

    Returns:
        DataFrame with one row per author and one column per word that occurs
        in at least one author's profile. Words absent from an author's
        profile are filled with 0. An empty mapping yields an empty DataFrame.
    """
    profiles = [_makeProfile(self, auth, training_data[auth]) for auth in training_data]

    # pd.concat raises on an empty list, so keep the "no authors" case explicit.
    if not profiles:
        return pd.DataFrame()

    # One concat instead of growing the frame inside the loop:
    # DataFrame.append no longer exists in pandas >= 2.0 and copied the whole
    # frame on every iteration.
    featdf = pd.concat(profiles)

    return featdf.fillna(0)

In [None]:
## Measures the similarity between two profiles by the angle formed between them ##
def _cosSimilarity(p1, p2):
    return (p1 @ p2) / (np.linalg.norm(p1) * np.linalg.norm(p2))

In [None]:
def train(self, training_data, topN=100):
    """Fit the model: build and store one word-frequency profile per author.

    Args:
        training_data: Mapping of author label -> iterable of text strings.
        topN: Number of most frequent words kept per author profile; stored
            on the instance as ``self.topN``.

    Side effects:
        Sets ``self.topN`` and ``self.profiledf`` (the feature matrix returned
        by ``_makeFeatureMatrix``).
    """
    # TODO (carried over): convert the feature matrix to the desired data
    # structure of output profiles; for now it is stored as returned.
    self.topN = topN

    # Merge each author's documents into one corpus string, every document
    # followed by a single space.
    traindict = {
        auth: ''.join(text + ' ' for text in training_data[auth])
        for auth in training_data
    }

    self.profiledf = _makeFeatureMatrix(self, traindict)

In [None]:
def identify(self, text):
    """Rank the trained authors by how closely their profile matches ``text``.

    Args:
        text: Raw text whose author should be identified.

    Returns:
        List of ``(score, author)`` tuples sorted by descending score. Scores
        are the cosine similarities normalized to sum to 1. If no author has
        a positive similarity, every score is 0.0.
    """
    testvec = _createTest(self, text)
    # _createTest yields the test profile as a Series; also tolerate a one-row
    # DataFrame. Taking .iloc[0] of a Series would pick out a single scalar.
    if isinstance(testvec, pd.DataFrame):
        testvec = testvec.iloc[0]

    authors = self.profiledf.index.tolist()

    clist = []
    for i in range(len(authors)):
        sim = _cosSimilarity(self.profiledf.iloc[i], testvec)
        # The cosine is undefined (NaN) for an all-zero vector; count it as
        # "no similarity" so it cannot poison the normalization below.
        clist.append(sim if np.isfinite(sim) else 0.0)

    # Computed once; used as the normalizer for every author.
    total = sum(clist)
    if total == 0:
        return sorted(((0.0, auth) for auth in authors), reverse=True)

    return sorted(((sim / total, auth) for sim, auth in zip(clist, authors)), reverse=True)
    

In [None]:
def _createTest(self, text):
    """Build the profile of ``text`` over the trained vocabulary.

    Args:
        self: Object providing ``profiledf`` (the trained feature matrix) and
            the state needed by ``_makeProfile``.
        text: Raw text to profile.

    Returns:
        Series named ``'Test'`` whose index is exactly the columns of
        ``self.profiledf``. Words of the trained vocabulary that are missing
        from the text's profile are 0; words of the text that are not in the
        trained vocabulary are dropped.
    """
    pr = _makeProfile(self, 'Test', text)

    # Align the single-row profile to the training columns directly.
    # This replaces DataFrame.append (removed in pandas 2.0) and avoids
    # copying the whole training matrix just to read back its last row.
    testdf = pr.reindex(columns=self.profiledf.columns).fillna(0)

    return testdf.iloc[-1]

In [1]:
from word_frequency_model import BOW


In [2]:
### Read training data: one list of raw document strings per author

# NOTE: absolute, machine-specific corpus location; adjust when running elsewhere.
path = r'C:\Users\shala\classify3\ling-227-final-project\texts'
# Training files per author, given relative to `path`.
filedict = {'Alexander Pope':[r'\Pope\Pope_train1.txt', r'\Pope\Pope_train2.txt'],
            'John Dryden': [r'\Dryden\Dryden_train1.txt', r'\Dryden\Dryden_train2.txt'],
            'George Chapman':[r'\Chapman\Chapman_train1.txt', r'\Chapman\Chapman_train2.txt',
                             r'\Chapman\Chapman_train3.txt', r'\Chapman\Chapman_train4.txt']}

# Author keys are derived from filedict so the two dicts cannot drift apart.
training_data = {auth: [] for auth in filedict}

for auth, flist in filedict.items():
    for fi in flist:
        txtfile = path+fi
        with open(txtfile, 'r', encoding='utf-8') as f:
            training_data[auth].append(f.read())

In [9]:
# Build the model and fit it on the per-author training texts loaded above.
# NOTE(review): BOW comes from word_frequency_model, so the meaning of the 350
# is not visible here — presumably the number of top words to keep; confirm.
model = BOW(350)

model.train(training_data) 
# Last expression of the cell: display the trained per-author profile table.
model.profiledf

Unnamed: 0,the,and,to,a,of,in,be,with,his,all,...,hell,humour,strange,bosom,frame,virgins,naught,lifes,drink,bed
Alexander Pope,0.051625,0.040448,0.022627,0.018045,0.016595,0.016189,0.015618,0.010473,0.009801,0.008733,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
John Dryden,0.039935,0.027459,0.020447,0.012205,0.015005,0.011548,0.018681,0.00788,0.011186,0.005419,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
George Chapman,0.038489,0.039722,0.023062,0.011769,0.021471,0.020596,0.018529,0.013757,0.018648,0.011928,...,0.000437,0.000437,0.000437,0.000437,0.000437,0.000437,0.000437,0.000398,0.000398,0.000398


In [None]:
# Number of rows in the trained profile table (one row per author in the table shown above).
len(model.profiledf)

In [4]:
# Held-out test files per author, given relative to `path` (the corpus root
# defined in the training-data cell).
testdict = {'Alexander Pope':[r'\Pope\Pope_iliad.txt'], 
            'John Dryden':[r'\Dryden\Dryden_iliad_1_p6.txt'],
            #'George Chapman':[r'\Chapman\Chapman_iliad.txt']
           }


# Replace each author's file list with the text of those files. All files are
# kept and joined with a space, so listing several files for one author does
# not silently discard everything but the last one.
for auth in testdict:
    contents = []
    for fi in testdict[auth]:
        txtfile = path+fi
        with open(txtfile, 'r', encoding='utf-8') as f:
            contents.append(f.read())
    testdict[auth] = ' '.join(contents)

In [None]:
# Scratch check: build the test profile for one held-out text.
pr = model.profiledf
# Define `text` here so the cell does not rely on a later cell having been run
# first (it was previously used before being assigned anywhere above).
text = testdict['Alexander Pope']
testser = model._createTest(text)

In [None]:
# Scratch check: similarity between the first trained profile row and the test
# profile. Relies on `pr` and `testser` assigned in the preceding cell.
p1 = pr.iloc[0]
p2 = testser
# NOTE(review): called through the model instance; whether BOW exposes
# _cosSimilarity as a static method is not visible here — confirm.
model._cosSimilarity(p1, p2)

In [10]:
# Rank the trained authors for the held-out Alexander Pope text; the cell's
# output is the list of (score, author) pairs shown below.
text = testdict['Alexander Pope']
model.identify(text)

[(0.4774726833614673, 'Alexander Pope'),
 (0.4032789602461022, 'George Chapman'),
 (0.11924835639243044, 'John Dryden')]