## NOTE: feature number annotation can be found in pos.p

In [3]:
import sys
import pandas as pd
import numpy as np
import nltk

## Read each dataset and store it in a DataFrame

In [4]:
# Load the raw text of each corpus. The encoding is passed explicitly so the
# result does not depend on the platform's default locale encoding.
# NOTE(review): assumes the dataset files are UTF-8 (or plain ASCII) -- confirm.
with open("./data/K1_dataset.txt", encoding="utf-8") as f:
    K1_txt = f.read()

with open("./data/K2_dataset.txt", encoding="utf-8") as f:
    K2_txt = f.read()

with open("./data/Q_dataset.txt", encoding="utf-8") as f:
    Q_txt = f.read()

# One row per document: a short label in 'author' and the full text in 'message'.
df = pd.DataFrame(
    [
        ["K1", K1_txt],
        ["K2", K2_txt],
        ["Q", Q_txt],
    ],
    columns=['author', 'message'],
)

In [5]:
# Show the assembled frame (one row per document: K1, K2, Q).
df

Unnamed: 0,author,message
0,K1,Download\n\nSource\n\nPDF\nActions\n Copy Pr...
1,K2,\n\nWith the rapid growth of the information c...
2,Q,"\n\nHowever, there are frequent situations whe..."


## Create a list of tuples of (word, POS)

In [6]:
def create_word_pos_list(message):
    """Tokenize a document and tag every token with its part of speech.

    Args:
        message: The document text as a single string.

    Returns:
        The output of ``nltk.pos_tag`` applied to the tokens produced by
        ``nltk.word_tokenize``: a list of ``(word, PoS)`` tuples in document
        order.
    """
    tokens = nltk.word_tokenize(message)
    return nltk.pos_tag(tokens)

In [7]:
# Tag every document once and keep the result next to its text.
# Plain column assignment is used instead of df.insert(..., allow_duplicates=True)
# so that re-running this cell overwrites the column rather than silently adding
# a second 'word_pos_list' column (which would make df['word_pos_list'] a DataFrame).
df['word_pos_list'] = [create_word_pos_list(message) for message in df['message']]

In [8]:
# First three (word, PoS) pairs of the document stored at row label 2.
df.loc[2, 'word_pos_list'][:3]

[('However', 'RB'), (',', ','), ('there', 'EX')]

In [9]:
# ANSI escape sequences used to colour printed text.
RED = '\033[31m'
GREEN = '\033[32m'
END = '\033[0m'  # switches the colour off again

# Quick visual check that the escape codes render.
print(f"{GREEN}サンプル文字列{END}")


[32mサンプル文字列[0m


In [10]:
"In".casefold()

'in'

In [11]:
def search(query_string, documents=None):
    """Print every occurrence of a word/POS query together with its context.

    The query is split on whitespace. Its first token must equal a document
    word (compared case-insensitively). Every following query token matches the
    next document token when it equals that token's word (case-insensitively)
    or is exactly that token's POS tag, e.g. ``"how JJ NNS"``.

    For each hit one line is printed: a label for the document (``K1``, ``K2``
    or ``Q`` for the first three documents), then the tokens from six positions
    before to six positions after the first matched token, with the matched
    tokens wrapped in ``GREEN`` ... ``END``. Window positions that fall outside
    the document are printed as four spaces.

    Args:
        query_string: Whitespace-separated query; an empty query matches nothing.
        documents: Optional iterable of documents, each a list of
            ``(word, pos)`` tuples. When omitted, the module-level
            ``df['word_pos_list']`` is searched.

    Returns:
        None. All results are written to standard output.
    """
    query = query_string.split()
    print("query : ", query)
    if not query:
        return  # nothing to look for

    if documents is None:
        documents = df['word_pos_list']

    labels = ("K1:  ", "K2:  ", "Q :  ")
    head = query[0].casefold()
    tail = query[1:]
    span = len(query)

    for docId, document in enumerate(documents):
        for idx, (word, pos) in enumerate(document):
            # Case-insensitive comparison on the first query word.
            if word.casefold() != head:
                continue

            # The whole query has to fit inside the document; a query that
            # runs past the last token is not a match.
            if idx + span > len(document):
                continue

            # Each remaining query token must match the word or the POS tag
            # of the corresponding following document token.
            following = document[idx + 1: idx + span]
            if not all(
                expected.casefold() == next_word.casefold() or expected == next_pos
                for expected, (next_word, next_pos) in zip(tail, following)
            ):
                continue

            if docId < len(labels):
                print(labels[docId], end="")

            # Print six tokens of context on each side of the hit position.
            for j in range(idx - 6, idx + 7):
                if 0 <= j < len(document):
                    if idx <= j < idx + span:  # within the queried words
                        print(GREEN + document[j][0] + END, end=" ")
                    else:
                        print(document[j][0], end=" ")
                else:
                    print("    ", end="")

            print()


In [24]:
# Plain two-word phrase query (both tokens are matched as words).
search("appropriate conferences")

query :  ['appropriate', 'conferences']
Q :  authors could consider a range of [32mappropriate[0m [32mconferences[0m , while the exact venue 
Q :  authors could consider a range of [32mappropriate[0m [32mconferences[0m , while the exact venue 


In [13]:
# A word followed by two POS tags: "how", then a JJ token, then an NNS token.
search("how JJ NNS")

query :  ['how', 'JJ', 'NNS']
K1:  . Our idea is to investigate [32mhow[0m [32mcomputational[0m [32mmodels[0m can enhance musicology research 


In [14]:
# A word followed by one POS tag: "services", then a token tagged NN.
search("services NN")

query :  ['services', 'NN']
Q :  { hasan2013understanding } . Such proactive [32mservices[0m [32mhelp[0m to overcome the limitations of 
Q :  23 ] . Such proac- tive [32mservices[0m [32mhelp[0m to overcome the limitations of 


## suspect の前処理
 - 全てのデータセットに現れるPOSパターンのリストを作成
 - 同時にそれぞれのデータセットでカウントしていく

In [15]:
WINDOW_SIZE = 2


def count_pos_patterns(documents, width=None):
    """Count "word + following POS tags" patterns in every document.

    A pattern is the word at position ``i`` followed by the POS tags of the
    next ``width - 1`` tokens, joined by single spaces (for ``width == 2``,
    e.g. ``"the NN"``).

    Args:
        documents: Sized iterable of documents, each an indexable sequence of
            ``(word, pos)`` tuples.
        width: Number of tokens covered by one pattern. Defaults to the
            module-level ``WINDOW_SIZE``, read at call time.

    Returns:
        A tuple ``(pos_patterns, pos_vectors)``. ``pos_patterns`` lists every
        distinct pattern in order of first appearance across all documents.
        ``pos_vectors[d][k]`` is the number of times ``pos_patterns[k]`` occurs
        in document ``d``; all vectors have the same length as ``pos_patterns``.

    Raises:
        ValueError: If ``width`` is smaller than 1.
    """
    if width is None:
        width = WINDOW_SIZE
    if width < 1:
        raise ValueError("width must be at least 1")

    pos_patterns = []
    pattern_index = {}  # pattern -> its position in pos_patterns (O(1) lookup)
    pos_vectors = [[] for _ in range(len(documents))]

    for doc_id, document in enumerate(documents):
        for start in range(len(document) - (width - 1)):
            parts = [document[start][0]]  # the word itself
            parts.extend(document[start + offset][1] for offset in range(1, width))  # POS tags
            key_str = " ".join(parts)

            idx = pattern_index.get(key_str)
            if idx is None:
                # First sighting: register the pattern and grow every vector,
                # 1 for the document it appeared in and 0 for all others.
                pattern_index[key_str] = len(pos_patterns)
                pos_patterns.append(key_str)
                for other_id, vector in enumerate(pos_vectors):
                    vector.append(1 if other_id == doc_id else 0)
            else:
                pos_vectors[doc_id][idx] += 1

    return (pos_patterns, pos_vectors)
            

In [16]:
# Build the shared pattern list and one count vector per document, then attach
# each document's vector to its row in the frame.
pos_patterns, pos_vectors = count_pos_patterns(df['word_pos_list'])
df['pos_vec'] = pos_vectors
df

Unnamed: 0,author,message,word_pos_list,pos_vec
0,K1,Download\n\nSource\n\nPDF\nActions\n Copy Pr...,"[(Download, NNP), (Source, NNP), (PDF, NNP), (...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,K2,\n\nWith the rapid growth of the information c...,"[(With, IN), (the, DT), (rapid, JJ), (growth, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Q,"\n\nHowever, there are frequent situations whe...","[(However, RB), (,, ,), (there, EX), (are, VBP...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [17]:
# Look up the most frequent pattern of the document at position 2 (Q).
i = 2
# list.index returns the first position holding the maximum count.
max_idx = pos_vectors[i].index(max(pos_vectors[i]))
pos_patterns[max_idx]

'the NN'

In [18]:
def extract_features(freq_vector, start=0, end=20):
    """Return pattern indices ranked by descending frequency value.

    The distinct values found in ``freq_vector`` are ranked from largest to
    smallest. For every rank from ``start`` (inclusive) to ``end`` (exclusive)
    the index of the first element holding that value is returned. Each
    distinct value contributes exactly one index, so elements that share a
    value are represented only by the earliest of them.

    Args:
        freq_vector: Sequence of comparable, hashable counts.
        start: First rank to include (ranks are zero-based).
        end: Rank at which to stop (not included).

    Returns:
        A list of indices into ``freq_vector``; shorter than ``end - start``
        when there are fewer distinct values than requested.
    """
    # Remember where each distinct value shows up first.
    first_position = {}
    for position, value in enumerate(freq_vector):
        first_position.setdefault(value, position)

    ranked_values = sorted(first_position, reverse=True)

    # Clamp to zero so negative bounds are not read as "from the end".
    lower = max(start, 0)
    upper = max(end, 0)
    return [first_position[value] for value in ranked_values[lower:upper]]

In [19]:
def get_similarity(feature_vector1, feature_vector2):
    """Return how many distinct items appear in both feature vectors."""
    shared = set(feature_vector1).intersection(feature_vector2)
    return len(shared)

In [20]:
# Feature indices for ranks 0-59 of each document's count vector
# (pos_vectors positions 0, 1, 2 correspond to K1, K2, Q).
K1_features = extract_features(pos_vectors[0], 0, 60)
K2_features = extract_features(pos_vectors[1], 0, 60)
Q_features = extract_features(pos_vectors[2], 0, 60)


In [21]:
# Number of feature indices that K1 and Q have in common.
get_similarity(K1_features, Q_features)

10

In [22]:
def predict(Q_df, K_df):
    """Interactively narrow the candidate authors down to a single one.

    Each round extracts the top-``end`` features of Q and of every remaining
    candidate, scores each candidate by the number of features it shares with
    Q, and proposes the lowest-scoring candidate for elimination. The user
    confirms with ``y`` at the prompt; any other answer keeps the candidate.
    The feature window then grows by 20 and the next round starts. The loop
    only ends once a single candidate is left, so it keeps prompting for as
    long as eliminations are declined.

    Args:
        Q_df: Row for the queried document; ``Q_df['pos_vec']`` must be its
            count vector.
        K_df: Frame of candidates with an ``'author'`` column and a
            ``'pos_vec'`` column holding one count vector per author.

    Returns:
        The author label that remains after all eliminations.

    Raises:
        IndexError: If ``K_df`` contains no authors.
    """
    start = 0
    end = 20
    suspected = list(K_df['author'])
    while len(suspected) > 1:
        print("Suspected : ", end="")
        print(set(suspected))
        Q_features = extract_features(Q_df['pos_vec'], start, end)
        similarityWithQ = {}

        # zip pairs every author with that author's own vector. Iterating over
        # the bare tuple (authors, vectors) would instead unpack each column.
        for author, reference_vector in zip(K_df['author'], K_df['pos_vec']):
            if author in suspected:
                feature_vector = extract_features(reference_vector, start, end)
                similarityWithQ[author] = get_similarity(feature_vector, Q_features)

        # Among the authors still suspected, pick the one least similar to Q.
        innocent = min(similarityWithQ, key=similarityWithQ.get)
        if input(f'Do you want to rule out {innocent} in top {end} patterns ? (y/n) ') == 'y':
            suspected.remove(innocent)
        end += 20
    return suspected[0]

In [23]:
# Rows 0-1 are the candidate authors (K1, K2); row 2 is Q.
# .loc slicing is label-based and inclusive, so 0:1 selects both candidate rows.
K_df = df.loc[0:1]
Q_df = df.loc[2]
predict(Q_df, K_df)

Suspected : {'K2', 'K1'}


'K2'