# Visualization notebook

This notebook is meant for visualizing stuff and testing code. 

In [1]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import json
from nltk import Tree
from spacy import displacy
import spacy
from more_itertools import chunked
from dataclasses import dataclass
from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter
from featurizers import GrammarVectorizer, make_document
from typing import List, Dict, Tuple, Set
warnings.filterwarnings("ignore", category=FutureWarning)

# PAN 2022 Summary Stats & Visualizations


In [2]:
@dataclass
class Author:
    """
    Stores author information in an easy to work with format
    
    :param author_id: unique author id
    :param fixed_texts: list of author documents with regex fixes
    :param raw_texts: list of author documents without regex fixes
    :param discourse_types: list of discourse types
    
    Note: fixed_docs, raw_docs, and discourse_types are all 1 - 1 corresponding
    """
    author_id:str
    fixed_texts:list[str]
    raw_texts:list[str]
    discourse_types:list[str]
    
    def get_token_counts(self) -> list[int]:
        return [len(word_tokenize(author_doc)) for author_doc in self.fixed_texts]
    
    def get_total_docs(self) -> int:
        return len(self.fixed_texts)
    
    def count_dicourse_type(self, dtype:str) -> int:
        return Counter(self.discourse_types)[dtype]
        
def load_json(path:str) -> dict[str, list[dict]]:
    with open(path, "r") as fin:
        data = json.load(fin)
        return data

def extract_from_dict(author_entry:dict, to_extract:str) -> list[str]:
    return [entry[to_extract] for entry in author_entry]
    
def create_author_list(preprocessed_data:dict[str, list[dict]]) -> list[Author]:
    """
    Converts the preprocessed data into a list of Author objects
    """
    authors = []
    for author_id in preprocessed_data.keys():
        author_entry = preprocessed_data[author_id]
        fixed_texts = extract_from_dict(author_entry,"fixed_text")
        raw_texts = extract_from_dict(author_entry,"raw_text")
        discourse_types = extract_from_dict(author_entry,"discourse_type")
            
        authors.append(Author(author_id, fixed_texts, raw_texts, discourse_types))
        
    return authors

def get_doc_token_stats(authors:list[Author]) -> tuple[float, float]:
    """Gets the mean and std of tokens per document"""
    all_doc_token_counts = []
    for author in authors:
        all_doc_token_counts.extend(author.get_token_counts())
    return np.mean(all_doc_token_counts), np.std(all_doc_token_counts)
    
def make_author_df(authors:list[Author]) -> pd.DataFrame:
    
    author_maps = defaultdict(list)
    for author in authors:
        author_maps["author_id"].append(author.author_id)
        author_maps["total_token_count"].append(sum(author.get_token_counts()))
        author_maps["Total docs"].append(author.get_total_docs())
        author_maps["Emails"].append(author.count_dicourse_type("email"))
        author_maps["Memos"].append(author.count_dicourse_type("memo"))
        author_maps["Txt msgs"].append(author.count_dicourse_type("text_message"))
        author_maps["Essays"].append(author.count_dicourse_type("essay"))
        
    return pd.DataFrame(author_maps)

data = load_json("data/pan22/preprocessed/author_doc_mappings.json")
all_authors = create_author_list(data)
df = make_author_df(all_authors)

## Feature testing ground

In [3]:
g2v = GrammarVectorizer()

def get_all_documents(data_path:str, text_type="fixed_text") -> list[str]:
    """Aggregates all documents into one list"""
    all_documents = []
    for author_entries in load_json(data_path).values():
        for entry in author_entries:
            all_documents.append(make_document(entry[text_type], g2v.nlp))
            
    return all_documents

all_documents = get_all_documents("eval/pan22_splits/knn/train.json")

### Syntactic construction featues?

- Look for spacy tree pattern matcher online



In [9]:
sentence = "Apples that fall from trees taste good"
sentence = "Apples that John likes are tasty"
nlp = g2v.nlp
doc = nlp(sentence)

def _to_nltk_tree(node):
    
    _tok_format = lambda tok: "_".join([tok.orth_, tok.pos_,tok.dep_])
    if node.n_lefts + node.n_rights > 0:
        return Tree(_tok_format(node), [_to_nltk_tree(child) for child in node.children])
    else:
        return _tok_format(node)

def get_nltk_tree(doc):
    #https://stackoverflow.com/questions/36610179/how-to-get-the-dependency-tree-with-spacy
    return [_to_nltk_tree(sent.root).pretty_print() for sent in doc.sents]

def get_root(doc) -> spacy.tokens.token.Token:
    return [token for token in doc if token.head == token][0]

def get_subject(root) -> spacy.tokens.token.Token:
    return list(root.lefts)[0]


# root = get_root(doc)
# subject = get_subject(root)

# for descendant in subject.subtree:

#     print(descendant.text, descendant.dep_, descendant.n_lefts,
#             descendant.n_rights,
#             [ancestor.text for ancestor in descendant.ancestors])

get_nltk_tree(doc)



                 are_AUX_ROOT                                   
        ______________|_______________                           
       |                       Apples_NOUN_nsub                 
       |                              j                         
       |                              |                          
       |                       likes_VERB_relcl                 
       |               _______________|________________          
tasty_ADJ_acomp that_PRON_dobj                  John_PROPN_nsubj



[None]

In [5]:
from spacy.matcher import DependencyMatcher
from spacy.pipeline import merge_entities

dobj_relcl = []




### Vocabulary richness vector?

- hapaxes
- \# of mispelled words?
- 

## Visuals

In [None]:
from data.scripts.pan_create_bins import get_train_authors_sorted_by_docfreq

def bin_authors(iterable) -> tuple[list[str], ...]:
    return tuple(chunked(iterable, 7)) 

def make_doc_avg_labels(sorted_dict):
       
       labels = []
       for bin in bin_authors(list(sorted_dict.values())):
              labels.append(round(np.mean(bin), 2))
       return labels

train_path = "eval/pan22_splits/knn/train.json"
train_authors_sorted = get_train_authors_sorted_by_docfreq(train_path)
labels = make_doc_avg_labels(train_authors_sorted)

# k = 6

r_at_1 = np.array([0.02857142857,0.2285714286,0.1428571429,0.1428571429,0.2285714286,0.1142857143,0.2285714286,0.5142857143])

r_at_8 = np.array([0.2571428571,0.5142857143,0.5142857143,0.5142857143,0.7142857143,0.6857142857,0.7142857143,0.7428571429,])


df = pd.DataFrame({
       "Bin labels": labels,
       "R@1": r_at_1,
       "R@8": r_at_8
})

sns.set_style("darkgrid")


sns.lineplot(data=df, x="Bin labels", y="R@1",color="blue",marker="o", label="R@1")
sns.lineplot(data=df, x="Bin labels", y="R@8",color="green",marker="o", label="R@8")
plt.xlabel("Avg document count")
plt.ylabel("Score")
plt.title("Binned author scores")
plt.legend()

plt.show()

In [None]:

sns.set_style("darkgrid")
df["Total docs"].hist(bins=7)
plt.title("Document counts per author")
plt.xlabel("# of documents")
plt.ylabel("# of authors")
plt.show()

The following cell contains deprecated information

In [None]:
ALL_FEATS_ACCS = [0.0, 0.05714285714285714, 0.11428571428571428, 0.17142857142857143, 0.2, 0.2, 0.2857142857142857, 0.5428571428571428]
HALF_FEATS_ACCS = [0.0, 0.02857142857142857, 0.08571428571428572, 0.11428571428571428, 0.22857142857142856, 0.22857142857142856, 0.22857142857142856, 0.4]

old_df = pd.DataFrame(
    {"Full features": ALL_FEATS_ACCS,
     "Half features": HALF_FEATS_ACCS,
     "Bin labels":labels}
)


sns.lineplot(data=old_df, x="Bin labels", y="Full features",color="blue", label="Full features")
sns.lineplot(data=old_df, x="Bin labels", y="Half features", color="red", label="Half features")
plt.xlabel("Avg document count")
plt.ylabel("R@1 score")
plt.title("R@1 Development bin scores")
plt.legend()

plt.show()

# PAN 2022 Discourse related stuff

In [None]:
def load_all_discourse_types(path:str) -> tuple[list,list,list]:
    """Loads the preprocessed data and sorts it by discourse type"""
    preprocessed = load_json(path)
    author_ids = preprocessed.keys()
    emails = []
    memos = []
    txt_msgs = []
    essays = []
    for author_id in author_ids:
        for author_entry in preprocessed[author_id]:
            dtype = author_entry["discourse_type"]
            fixed = author_entry["fixed_text"].split()
            
            if  dtype == "email":
                emails.append(fixed)
                
            if  dtype == "memo":
                memos.append(fixed)
                
            if  dtype == "text_message":
                txt_msgs.append(fixed)
                
            if  dtype == "essay":
                essays.append(fixed)
    return emails, memos, txt_msgs, essays
  
def get_avg_tokens(dtype:list[list[str]]) -> int:
    
    token_counts = []
    for tokens in dtype:
        token_counts.append(len(tokens))
    return np.mean(token_counts)
              
            
emails, memos, txt_msgs, essays = load_all_discourse_types("data/pan22/preprocessed/preprocessed_data.json")


print(get_avg_tokens(emails))
print(get_avg_tokens(memos))
print(get_avg_tokens(txt_msgs))
print(get_avg_tokens(essays))
    

In [None]:
sns.set_style("darkgrid")

dtype_df = df[["Emails", "Txt msgs", "Essays", "Memos"]].sum()
dtype_df.plot.bar(color=["teal", "lightpink", "orange", "brown"])
plt.title("Discourse type counts")
plt.ylabel("Count")
plt.xticks(rotation=45)

plt.show()

### Blogs testing

In [2]:
blogs_preprocessed = pd.read_csv("data/blogs/preprocessed/blogs_preprocessed.csv")

In [11]:
from sklearn.model_selection import train_test_split


def select_from_threshold(series:pd.Series, threshold:int) -> Set[int]:
    """Takes a pandas Series and selects indices that meet a given threshold"""
    return set(series[series > threshold].index.to_list())

def extract_blog_authors(blogs_df:pd.DataFrame, avg_tok_threshold:int, doc_threshold:int) -> pd.DataFrame:
    """Selects authors from the blogs df that meet an avg token count threshold and doc frequency threshold"""
    
    author_avg_tkns = blogs_df.groupby("id")["tkn_count"].mean()
    author_doc_cnts= blogs_df.groupby("id")["id"].count()

    selected_avg_tkn_authors = select_from_threshold(author_avg_tkns, avg_tok_threshold)
    selected_doc_cnt_authors = select_from_threshold(author_doc_cnts, doc_threshold)
    selected_ids = selected_avg_tkn_authors.intersection(selected_doc_cnt_authors)
    
    return blogs_df.loc[blogs_df["id"].isin(selected_ids)]

    
df_sample = extract_blog_authors(blogs_df = blogs_preprocessed, 
                                 avg_tok_threshold = 350, 
                                 doc_threshold = 200)
df_sample = df_sample.rename(columns={"id":"author_id"})



#df_sample.groupby('author_id', group_keys=False).apply(lambda x: x.sample(min(len(x), 2)))




Unnamed: 0,author_id,text,tkn_count
526798,76211,M.ardi G.ras Update + Summer Don't...,397
526703,76211,Happy Anniversary to Me Today ma...,110
642373,122217,Her name is Sugar. The urlLink 4yo Rot...,394
642426,122217,Wednesday night was an orgy of caloric ...,238
37184,152151,yay heading up to berkeley this aft...,225
...,...,...,...
583186,3333298,This post also features news ...,458
96357,3344990,You have got to love training. ...,609
96331,3344990,"First there was Will & Grace , n...",957
170032,3456542,"So, Whoopi Goldberg has been let go by ...",517
