In [105]:
import math 
from convokit import Corpus, download
import polars as pl
import pandas as pd
import numpy as np
import nltk
from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.sparse import csc_array
from gensim.models.ldamodel import LdaModel
from gensim.matutils import Sparse2Corpus

EXPORT_TABLES = False

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/justin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/justin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/justin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/justin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
corpus = Corpus(filename=download("tennis-corpus"))

Dataset already exists at /home/justin/.convokit/downloads/tennis-corpus


In [4]:
speakers = pl.from_pandas(corpus.get_speakers_dataframe(), include_index=True)
speakers.head()

id,vectors,meta.gender
str,list[null],str
"""REPORTER""",[],
"""Kei Nishikori""",[],"""M"""
"""Sergiy Stakhov…",[],"""M"""
"""Jelena Jankovi…",[],"""F"""
"""Fernando Verda…",[],"""M"""


In [5]:
df = pl.from_pandas(corpus.get_utterances_dataframe())
df = df.with_columns(pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%d"))
df = df.drop("vectors")
df.head()

timestamp,text,speaker,reply_to,conversation_id,meta.is_answer,meta.is_question,meta.pair_idx
datetime[μs],str,str,str,str,bool,bool,str
2008-08-28 00:00:00,"""I think this i…","""REPORTER""",,"""1681_0.q""",False,True,"""1681_0"""
2008-08-28 00:00:00,"""Yeah.""","""Kei Nishikori""","""1681_0.q""","""1681_0.q""",True,False,"""1681_0"""
2008-08-28 00:00:00,"""How would you …","""REPORTER""",,"""1681_1.q""",False,True,"""1681_1"""
2008-08-28 00:00:00,"""Yeah, I'm pret…","""Kei Nishikori""","""1681_1.q""","""1681_1.q""",True,False,"""1681_1"""
2008-08-28 00:00:00,"""Do you know wh…","""REPORTER""",,"""1681_2.q""",False,True,"""1681_2"""


In [6]:
df = df.with_columns(pl.col("meta.pair_idx").str.split("_").list.first().str.parse_int(radix=10).alias("conversation"))
NUM_DOCS = 1 + df.get_column("conversation").max()
NUM_DOCS

6467

In [7]:
match = pl.from_pandas(corpus.get_conversations_dataframe())
match = match.drop("vectors")
match.head()

meta.match_id,meta.opponent,meta.result,meta.stage,meta.tournament,meta.tournament_type,meta.player_ranking
i64,str,i64,str,str,str,i64
1681,,1,"""2nd Round""","""U.S. OPEN""","""Grand Slam""",126
1681,,1,"""2nd Round""","""U.S. OPEN""","""Grand Slam""",126
1681,,1,"""2nd Round""","""U.S. OPEN""","""Grand Slam""",126
1681,,1,"""2nd Round""","""U.S. OPEN""","""Grand Slam""",126
1681,,1,"""2nd Round""","""U.S. OPEN""","""Grand Slam""",126


In [8]:
match = match.unique("meta.match_id")

In [9]:
match.get_column("meta.opponent").value_counts(sort=True).head(9)

meta.opponent,counts
str,u32
,940
"""Novak Djokovic…",178
"""Roger Federer""",162
"""Rafael Nadal""",157
"""Andy Murray""",153
"""Serena William…",124
"""Caroline Wozni…",95
"""Agnieszka Radw…",87
"""Tomas Berdych""",86


In [10]:
win_array = np.zeros(NUM_DOCS, dtype=bool)
for row in match.select(["meta.match_id", "meta.result"]).rows():
    match_id = row[0]
    result = row[1]
    win_array[match_id] = result
win_array

array([ True, False,  True, ...,  True,  True,  True])

In [74]:
NUM_DOCS

6467

In [12]:
class Indexer:
    def __init__(self):
        self.token_to_idx = {}
        self.idx_to_token = {}

    def to_token(self, x):
        return self.idx_to_token[x]
    
    def to_idx(self, token):
        if token in self.token_to_idx:
            return self.token_to_idx[token]
        else:
            length = len(self.token_to_idx)
            self.idx_to_token[length] = token
            self.token_to_idx[token] = length
            return length
        
    def token_count(self):
        return len(self.token_to_idx)

In [13]:
stops = set(stopwords.words('english'))
wnl = WordNetLemmatizer()
stops.add(",")
stops.add(".")
stops.add("(")
stops.add(")")
stops.add("--")
indexer = Indexer()
row_idx = [] # Document
col_idx = [] # Term
data = [] # Count
# print(len(stops))
for group in df.select(["text", "conversation"]).group_by(by="conversation"):
    conv_id = group[0]
    text = group[1].get_column("text").str.concat("\n")
    tokens = [token.lower() for token in word_tokenize(text[0]) if token not in stops]

    pos = nltk.pos_tag(tokens)
    pos = ['v' if x[1][0] == "V" else 'n' for x in pos]

    tokens = [wnl.lemmatize(token.lower(), pos=p) for token, p in zip(tokens, pos)]
    idxs = [indexer.to_idx(token) for token in tokens]
    counter = Counter(idxs)
    for k, v in counter.items():
        col_idx.append(k)
        data.append(v)
        row_idx.append(conv_id)

term_doc = csc_array((data, (row_idx, col_idx)), shape=(NUM_DOCS, indexer.token_count()))

In [45]:
NUM_TERMS = indexer.token_count()
NUM_TERMS

21682

In [95]:
records = []
for mask, name in zip([win_array, ~win_array], ["Win", "Loss"]):
    counts = term_doc[mask].sum(axis=1)
    num_docs = len(counts)
    mean = counts.mean()
    std = counts.std()
    record = {"Class Name": name, "Document Count": num_docs, "Average Tokens / Doc": mean, "Std Tokens / Doc": std}
    records.append(record)

corpus_info = pd.DataFrame.from_records(records)
if EXPORT_TABLES:
    corpus_info.to_latex("corpus.table", index=False)
corpus_info

  corpus_info.to_latex("corpus.table", index=False)


Unnamed: 0,Class Name,Document Count,Average Tokens / Doc,Std Tokens / Doc
0,Win,4398,693.282401,358.090978
1,Loss,2069,569.201547,320.263525


In [15]:
idx_offset = math.log2(NUM_DOCS)
idf = [idx_offset - math.log2(term_doc.getcol(idx).count_nonzero()) for idx in range(NUM_TERMS)]
idf[0:10]

[0.08074411163312867,
 0.21619652718204385,
 3.226338994659619,
 2.3168062270487386,
 0.6624074063146494,
 0.44425820874554134,
 2.9741322746262515,
 0.08003650348967462,
 0.20947428006229707,
 0.011872256606229925]

In [16]:
data_idf = [x * idf[f] for x, f in zip(data, col_idx)]
term_doc_idf = csc_array((data_idf, (row_idx, col_idx)), shape=(NUM_DOCS, indexer.token_count()))

In [17]:
idf = np.array(idf)
idf[0:10]

array([0.08074411, 0.21619653, 3.22633899, 2.31680623, 0.66240741,
       0.44425821, 2.97413227, 0.0800365 , 0.20947428, 0.01187226])

In [18]:
len(idf)

21682

In [19]:
term_doc.getcol(0)[win_array] * idf[0]

<4398x1 sparse array of type '<class 'numpy.float64'>'
	with 4202 stored elements in Compressed Sparse Column format>

In [20]:
term_doc.data

array([12, 15, 11, ...,  1,  1,  1])

In [96]:
idf

array([ 0.08074411,  0.21619653,  3.22633899, ..., 12.6588809 ,
       12.6588809 , 12.6588809 ])

In [44]:
term_doc.shape

(6467, 21682)

In [72]:
def get_log_prob(class_mask, idf=None):
    term_count = []
    total = 0
    if idf is None: 
        idf = np.ones(NUM_TERMS)
    for t_idx in range(NUM_TERMS):
        count = 1 + term_doc.getcol(t_idx)[class_mask].sum()
        count *= idf[t_idx]
        term_count.append(count)
        total += count
    return np.array([math.log2(tc / total) for tc in term_count])

def binary_dirichlet_prior(term_doc, mask):
    win_counts = term_doc[mask].sum(axis=0)
    loss_counts = term_doc[~mask].sum(axis=0)
    background = win_counts + loss_counts
    win_corpus = win_counts.sum()
    loss_corpus = loss_counts.sum()
    background_corpus = win_corpus + loss_corpus
    log_odds_ratio1 = np.log((win_counts + background) / (win_corpus + background_corpus - win_counts - background))
    log_odds_ratio2 = np.log((loss_counts + background) / (loss_corpus + background_corpus - loss_counts - background))
    variance = 1.0 / (win_counts + background) + 1.0 / (loss_counts + background)
    z_score = (log_odds_ratio1 - log_odds_ratio2) / np.sqrt(variance)
    return np.argsort(z_score)

def print_win_loss(sort_indices):
    loss_words = [indexer.to_token(x) for x in sort_indices[0:10]]
    print("Loss words:")
    print(loss_words)
    
    win_words = [indexer.to_token(x) for x in sort_indices[-10:]]
    print("Win words:")
    print(win_words)


In [97]:
# Raw counts
win_prob = get_log_prob(win_array, idf=None)
loss_prob = get_log_prob(~win_array, idf=None)
sort_indices = (win_prob - loss_prob).argsort()
print("Count-based Naive Bayes")
print_win_loss(sort_indices)

Count-based Naive Bayes
Loss words:
['qubec', 'latvian', 'anesthesia', 'weil', 'provincial', 'lineman', 'plateau', 'risen', 'umpiring', 'footfault']
Win words:
['overrate', 'suarez', 'lpez', 'nasty', 'gil', 'playstation', 'van', 'antidoping', 'karaoke', 'no.1s']


In [98]:
# Tf*idf counts
win_prob = get_log_prob(win_array, idf=idf)
loss_prob = get_log_prob(~win_array, idf=idf)
sort_indices = (win_prob - loss_prob).argsort()
print("td-idf Naive Bayes")
print_win_loss(sort_indices)

td-idf Naive Bayes
Loss words:
['qubec', 'latvian', 'provincial', 'anesthesia', 'weil', 'lineman', 'plateau', 'zhe', 'umpiring', 'risen']
Win words:
['overrate', 'suarez', 'lpez', 'nasty', 'gil', 'playstation', 'van', 'antidoping', 'karaoke', 'no.1s']


In [71]:
# Log Odds Ratio Informative Dirichlet Prior
print("Words by Log Odds Ratio Informative Dirichlet Prior")
z_score_indices = binary_dirichlet_prior(term_doc, win_array)
print_win_loss(z_score_indices)

Words by Log Odds Ratio Informative Dirichlet Prior
Loss words:
['disappointed', 'disappointing', 'today', 'week', 'chance', "n't", 'could', 'set', 'disappoint', 'loss']
Win words:
['help', 'we', 'victory', 'focus', 'opponent', 'win', 'always', 'happy', 'important', 'tomorrow']


In [26]:
from gensim.corpora import Dictionary

In [27]:
assert(term_doc.shape[0] == NUM_DOCS)
assert(term_doc.shape[1] == NUM_TERMS)
gensim_corpus = Sparse2Corpus(term_doc_idf, documents_columns=False)
dictionary = Dictionary.from_corpus(gensim_corpus, id2word=indexer.idx_to_token)

In [36]:
lda = LdaModel(gensim_corpus, num_topics=4, id2word=indexer.idx_to_token, passes=16)

In [99]:
lda.show_topics()

[(0,
  '0.003*"definitely" + 0.003*"guy" + 0.003*"\'ve" + 0.003*"kind" + 0.002*"american" + 0.002*"pretty" + 0.002*"she" + 0.002*"obviously" + 0.002*"love" + 0.002*"college"'),
 (1,
  '0.005*"she" + 0.003*"serena" + 0.003*"course" + 0.003*"coach" + 0.002*"also" + 0.002*"grand" + 0.002*"slam" + 0.002*"woman" + 0.002*"life" + 0.002*"girl"'),
 (2,
  '0.005*"clay" + 0.004*"roland" + 0.004*"difficult" + 0.003*"garros" + 0.003*"grass" + 0.003*"gon" + 0.003*"important" + 0.003*"na" + 0.003*"season" + 0.003*"true"'),
 (3,
  '0.004*"he" + 0.003*"guy" + 0.003*"obviously" + 0.003*"sort" + 0.003*"serve" + 0.002*"roger" + 0.002*"mean" + 0.002*"\'ve" + 0.002*"return" + 0.002*"rafa"')]

In [38]:
import pyLDAvis
pyLDAvis.enable_notebook()

vis = pyLDAvis.gensim_models.prepare(lda, gensim_corpus, dictionary)
pyLDAvis.save_html(vis, 'lda.html')

  default_term_info = default_term_info.sort_values(


In [73]:
vis

In [104]:
topic1 = "he, guy, obviously, sort, serve, roger, mean, 've, return, rafa, break, hit"
topic2 = "she, serena, course, coach, also, grand, slam, woman, life, girl, sport, china"
topic3 = "definitely, guy, 've, kind, american, pretty, she, obviously, love, college, get, everyone"
topic4 = "clay, roland, difficult, garros, grass, gon, important, na, season, true, pain, course"
records = []
for name, words in zip(["Men's Tennis", "Women's Tennis", "Personal Background", "Court / Surface"], [topic1, topic2, topic3, topic4]):
    records.append({"Topic": name, "Word List": words})

topics = pd.DataFrame.from_records(records)
if EXPORT_TABLES:
    with pd.option_context("max_colwidth", 1000):
        topics.to_latex("out2.tex", index=False)
topics

  topics.to_latex("out2.tex", index=False)


Unnamed: 0,Topic,Word List
0,Men's Tennis,"he, guy, obviously, sort, serve, roger, mean, ..."
1,Women's Tennis,"she, serena, course, coach, also, grand, slam,..."
2,Personal Background,"definitely, guy, 've, kind, american, pretty, ..."
3,Court / Surface,"clay, roland, difficult, garros, grass, gon, i..."
