In [112]:
import math 
from convokit import Corpus, download
import polars as pl
import pandas as pd
import numpy as np
import nltk
from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.sparse import csc_array
from gensim.models.ldamodel import LdaModel
from gensim.matutils import Sparse2Corpus

EXPORT_TABLES = False

In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/justin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/justin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/justin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/justin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
corpus = Corpus(filename=download("tennis-corpus"))

Dataset already exists at /home/justin/.convokit/downloads/tennis-corpus


In [83]:
speakers = pl.from_pandas(corpus.get_speakers_dataframe(), include_index=True)
speakers = speakers.drop("vectors")
speakers = speakers.drop_nulls()
genders = {}
for row in speakers.rows():
    genders[row[0]] = row[1]
genders["Roger Federer"]

'M'

In [7]:
df = pl.from_pandas(corpus.get_utterances_dataframe())
df = df.with_columns(pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%d"))
df = df.drop("vectors")
df.head()

timestamp,text,speaker,reply_to,conversation_id,meta.is_answer,meta.is_question,meta.pair_idx
datetime[μs],str,str,str,str,bool,bool,str
2008-08-28 00:00:00,"""I think this i…","""REPORTER""",,"""1681_0.q""",False,True,"""1681_0"""
2008-08-28 00:00:00,"""Yeah.""","""Kei Nishikori""","""1681_0.q""","""1681_0.q""",True,False,"""1681_0"""
2008-08-28 00:00:00,"""How would you …","""REPORTER""",,"""1681_1.q""",False,True,"""1681_1"""
2008-08-28 00:00:00,"""Yeah, I'm pret…","""Kei Nishikori""","""1681_1.q""","""1681_1.q""",True,False,"""1681_1"""
2008-08-28 00:00:00,"""Do you know wh…","""REPORTER""",,"""1681_2.q""",False,True,"""1681_2"""


In [8]:
df = df.with_columns(pl.col("meta.pair_idx").str.split("_").list.first().str.parse_int(radix=10).alias("conversation"))
NUM_DOCS = 1 + df.get_column("conversation").max()
NUM_DOCS

6467

In [9]:
match = pl.from_pandas(corpus.get_conversations_dataframe())
match = match.drop("vectors")
match.head()

meta.match_id,meta.opponent,meta.result,meta.stage,meta.tournament,meta.tournament_type,meta.player_ranking
i64,str,i64,str,str,str,i64
1681,,1,"""2nd Round""","""U.S. OPEN""","""Grand Slam""",126
1681,,1,"""2nd Round""","""U.S. OPEN""","""Grand Slam""",126
1681,,1,"""2nd Round""","""U.S. OPEN""","""Grand Slam""",126
1681,,1,"""2nd Round""","""U.S. OPEN""","""Grand Slam""",126
1681,,1,"""2nd Round""","""U.S. OPEN""","""Grand Slam""",126


In [10]:
match = match.unique("meta.match_id")

In [11]:
match.get_column("meta.opponent").value_counts(sort=True).head(9)

meta.opponent,counts
str,u32
,940
"""Novak Djokovic…",178
"""Roger Federer""",162
"""Rafael Nadal""",157
"""Andy Murray""",153
"""Serena William…",124
"""Caroline Wozni…",95
"""Agnieszka Radw…",87
"""Tomas Berdych""",86


In [71]:
match.head()

meta.match_id,meta.opponent,meta.result,meta.stage,meta.tournament,meta.tournament_type,meta.player_ranking
i64,str,i64,str,str,str,i64
4432,"""Na Li""",0,"""Round Robin""","""TEB BNP PARIBA…","""SEC""",5
1440,"""Nicolas Mahut""",1,"""3rd Round""","""PACIFIC LIFE O…","""Masters""",1
2048,"""Svetlana Kuzne…",0,"""Semifinals""","""INTERNAZIONALI…","""Premier""",9
4512,,1,"""1st Round""","""U.S. OPEN""","""Grand Slam""",22
0,"""Kevin Anderson…",1,"""The Final""","""AEGON CHAMPION…","""ATP500""",3


In [12]:
win_array = np.zeros(NUM_DOCS, dtype=bool)
for row in match.select(["meta.match_id", "meta.result"]).rows():
    match_id = row[0]
    result = row[1]
    win_array[match_id] = result
win_array

array([ True, False,  True, ...,  True,  True,  True])

In [13]:
NUM_DOCS

6467

In [14]:
class Indexer:
    def __init__(self):
        self.token_to_idx = {}
        self.idx_to_token = {}

    def to_token(self, x):
        return self.idx_to_token[x]
    
    def to_idx(self, token):
        if token in self.token_to_idx:
            return self.token_to_idx[token]
        else:
            length = len(self.token_to_idx)
            self.idx_to_token[length] = token
            self.token_to_idx[token] = length
            return length
        
    def token_count(self):
        return len(self.token_to_idx)

In [15]:
stops = set(stopwords.words('english'))
wnl = WordNetLemmatizer()
stops.add(",")
stops.add(".")
stops.add("(")
stops.add(")")
stops.add("--")
indexer = Indexer()
row_idx = [] # Document
col_idx = [] # Term
data = [] # Count
# print(len(stops))
for group in df.select(["text", "conversation"]).group_by(by="conversation"):
    conv_id = group[0]
    text = group[1].get_column("text").str.concat("\n")
    tokens = [token.lower() for token in word_tokenize(text[0]) if token not in stops]

    pos = nltk.pos_tag(tokens)
    pos = ['v' if x[1][0] == "V" else 'n' for x in pos]

    tokens = [wnl.lemmatize(token.lower(), pos=p) for token, p in zip(tokens, pos)]
    idxs = [indexer.to_idx(token) for token in tokens]
    counter = Counter(idxs)
    for k, v in counter.items():
        col_idx.append(k)
        data.append(v)
        row_idx.append(conv_id)

term_doc = csc_array((data, (row_idx, col_idx)), shape=(NUM_DOCS, indexer.token_count()))

In [98]:
male_array = np.zeros(NUM_DOCS, dtype=bool)
for group in df.select(["speaker", "conversation"]).group_by(by="conversation"):
    conv_id = group[0]
    speaker = group[1].filter(pl.col("speaker") != "REPORTER").get_column("speaker")
    gender = genders[speaker[0]]
    if gender == "M":
        male_array[conv_id] = True

In [16]:
NUM_TERMS = indexer.token_count()
NUM_TERMS

21682

In [17]:
records = []
for mask, name in zip([win_array, ~win_array], ["Win", "Loss"]):
    counts = term_doc[mask].sum(axis=1)
    num_docs = len(counts)
    mean = counts.mean()
    std = counts.std()
    record = {"Class Name": name, "Document Count": num_docs, "Average Tokens / Doc": mean, "Std Tokens / Doc": std}
    records.append(record)

corpus_info = pd.DataFrame.from_records(records)
if EXPORT_TABLES:
    corpus_info.to_latex("corpus.table", index=False)
corpus_info

Unnamed: 0,Class Name,Document Count,Average Tokens / Doc,Std Tokens / Doc
0,Win,4398,693.282401,358.090978
1,Loss,2069,569.201547,320.263525


In [18]:
idx_offset = math.log2(NUM_DOCS)
idf = [idx_offset - math.log2(term_doc.getcol(idx).count_nonzero()) for idx in range(NUM_TERMS)]
idf[0:10]

[0.08074411163312867,
 0.3482681133883485,
 0.3591004921893379,
 3.532176422204687,
 0.38683437065794024,
 3.430062204551996,
 2.963652603552127,
 3.600889172288701,
 2.270863609702742,
 0.8906965702709506]

In [19]:
data_idf = [x * idf[f] for x, f in zip(data, col_idx)]
term_doc_idf = csc_array((data_idf, (row_idx, col_idx)), shape=(NUM_DOCS, indexer.token_count()))

In [20]:
idf = np.array(idf)
idf[0:10]

array([0.08074411, 0.34826811, 0.35910049, 3.53217642, 0.38683437,
       3.4300622 , 2.9636526 , 3.60088917, 2.27086361, 0.89069657])

In [21]:
len(idf)

21682

In [22]:
term_doc.getcol(0)[win_array] * idf[0]

<4398x1 sparse array of type '<class 'numpy.float64'>'
	with 4202 stored elements in Compressed Sparse Column format>

In [23]:
term_doc.data

array([12, 15, 11, ...,  1,  1,  1])

In [24]:
idf

array([ 0.08074411,  0.34826811,  0.35910049, ..., 12.6588809 ,
       12.6588809 , 12.6588809 ])

In [25]:
term_doc.shape

(6467, 21682)

In [26]:
def get_log_prob(class_mask, idf=None):
    term_count = []
    total = 0
    if idf is None: 
        idf = np.ones(NUM_TERMS)
    for t_idx in range(NUM_TERMS):
        count = 1 + term_doc.getcol(t_idx)[class_mask].sum()
        count *= idf[t_idx]
        term_count.append(count)
        total += count
    return np.array([math.log2(tc / total) for tc in term_count])

def binary_dirichlet_prior(term_doc, mask):
    win_counts = term_doc[mask].sum(axis=0)
    loss_counts = term_doc[~mask].sum(axis=0)
    background = win_counts + loss_counts
    win_corpus = win_counts.sum()
    loss_corpus = loss_counts.sum()
    background_corpus = win_corpus + loss_corpus
    log_odds_ratio1 = np.log((win_counts + background) / (win_corpus + background_corpus - win_counts - background))
    log_odds_ratio2 = np.log((loss_counts + background) / (loss_corpus + background_corpus - loss_counts - background))
    variance = 1.0 / (win_counts + background) + 1.0 / (loss_counts + background)
    z_score = (log_odds_ratio1 - log_odds_ratio2) / np.sqrt(variance)
    return np.argsort(z_score)

def print_win_loss(sort_indices):
    loss_words = [indexer.to_token(x) for x in sort_indices[0:10]]
    print("Loss words:")
    print(loss_words)
    
    win_words = [indexer.to_token(x) for x in sort_indices[-10:]]
    print("Win words:")
    print(win_words)


In [27]:
# Raw counts
win_prob = get_log_prob(win_array, idf=None)
loss_prob = get_log_prob(~win_array, idf=None)
sort_indices = (win_prob - loss_prob).argsort()
print("Count-based Naive Bayes")
print_win_loss(sort_indices)

Count-based Naive Bayes
Loss words:
['qubec', 'latvian', 'weil', 'anesthesia', 'provincial', 'umpiring', 'plateau', 'lineman', 'risen', 'ghangzou']
Win words:
['overrate', 'suarez', 'lpez', 'nasty', 'gil', 'van', 'playstation', 'antidoping', 'karaoke', 'no.1s']


In [28]:
# Tf*idf counts
win_prob = get_log_prob(win_array, idf=idf)
loss_prob = get_log_prob(~win_array, idf=idf)
sort_indices = (win_prob - loss_prob).argsort()
print("td-idf Naive Bayes")
print_win_loss(sort_indices)

td-idf Naive Bayes
Loss words:
['qubec', 'anesthesia', 'weil', 'provincial', 'latvian', 'umpiring', 'plateau', 'barty', 'footfault', 'zhe']
Win words:
['overrate', 'suarez', 'lpez', 'nasty', 'gil', 'playstation', 'van', 'antidoping', 'karaoke', 'no.1s']


In [29]:
# Log Odds Ratio Informative Dirichlet Prior
print("Words by Log Odds Ratio Informative Dirichlet Prior")
z_score_indices = binary_dirichlet_prior(term_doc, win_array)
print_win_loss(z_score_indices)

Words by Log Odds Ratio Informative Dirichlet Prior
Loss words:
['disappointed', 'disappointing', 'today', 'week', 'chance', "n't", 'could', 'set', 'disappoint', 'loss']
Win words:
['help', 'we', 'victory', 'focus', 'opponent', 'win', 'always', 'happy', 'important', 'tomorrow']


In [30]:
from gensim.corpora import Dictionary

In [31]:
assert(term_doc.shape[0] == NUM_DOCS)
assert(term_doc.shape[1] == NUM_TERMS)
gensim_corpus = Sparse2Corpus(term_doc_idf, documents_columns=False)
dictionary = Dictionary.from_corpus(gensim_corpus, id2word=indexer.idx_to_token)

In [42]:
lda = LdaModel(gensim_corpus, num_topics=4, id2word=indexer.idx_to_token, passes=20)

In [45]:
lda.save("lda")

In [101]:
lda.show_topics()

[(0,
  '0.007*"she" + 0.004*"serena" + 0.003*"definitely" + 0.003*"girl" + 0.003*"venus" + 0.003*"coach" + 0.002*"li" + 0.002*"woman" + 0.002*"uhm" + 0.002*"college"'),
 (1,
  '0.002*"definitely" + 0.002*"people" + 0.002*"sport" + 0.002*"\'ve" + 0.002*"wimbledon" + 0.002*"she" + 0.002*"love" + 0.002*"learn" + 0.002*"grass" + 0.002*"coach"'),
 (2,
  '0.003*"clay" + 0.003*"course" + 0.003*"difficult" + 0.003*"final" + 0.003*"grass" + 0.003*"he" + 0.003*"season" + 0.003*"important" + 0.003*"roland" + 0.002*"surface"'),
 (3,
  '0.005*"guy" + 0.004*"sort" + 0.003*"obviously" + 0.003*"he" + 0.003*"\'ve" + 0.002*"pretty" + 0.002*"get" + 0.002*"hit" + 0.002*"probably" + 0.002*"mean"')]

In [43]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

vis = pyLDAvis.gensim_models.prepare(lda, gensim_corpus, dictionary)
pyLDAvis.save_html(vis, 'lda.html')

  default_term_info = default_term_info.sort_values(


In [44]:
vis

In [66]:
win_corpus = Sparse2Corpus(term_doc_idf[win_array], documents_columns=False)
loss_corpus = Sparse2Corpus(term_doc_idf[~win_array], documents_columns=False)

In [67]:
win_topics = lda.inference(win_corpus)[0]
loss_topics = lda.inference(loss_corpus)[0]
win_topics.mean(axis=0)

array([237.09422, 175.35843, 267.50708, 289.1043 ], dtype=float32)

In [68]:
loss_topics.mean(axis=0)

array([150.12149 ,  87.438965, 251.24158 , 257.19962 ], dtype=float32)

In [99]:
male_corpus = Sparse2Corpus(term_doc_idf[male_array], documents_columns=False)
female_corpus = Sparse2Corpus(term_doc_idf[~male_array], documents_columns=False)
male_topics = lda.inference(male_corpus)[0]
female_topics = lda.inference(female_corpus)[0]
male_topics.mean(axis=0)

array([ 80.918236, 135.12613 , 339.98703 , 403.82675 ], dtype=float32)

In [100]:
female_topics.mean(axis=0)

array([373.80157, 162.75461, 162.71953, 118.74742], dtype=float32)

In [106]:
'0.005*"guy" + 0.004*"sort" + 0.003*"obviously" + 0.003*"he" + 0.003*"\'ve" + 0.002*"pretty" + 0.002*"get" + 0.002*"hit" + 0.002*"probably" + 0.002*"mean"'

'0.005*"guy" + 0.004*"sort" + 0.003*"obviously" + 0.003*"he" + 0.003*"\'ve" + 0.002*"pretty" + 0.002*"get" + 0.002*"hit" + 0.002*"probably" + 0.002*"mean"'

In [111]:
topic1 = "she(0.007), serena(0.004), girl(0.003), venus(0.003), coach(0.003), li(0.002), woman(0.002), uhm(0.002), college(0.002)" # Female
topic2 = "definitely(0.002), people(0.002), sport(0.002), 've(0.002), wimbledon(0.002), she(0.002), love(0.002), learn(0.002), grass(0.002), coach(0.002)"
topic3 = "clay(0.003), course(0.003), difficult(0.003), final(0.003), grass(0.003), he(0.003), season(0.003), important(0.003), roland(0.003), surface(0.002)"
topic4 = "guy(0.005), sort(0.004), obviously(0.003), he(0.003), 've(0.003), pretty(0.002), get(0.002), hit(0.002), probably(0.002), mean(0.002)"

records = []
for name, words in zip(["Women's Tennis", "Personal Background", "Tournament / Surface", "Men's Tennis"], [topic1, topic2, topic3, topic4]):
    records.append({"Topic": name, "Word List": words})

topics = pd.DataFrame.from_records(records)
if EXPORT_TABLES:
    with pd.option_context("max_colwidth", 1000):
        topics.to_latex("out2.tex", index=False)
topics

  topics.to_latex("out2.tex", index=False)


Unnamed: 0,Topic,Word List
0,Women's Tennis,"she(0.007), serena(0.004), girl(0.003), venus(..."
1,Personal Background,"definitely(0.002), people(0.002), sport(0.002)..."
2,Tournament / Surface,"clay(0.003), course(0.003), difficult(0.003), ..."
3,Men's Tennis,"guy(0.005), sort(0.004), obviously(0.003), he(..."
