In [163]:
import math 
from convokit import Corpus, download
import polars as pl
import numpy as np
import nltk
from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.sparse import csc_array
from gensim.models.ldamodel import LdaModel
from gensim.matutils import Sparse2Corpus

In [56]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/justin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/justin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/justin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/justin/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
corpus = Corpus(filename=download("tennis-corpus"))

Dataset already exists at /home/justin/.convokit/downloads/tennis-corpus


In [20]:
speakers = pl.from_pandas(corpus.get_speakers_dataframe(), include_index=True)
speakers.head()

id,vectors,meta.gender
str,list[null],str
"""REPORTER""",[],
"""Kei Nishikori""",[],"""M"""
"""Sergiy Stakhov…",[],"""M"""
"""Jelena Jankovi…",[],"""F"""
"""Fernando Verda…",[],"""M"""


In [75]:
df = pl.from_pandas(corpus.get_utterances_dataframe())
df = df.with_columns(pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%d"))
df = df.drop("vectors")
df.head()

timestamp,text,speaker,reply_to,conversation_id,meta.is_answer,meta.is_question,meta.pair_idx
datetime[μs],str,str,str,str,bool,bool,str
2008-08-28 00:00:00,"""I think this i…","""REPORTER""",,"""1681_0.q""",False,True,"""1681_0"""
2008-08-28 00:00:00,"""Yeah.""","""Kei Nishikori""","""1681_0.q""","""1681_0.q""",True,False,"""1681_0"""
2008-08-28 00:00:00,"""How would you …","""REPORTER""",,"""1681_1.q""",False,True,"""1681_1"""
2008-08-28 00:00:00,"""Yeah, I'm pret…","""Kei Nishikori""","""1681_1.q""","""1681_1.q""",True,False,"""1681_1"""
2008-08-28 00:00:00,"""Do you know wh…","""REPORTER""",,"""1681_2.q""",False,True,"""1681_2"""


In [76]:
df = df.with_columns(pl.col("meta.pair_idx").str.split("_").list.first().str.parse_int(radix=10).alias("conversation"))
NUM_DOCS = 1 + df.get_column("conversation").max()
NUM_DOCS

In [9]:
match = pl.from_pandas(corpus.get_conversations_dataframe())
match = match.drop("vectors")
match.head()

meta.match_id,meta.opponent,meta.result,meta.stage,meta.tournament,meta.tournament_type,meta.player_ranking
i64,str,i64,str,str,str,i64
1681,,1,"""2nd Round""","""U.S. OPEN""","""Grand Slam""",126
1681,,1,"""2nd Round""","""U.S. OPEN""","""Grand Slam""",126
1681,,1,"""2nd Round""","""U.S. OPEN""","""Grand Slam""",126
1681,,1,"""2nd Round""","""U.S. OPEN""","""Grand Slam""",126
1681,,1,"""2nd Round""","""U.S. OPEN""","""Grand Slam""",126


In [14]:
match = match.unique("meta.match_id")

In [18]:
match.get_column("meta.opponent").value_counts(sort=True).head(9)

meta.opponent,counts
str,u32
,940
"""Novak Djokovic…",178
"""Roger Federer""",162
"""Rafael Nadal""",157
"""Andy Murray""",153
"""Serena William…",124
"""Caroline Wozni…",95
"""Agnieszka Radw…",87
"""Tomas Berdych""",86


In [106]:
match

meta.match_id,meta.opponent,meta.result,meta.stage,meta.tournament,meta.tournament_type,meta.player_ranking
i64,str,i64,str,str,str,i64
336,"""Dominic Thiem""",1,"""Quarterfinals""","""MIAMI OPEN PRE…","""Masters 1000""",4
608,,1,"""2nd Round""","""WIMBLEDON""","""Grand Slam""",4
5936,,1,"""1st Round""","""DUBAI DUTY FRE…","""Premier""",7
4848,"""Rafael Nadal""",0,"""Round Robin""","""BARCLAYS ATP W…","""Masters Cup""",6
848,"""Elena Dementie…",1,"""Semifinals""","""PILOT PEN TENN…","""2""",4
1728,,1,"""1st Round""","""WESTERN & SOUT…","""Masters""",11
4560,"""Serena William…",0,"""Semifinals""","""WIMBLEDON""","""Grand Slam""",2
2064,"""Richard Gasque…",1,"""Semifinals""","""MEDIBANK INTER…","""ATP250""",11
5392,"""Richard Gasque…",1,"""3rd Round""","""WIMBLEDON""","""Grand Slam""",59
4320,"""Andy Murray""",1,"""3rd Round""","""ROGERS CUP MEN…","""Masters""",24


In [128]:
win_array = np.zeros(NUM_DOCS, dtype=bool)
for row in match.select(["meta.match_id", "meta.result"]).rows():
    match_id = row[0]
    result = row[1]
    win_array[match_id] = result
win_array

array([ True, False,  True, ...,  True,  True,  True])

In [112]:
NUM_DOCS

6467

In [74]:
class Indexer:
    def __init__(self):
        self.token_to_idx = {}
        self.idx_to_token = {}

    def to_token(self, x):
        return self.idx_to_token[x]
    
    def to_idx(self, token):
        if token in self.token_to_idx:
            return self.token_to_idx[token]
        else:
            length = len(self.token_to_idx)
            self.idx_to_token[length] = token
            self.token_to_idx[token] = length
            return length
        
    def token_count(self):
        return len(self.token_to_idx)

In [103]:
stops = set(stopwords.words('english'))
wnl = WordNetLemmatizer()
stops.add(",")
stops.add(".")
indexer = Indexer()
row_idx = [] # Document
col_idx = [] # Term
data = [] # Count
# print(len(stops))
for group in df.select(["text", "conversation"]).group_by(by="conversation"):
    conv_id = group[0]
    text = group[1].get_column("text").str.concat("\n")
    tokens = [token.lower() for token in word_tokenize(text[0]) if token not in stops]

    pos = nltk.pos_tag(tokens)
    pos = ['v' if x[1][0] == "V" else 'n' for x in pos]

    tokens = [wnl.lemmatize(token.lower(), pos=p) for token, p in zip(tokens, pos)]
    idxs = [indexer.to_idx(token) for token in tokens]
    counter = Counter(idxs)
    for k, v in counter.items():
        col_idx.append(k)
        data.append(v)
        row_idx.append(conv_id)

term_doc = csc_array((data, (row_idx, col_idx)), shape=(NUM_DOCS, indexer.token_count()))

In [132]:
NUM_TERMS = indexer.token_count()

In [133]:
idx_offset = math.log2(NUM_DOCS)
idf = [idx_offset - math.log2(term_doc.getcol(idx).count_nonzero()) for idx in range(NUM_TERMS)]
idf[0:10]

[0.26307902365813085,
 0.38683437065794024,
 2.315695179599997,
 0.8608141746848155,
 0.18898474637695983,
 0.034542349734891786,
 4.6588808950478775,
 1.2510820448259032,
 0.4805272715349851,
 0.49586025546708434]

In [175]:
idf.shape

(21689,)

In [177]:
data_idf = [x * idf[f] for x, f in zip(data, col_idx)]
term_doc_idf = csc_array((data_idf, (row_idx, col_idx)), shape=(NUM_DOCS, indexer.token_count()))

In [149]:
idf = np.array(idf)
idf[0:10]

array([0.26307902, 0.38683437, 2.31569518, 0.86081417, 0.18898475,
       0.03454235, 4.6588809 , 1.25108204, 0.48052727, 0.49586026])

In [151]:
len(idf)

21689

In [173]:
term_doc.getcol(0)[win_array] * idf[0]

<4398x1 sparse array of type '<class 'numpy.float64'>'
	with 3746 stored elements in Compressed Sparse Column format>

In [174]:
term_doc.data

array([9, 7, 7, ..., 1, 1, 1])

In [172]:
idf

array([ 0.26307902,  0.38683437,  2.31569518, ..., 12.6588809 ,
       12.6588809 , 12.6588809 ])

In [155]:
def get_log_prob(class_mask, idf=None):
    term_count = []
    total = 0
    if idf is None: 
        idf = np.ones(NUM_TERMS)
    for t_idx in range(NUM_TERMS):
        count = 1 + term_doc.getcol(t_idx)[class_mask].sum()
        count *= idf[t_idx]
        term_count.append(count)
        total += count
    return np.array([math.log2(tc / total) for tc in term_count])
win_prob = get_log_prob(win_array, idf=idf)
loss_prob = get_log_prob(~win_array, idf=idf)

In [156]:
sort_indices = (win_prob - loss_prob).argsort()

In [160]:
loss_words = [indexer.to_token(x) for x in sort_indices[0:10]]
loss_words

['qubec',
 'anesthesia',
 'latvian',
 'provincial',
 'weil',
 'umpiring',
 'barty',
 'footfault',
 'plateau',
 'risen']

In [159]:
win_words = [indexer.to_token(x) for x in sort_indices[-10:]]
win_words

['overrate',
 'suarez',
 'lpez',
 'nasty',
 'gil',
 'playstation',
 'van',
 'antidoping',
 'karaoke',
 'no.1s']

In [178]:
assert(term_doc.shape[0] == NUM_DOCS)
assert(term_doc.shape[1] == NUM_TERMS)
gensim_corpus = Sparse2Corpus(term_doc_idf, documents_columns=False)

In [179]:
lda = LdaModel(gensim_corpus, num_topics=4, id2word=indexer.idx_to_token, passes=10)

In [180]:
# Personal Details, Court/Surface, Men's Tennis, Women's Tennis
lda.show_topics()

[(0,
  '0.002*"sport" + 0.002*"coach" + 0.002*"people" + 0.002*"money" + 0.002*"british" + 0.002*"centre" + 0.002*"\'ve" + 0.002*"definitely" + 0.002*"learn" + 0.002*"life"'),
 (1,
  '0.003*"course" + 0.003*"clay" + 0.003*"difficult" + 0.003*"final" + 0.003*"grass" + 0.003*"season" + 0.003*"he" + 0.002*"important" + 0.002*"roland" + 0.002*"garros"'),
 (2,
  '0.005*"guy" + 0.004*"sort" + 0.003*"obviously" + 0.003*"he" + 0.003*"\'ve" + 0.002*"pretty" + 0.002*"get" + 0.002*"mean" + 0.002*"serve" + 0.002*"hit"'),
 (3,
  '0.006*"she" + 0.005*"serena" + 0.003*"definitely" + 0.003*"--" + 0.003*"venus" + 0.003*"girl" + 0.002*"kind" + 0.002*"love" + 0.002*"uhm" + 0.002*"oh"')]