In [None]:
import pandas as pd

In [None]:
import sys
from datawand.parametrization import ParamHelper
# Parameter helper for this notebook: every path used below is looked up through ph.get(...).
# NOTE(review): "../../" looks like the pipeline root and "TrendApproximation" the pipeline name;
# sys.argv is forwarded as well — confirm the argument meanings against datawand's ParamHelper.
ph = ParamHelper("../../", "TrendApproximation", sys.argv)

In [None]:
# Resolve all configured file/folder locations in one go; the parameter keys are
# listed in the same order as the variables they are unpacked into.
(
    schedule_path,
    player_names_file_path,
    word_file_path,
    tweet_file_path,
    w2v_model_dir,
) = map(
    ph.get,
    (
        "schedule_file_path",
        "player_names_file_path",
        "word_corpus",
        "stemmed_tweet_file_path",
        "w2v_root_folder",
    ),
)

# 1. Load schedule information

In [None]:
# Read the pipe-delimited schedule file into a DataFrame and preview its first three rows.
schedule_df = pd.read_csv(schedule_path, sep="|")
schedule_df.head(3)

## Collect name parts

In [None]:
# Every distinct player appearing on either side of a scheduled match, lower-cased.
active_players = set(schedule_df["playerName active"])
opponent_players = set(schedule_df["playerName opponent"])
names = {player.lower() for player in active_players | opponent_players}

In [None]:
# For each player name collect (a) the name squashed into one token and
# (b) each of its space-separated parts; hyphens are treated as spaces first.
names_parts = []
for player_name in names:
    spaced_name = player_name.replace("-", " ")
    names_parts.append(spaced_name.replace(" ", ""))
    names_parts.extend(spaced_name.split(" "))
len(names_parts)

In [None]:
names_parts

# Export the name parts, one per line, to the configured player-names file.
with open(player_names_file_path, "w") as names_file:
    names_file.writelines("%s\n" % part for part in names_parts)

## Select relevant words

In [None]:
# Vocabulary of relevant words: every line of the word corpus file except
# @-mentions and anything that is a player-name part.
# names_parts is a list, so it is copied into a set once: each membership test
# below is then a hash lookup instead of a scan of the whole list per word.
excluded_name_parts = set(names_parts)
with open(word_file_path) as wf:
    WORDS = {
        w
        for w in (line.rstrip() for line in wf)
        if "@" not in w and w not in excluded_name_parts
    }

In [None]:
# Number of distinct corpus words kept after dropping @-mentions and player-name parts.
len(WORDS)

# 2. Load tweets with stemmed text

In [None]:
# Read the pipe-delimited tweet file; its "text" column is cleaned and tokenised in the cells below.
tweets_df = pd.read_csv(tweet_file_path, sep="|")

In [None]:
# Preview the first few rows of the loaded tweets.
tweets_df.head()

In [None]:
import re
# Compiled once instead of on every call. Raw strings avoid the invalid
# "\w" / "\@" / "\," / "\d" escapes that plain string literals warn about.
_KEEP_RUN_RE = re.compile(r"[\w,@]+")        # runs of word characters, commas and "@"
_NO_COMMA_DIGIT_RE = re.compile(r"[^,\d]+")  # runs free of commas and digits


def clean_text(t):
    """Return a lower-cased copy of tweet text reduced to letters, "_" and "@".

    Two passes are applied:
      1. keep only runs of word characters, commas and "@", joined by a space;
      2. cut those runs at every comma or digit and join the pieces by a space.

    The result may contain repeated spaces; callers are expected to tokenise
    it with a whitespace split.

    Parameters
    ----------
    t : str
        Raw tweet text. Any non-string value (e.g. the NaN pandas uses for a
        missing cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned, lower-cased text ("" for empty or non-string input).
    """
    if not isinstance(t, str):
        # Missing cells arrive as float NaN; re.findall would raise TypeError on them.
        return ""
    clean_1 = ' '.join(_KEEP_RUN_RE.findall(t))
    clean_2 = ' '.join(_NO_COMMA_DIGIT_RE.findall(clean_1))
    return clean_2.lower()

In [None]:
# Normalise every raw tweet with clean_text and keep the result in a new "text_clean" column.
tweets_df["text_clean"] = tweets_df["text"].apply(clean_text)

In [None]:
# Whitespace-tokenise the cleaned text: each row becomes a list of tokens.
tweets_df["text_clean_splitted"] = tweets_df["text_clean"].apply(str.split)

In [None]:
def _corpus_words_in_order(tokens):
    """Return the distinct tokens that are in WORDS, in order of first appearance.

    This keeps exactly the same words as ``WORDS.intersection(set(tokens))``,
    but in the tweet's own order. Set iteration order for strings is arbitrary
    and changes between interpreter runs, which would make the sentences fed
    to Word2Vec (and its context windows) non-reproducible.
    """
    seen = set()
    kept = []
    for tok in tokens:
        if tok in WORDS and tok not in seen:
            seen.add(tok)
            kept.append(tok)
    return kept


tweets_df["filtered_words"] = tweets_df["text_clean_splitted"].apply(_corpus_words_in_order)

In [None]:
# Keep only tweets that still contain more than one corpus word.
has_multiple_words = tweets_df["filtered_words"].apply(lambda words: len(words) > 1)
tweets_df = tweets_df[has_multiple_words]

In [None]:
# Number of tweets left that contain more than one corpus word.
len(tweets_df)

In [None]:
# Plain Python list of per-tweet word lists (the training sentences); peek at the first two.
filtered_tweet_texts = tweets_df["filtered_words"].tolist()
filtered_tweet_texts[:2]

# Experimenting with W2V

In [None]:
import gensim, logging

In [None]:
# Show INFO-level log records (timestamp : level : message) in the notebook output.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [None]:
# Train Word2Vec on the per-tweet word lists; min_count=5 drops words seen fewer than five times.
model = gensim.models.Word2Vec(filtered_tweet_texts, min_count=5, batch_words=100, workers=5)

# Similarity queries go through the word vectors (model.wv), consistent with the
# model.wv.vocab access further down; the model-level most_similar alias is
# deprecated and absent from newer gensim releases.
model.wv.most_similar(positive=["shock"],topn=20)

In [None]:
# Nearest neighbours of a few probe words, queried through model.wv rather than
# the deprecated model-level most_similar alias.
model.wv.most_similar(positive=["winner"],topn=20)

model.wv.most_similar(positive=["djoko"],topn=20)

# Vocabulary that survived the min_count threshold.
model.wv.vocab

# Export model

In [None]:
# Persist the trained model as "global.w2v" inside the configured W2V root folder.
model_output_path = "%s/global.w2v" % w2v_model_dir
model.save(model_output_path)