# Use TF-IDF to find keywords in the text

In [9]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# Load the pickled BGG frame. NOTE: unpickling can execute arbitrary code,
# so this file must come from a trusted source.
bgg_alldata_df = pd.read_pickle('datasources/BGG_FINAL.pkl')

# One list of text lines per game description (split on line breaks).
dum = [
    description.splitlines()
    for description in bgg_alldata_df['gamedscription']
]

In [11]:
# Flatten description -> line -> '.'-delimited fragment into one flat list,
# preserving the original order. Fragments are not stripped, so empty strings
# and leading spaces produced by the split are kept as-is.
dumsntncs = [
    fragment
    for description_lines in dum
    for line in description_lines
    for fragment in line.split('.')
]

In [12]:
# All sentence fragments, one per row, in a single-column frame.
bgg_sentences = pd.DataFrame(data={'bggsentences': dumsntncs})
bgg_sentences.head(n=3)

Unnamed: 0,bggsentences
0,Gloomhaven is a game of Euro-inspired tactical...
1,Players will take on the role of a wandering ...
2,Players must work together out of necessity t...


In [13]:
# Compiled once so pandas always applies them with Python's `re` engine.
# Removes @handles, any character that is not an ASCII letter/digit/space/tab,
# scheme://... URLs, a leading "rt", and "http" plus the one character after it.
_NOISE_PATTERN = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")
# Runs of digits.
_DIGITS_PATTERN = re.compile(r"\d+")


def clean_text(df: pd.DataFrame, text_field: str, new_text_field_name: str) -> pd.DataFrame:
    """Add a lower-cased, de-noised copy of a text column to ``df``.

    The text in ``df[text_field]`` is lower-cased, stripped of everything
    matched by ``_NOISE_PATTERN`` and then of all digits. Whitespace is not
    collapsed, so removed tokens can leave consecutive spaces behind.

    Missing values (``None``/``NaN``) are passed through as missing instead of
    raising ``TypeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. It is modified in place: the cleaned column is
        added (or overwritten) on this same object.
    text_field : str
        Name of the existing column containing the raw text.
    new_text_field_name : str
        Name of the column that receives the cleaned text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, for call chaining.
    """
    cleaned = df[text_field].str.lower()
    # The vectorised .str methods skip missing values, unlike re.sub in .apply.
    cleaned = cleaned.str.replace(_NOISE_PATTERN, "", regex=True)
    cleaned = cleaned.str.replace(_DIGITS_PATTERN, "", regex=True)
    df[new_text_field_name] = cleaned
    return df

In [14]:
# Adds the 'bggsentences_clean' column next to the raw sentences.
bgg_sentences = clean_text(
    df=bgg_sentences,
    text_field='bggsentences',
    new_text_field_name='bggsentences_clean',
)
bgg_sentences.head(n=3)

Unnamed: 0,bggsentences,bggsentences_clean
0,Gloomhaven is a game of Euro-inspired tactical...,gloomhaven is a game of euroinspired tactical ...
1,Players will take on the role of a wandering ...,players will take on the role of a wandering ...
2,Players must work together out of necessity t...,players must work together out of necessity t...


In [15]:
# Plain Python list of cleaned sentences: one document per entry for the vectorizer.
corpus = bgg_sentences['bggsentences_clean'].tolist()

In [71]:
# Fit TF-IDF over every cleaned sentence, dropping scikit-learn's built-in
# English stop words. X holds one row per sentence.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(raw_documents=corpus)

In [72]:
X.shape  # (number of sentences, vocabulary size) of the fitted TF-IDF matrix

(191954, 59110)

In [73]:
# Vocabulary terms indexed by column of X.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
# .tolist() keeps the result a plain list of str, as the old API returned.
if hasattr(vectorizer, 'get_feature_names_out'):
    bgg_feature_names = vectorizer.get_feature_names_out().tolist()
else:
    bgg_feature_names = vectorizer.get_feature_names()

In [74]:
# Spot-check: print each term of one sentence together with its TF-IDF weight.
docnum = 5
feature_index = X[docnum, :].nonzero()[1]  # columns with a non-zero weight in this row
tfidf_scores = [(col, X[docnum, col]) for col in feature_index]
for col, score in tfidf_scores:
    print(bgg_feature_names[col], score)

sessions 0.4680112265100513
played 0.2412360365679205
ideally 0.43566664806742306
changing 0.36503264900472887
world 0.23885982236634393
persistent 0.5235212195450356
game 0.2618929578836405


In [87]:
# Collect every (term, score) entry of X whose TF-IDF weight exceeds the
# threshold. A term appears once per sentence in which it scores that high,
# so the lists can contain duplicates.
#
# One pass over the COO view replaces a Python loop that indexed the sparse
# matrix element by element for every row. Converting CSR to COO keeps the
# stored (row-major) entry order, so the lists come out in the same order as
# the per-row loop produced.
HIGH_TFIDF_THRESHOLD = 0.99

X_coo = X.tocoo()
high_mask = X_coo.data > HIGH_TFIDF_THRESHOLD

wrdlist = [bgg_feature_names[col] for col in X_coo.col[high_mask]]
scorelist = list(X_coo.data[high_mask])
        

In [88]:
# High-scoring terms with their weights, highest score first.
wordsXtfidf = (
    pd.DataFrame({'word': wrdlist, 'tfidfscore': scorelist})
    .sort_values(by='tfidfscore', ascending=False)
)

In [94]:
# How many entries score (almost) exactly 1.0?
# NOTE(review): presumably these are sentences left with a single term after
# stop-word removal -- confirm before treating them as keywords.
near_one_mask = wordsXtfidf['tfidfscore'] > .9999
wordsXtfidf.loc[near_one_mask].shape

(5346, 2)

In [91]:
# Optional distribution plot of the retained scores; currently disabled.
#sns.distplot(wordsXtfidf['tfidfscore'])

In [95]:
# Persist the sorted high-score table (row index included) to the working directory.
wordsXtfidf.to_csv(path_or_buf='HighTFIDFwords.csv')

In [96]:
# Preview the five top-ranked rows.
wordsXtfidf.head(n=5)

Unnamed: 0,word,tfidfscore
0,john,1.0
3570,contents,1.0
3568,price,1.0
3567,play,1.0
3566,wits,1.0
