In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import NMF
import nltk
nltk.download('stopwords') 
nltk.download('wordnet')
import numpy as np
import glob
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jillian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jillian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df_text=pd.read_pickle('primary_df.pkl')
df_text

Unnamed: 0_level_0,raw_text,sect
source,Unnamed: 1_level_1,Unnamed: 2_level_1
Dead_Suns_1,SUNRISE MAIDEN\n\n\nTIER 3\n\nSanjaval Vagabon...,SUNRISE MAIDEN
Dead_Suns_1,SUNRISE MAIDEN\n\n\nTIER 3\n\nSanjaval Vagabon...,TIER 3\n\nSanjaval Vagabond\nMedium explorer\n...
Dead_Suns_1,SUNRISE MAIDEN\n\n\nTIER 3\n\nSanjaval Vagabon...,SUNRISE MAIDEN MAP KEY\n1. Bridge\na. Pilot’s ...
Dead_Suns_1,SUNRISE MAIDEN\n\n\nTIER 3\n\nSanjaval Vagabon...,"\nFORTUNE IN THE STARS\n\nTake your station, c..."
Dead_Suns_1,SUNRISE MAIDEN\n\n\nTIER 3\n\nSanjaval Vagabon...,The Starfinder Adventure Path is designed to p...
...,...,...
Dead_Suns_2,THAUMTECH\nCAIRNCARVER\n\nTIER 2\n\nSmall ligh...,8
Dead_Suns_2,THAUMTECH\nCAIRNCARVER\n\nTIER 2\n\nSmall ligh...,4800
Dead_Suns_2,THAUMTECH\nCAIRNCARVER\n\nTIER 2\n\nSmall ligh...,\nBulldoze (Ex) When a yaruk uses its trample ...
Dead_Suns_2,THAUMTECH\nCAIRNCARVER\n\nTIER 2\n\nSmall ligh...,Yaruks are immense herbivores that can readily...


In [3]:
# Apply a first round of text cleaning techniques
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, 
    remove punctuation and remove words containing numbers, remove line breaks.'''
    text = text.lower()
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\w*\d\w*', ' ', text)
    text = re.sub('[‘’“”…]', ' ', text)
    text = re.sub('\n', ' ', text)
    return text

round1 = lambda x: clean_text(x)

In [4]:
data_clean=pd.DataFrame(df_text.sect.apply(round1))
data_clean.to_pickle('cleaned_sections.pkl') #saving cleaned version of sections
data_clean

Unnamed: 0_level_0,sect
source,Unnamed: 1_level_1
Dead_Suns_1,sunrise maiden
Dead_Suns_1,tier sanjaval vagabond medium explorer spee...
Dead_Suns_1,sunrise maiden map key bridge a pilot s st...
Dead_Suns_1,fortune in the stars take your station chec...
Dead_Suns_1,the starfinder adventure path is designed to p...
...,...
Dead_Suns_2,
Dead_Suns_2,
Dead_Suns_2,bulldoze ex when a yaruk uses its trample a...
Dead_Suns_2,yaruks are immense herbivores that can readily...


In [5]:
#Tried Lemmatizing and Stemming, Lemmatizing seemed to give generally more understandable tokens

def preprocess_text(text):
    """preprocessing text, tokenizing words that are strings longer than 2 letters
    lemmatising tokens, adding new stopwords to stopwords list and removing all stopwords"""
    from nltk.corpus import stopwords
    #Tokenize words while ignoring punctuation
    #letters only, strings longer than 2 letters
    tokeniser = RegexpTokenizer(r'[A-Za-z]{3,}')
    tokens = tokeniser.tokenize(text)
     # lemmatise 
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token,pos='v') for token in tokens]
    #ps=PorterStemmer()
    #ports=[ps.stem(token.lower())for token in tokens]
    
    # Remove creating and removing stopwords
    stopwords= stopwords.words('english')
    new_stopwords=['statistics','tactics','ecology','bulk','defense','offense','treasure',
                  'mission','pcs','starfinder',] 
    stopwords.extend(new_stopwords)
    keywords= [lemma for lemma in lemmas if lemma not in stopwords]
    #keywords= [port for port in ports if port not in stopwords]
    return keywords

In [6]:
#Vectorizing
vectorizer = CountVectorizer(analyzer=preprocess_text)
doc_word = vectorizer.fit_transform(data_clean.sect)
doc_word.shape

(6558, 22160)

In [7]:
#Creating document term matrix, using book source instead of section as index as 
#more concerned with where the section of text originated than the section iteself
dtm=pd.DataFrame(doc_word.toarray(),columns=vectorizer.get_feature_names())
dtm.index=data_clean.index
dtm

Unnamed: 0_level_0,aanung,abadar,abadaran,abadarans,abadarcorp,abaddon,aballon,aballonian,abandon,abandonment,...,zoologists,zoom,zoos,zunar,zurnala,zusleggim,zyakama,zybollo,zygotes,zysyk
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dead_Suns_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Dead_Suns_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Dead_Suns_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Dead_Suns_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Dead_Suns_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Dead_Suns_2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Dead_Suns_2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Dead_Suns_2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Dead_Suns_2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# NMF Topic Modeling with Countvectorizer

In [8]:
nmf_model = NMF(n_components=20,random_state=1)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape



(6558, 20)

In [9]:
topic_word = nmf_model.components_
topic_word.shape

(20, 22160)

In [10]:
#Looking at the words that make up each topic
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-20:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['area',
  'room',
  'door',
  'wall',
  'open',
  'contain',
  'two',
  'doors',
  'light',
  'engineer',
  'floor',
  'one',
  'lock',
  'chamber',
  'check',
  'use',
  'computer',
  'lead',
  'succeed'],
 ['worlds',
  'pact',
  'planet',
  'world',
  'within',
  'species',
  'live',
  'many',
  'one',
  'even',
  'system',
  'new',
  'find',
  'make',
  'star',
  'drift',
  'though',
  'space',
  'first'],
 ['creature',
  'save',
  'effect',
  'level',
  'spell',
  'target',
  'use',
  'throw',
  'action',
  'round',
  'creatures',
  'feet',
  'time',
  'ability',
  'affect',
  'within',
  'one',
  'take',
  'mind'],
 ['ship',
  'starship',
  'crew',
  'pilot',
  'drift',
  'vessel',
  'rank',
  'captain',
  'engineer',
  'officer',
  'combat',
  'space',
  'computers',
  'bay',
  'sensors',
  'systems',
  'cargo',
  'science',
  'gunnery'],
 ['check',
  'succeed',
  'successful',
  'attempt',
  'one',
  'take',
  'perception',
  'engineer',
  'bonus',
  'computers',
  'require',


In [11]:
data_clean1=data_clean.copy()
data_clean1['topic']=doc_topic.argmax(axis=1)

In [12]:
#Adding topic to dataframe
data_clean1.to_pickle('topics.pkl')
data_clean1.to_csv('topics.csv')
data_clean1

Unnamed: 0_level_0,sect,topic
source,Unnamed: 1_level_1,Unnamed: 2_level_1
Dead_Suns_1,sunrise maiden,3
Dead_Suns_1,tier sanjaval vagabond medium explorer spee...,3
Dead_Suns_1,sunrise maiden map key bridge a pilot s st...,13
Dead_Suns_1,fortune in the stars take your station chec...,1
Dead_Suns_1,the starfinder adventure path is designed to p...,1
...,...,...
Dead_Suns_2,,0
Dead_Suns_2,,0
Dead_Suns_2,bulldoze ex when a yaruk uses its trample a...,17
Dead_Suns_2,yaruks are immense herbivores that can readily...,1


In [13]:
#Examining at how often topics occur, counts are number of sections
doc_cluster=doc_topic.argmax(axis=1)
pd.Series(doc_cluster).value_counts()

17    1995
2      672
0      524
1      486
19     353
4      322
9      320
3      279
5      256
11     245
18     209
7      192
8      156
16     154
10      92
12      70
6       64
13      63
14      58
15      48
dtype: int64

In [14]:
#saving topic keywords for future reference
topic_keys=pd.Series(topic_words)
topic_keys=topic_keys.to_frame()
topic_keys.rename(columns={0:'keywords'},inplace=True)
topic_keys.to_csv('topic_keywords.csv')
topic_keys

Unnamed: 0,keywords
0,"[area, room, door, wall, open, contain, two, d..."
1,"[worlds, pact, planet, world, within, species,..."
2,"[creature, save, effect, level, spell, target,..."
3,"[ship, starship, crew, pilot, drift, vessel, r..."
4,"[check, succeed, successful, attempt, one, tak..."
5,"[swarm, hylax, components, suskillon, mind, sh..."
6,"[fleet, corpse, eox, waneda, report, ministry,..."
7,"[azlanti, empire, star, aeon, guard, drive, ru..."
8,"[sun, diver, burn, archipelago, take, fire, kh..."
9,"[know, see, get, one, ask, might, like, take, ..."


# Trying LDA

In [15]:
from sklearn.decomposition import LatentDirichletAllocation

In [16]:
lda_model = LatentDirichletAllocation(n_components=20)
lda_doc_topic = lda_model.fit_transform(doc_word)
lda_doc_topic.shape

(6558, 20)

In [17]:
words = vectorizer.get_feature_names()
ldat = lda_model.components_.argsort(axis=1)[:,-1:-7:-1]
lda_topic_words = [[words[e] for e in l] for l in ldat]
lda_topic_words

[['plane', 'shadow', 'mindscape', 'take', 'one', 'velstracs'],
 ['damage', 'attack', 'creature', 'feet', 'check', 'take'],
 ['sivv', 'vheiransch', 'simulation', 'sivvs', 'quantum', 'bantrids'],
 ['alien', 'archive', 'barathus', 'humanoid', 'barathu', 'male'],
 ['combat', 'fight', 'morale', 'use', 'fire', 'attack'],
 ['check', 'succeed', 'one', 'guard', 'take', 'attempt'],
 ['ship', 'starship', 'attack', 'crew', 'check', 'pilot'],
 ['speed', 'melee', 'abilities', 'critical', 'range', 'offensive'],
 ['effect', 'creature', 'level', 'save', 'use', 'spell'],
 ['stewards', 'azlanti', 'starship', 'star', 'ship', 'rune'],
 ['station', 'absalom', 'pact', 'worlds', 'fleet', 'adventure'],
 ['planet', 'species', 'worlds', 'many', 'world', 'live'],
 ['gray', 'dycepskian', 'reptoid', 'host', 'one', 'might'],
 ['swarm', 'hylax', 'corpse', 'fleet', 'suskillon', 'temple'],
 ['city', 'kish', 'new', 'resort', 'check', 'see'],
 ['abilities', 'sense', 'eac', 'kac', 'skills', 'fort'],
 ['rank', 'pilot', 'of

# NMF with TFID

In [18]:
vect= TfidfVectorizer(analyzer=preprocess_text)
tf_doc_word = vect.fit_transform(data_clean.sect)
tf_doc_word.shape

(6558, 22160)

In [19]:
tf_dtm=pd.DataFrame(tf_doc_word.toarray(),columns=vect.get_feature_names())
tf_dtm.index=df_text.index
tf_dtm

Unnamed: 0_level_0,aanung,abadar,abadaran,abadarans,abadarcorp,abaddon,aballon,aballonian,abandon,abandonment,...,zoologists,zoom,zoos,zunar,zurnala,zusleggim,zyakama,zybollo,zygotes,zysyk
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dead_Suns_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dead_Suns_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dead_Suns_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dead_Suns_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dead_Suns_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Dead_Suns_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dead_Suns_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dead_Suns_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dead_Suns_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
tf_nmf_model = NMF(n_components=20,random_state=2)
tf_doc_topic = tf_nmf_model.fit_transform(tf_doc_word)
tf_doc_topic.shape



(6558, 20)

In [21]:
tf_topic_word = tf_nmf_model.components_
tf_topic_word.shape

(20, 22160)

In [22]:
tf_words = vect.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-10:-1]
tf_topic_words = [[words[e] for e in l] for l in t]
tf_topic_words

[['area', 'room', 'door', 'wall', 'open', 'contain', 'two', 'doors', 'light'],
 ['worlds',
  'pact',
  'planet',
  'world',
  'within',
  'species',
  'live',
  'many',
  'one'],
 ['creature',
  'save',
  'effect',
  'level',
  'spell',
  'target',
  'use',
  'throw',
  'action'],
 ['ship',
  'starship',
  'crew',
  'pilot',
  'drift',
  'vessel',
  'rank',
  'captain',
  'engineer'],
 ['check',
  'succeed',
  'successful',
  'attempt',
  'one',
  'take',
  'perception',
  'engineer',
  'bonus'],
 ['swarm',
  'hylax',
  'components',
  'suskillon',
  'mind',
  'shirrens',
  'sdf',
  'creatures',
  'take'],
 ['fleet',
  'corpse',
  'eox',
  'waneda',
  'report',
  'ministry',
  'bone',
  'eoxian',
  'undead'],
 ['azlanti',
  'empire',
  'star',
  'aeon',
  'guard',
  'drive',
  'rune',
  'madelon',
  'zolan'],
 ['sun',
  'diver',
  'burn',
  'archipelago',
  'take',
  'fire',
  'khaim',
  'force',
  'bubble'],
 ['know', 'see', 'get', 'one', 'ask', 'might', 'like', 'take', 'say'],
 ['sec

In [23]:
#Looking at how often a topic appears
tf_doc_cluster=tf_doc_topic.argmax(axis=1)
pd.Series(tf_doc_cluster).value_counts()

2     1681
13     801
0      489
16     465
1      437
19     311
12     292
15     269
5      264
3      232
17     186
4      175
6      159
7      157
11     130
18     121
10     107
8      105
14      94
9       83
dtype: int64

In [24]:
data_clean2=data_clean.copy()
data_clean2['topic']=tf_doc_topic.argmax(axis=1)

In [25]:
#Topics don't seem to line up as well as they do with countvectorizer nmf
data_clean2

Unnamed: 0_level_0,sect,topic
source,Unnamed: 1_level_1,Unnamed: 2_level_1
Dead_Suns_1,sunrise maiden,2
Dead_Suns_1,tier sanjaval vagabond medium explorer spee...,11
Dead_Suns_1,sunrise maiden map key bridge a pilot s st...,9
Dead_Suns_1,fortune in the stars take your station chec...,2
Dead_Suns_1,the starfinder adventure path is designed to p...,8
...,...,...
Dead_Suns_2,,0
Dead_Suns_2,,0
Dead_Suns_2,bulldoze ex when a yaruk uses its trample a...,13
Dead_Suns_2,yaruks are immense herbivores that can readily...,2
