# basic feature examples

In [1]:
from ScriptFeaturizer import scripts_to_tfidf, num_lines
from FeatureUtils import load_scripts

In [2]:
# Read the scraped scripts from disk; load_scripts returns two values, bound here
# to raw_scripts (a list of script strings, per the cells below) and titles.
raw_scripts, titles = load_scripts(r'data/scraping/texts/')
# Build TF-IDF features for the whole corpus. The second return value is
# presumably the vocabulary for X's columns — confirm in ScriptFeaturizer.
X, vocab = scripts_to_tfidf(raw_scripts)
# Show the dimensions of the feature matrix.
X.shape

  'stop_words.' % sorted(inconsistent))


(1147, 3849)

In [3]:
# Check which container type load_scripts handed back.
type(raw_scripts)

list

In [4]:
# Peek at the first 1000 characters of the first script.
raw_scripts[0][:1000]

'\n\n\n\nTen Things I Hate About You - by Karen McCullah Lutz & Kirsten Smith\n\n                               TEN THINGS I HATE ABOUT YOU\n          \n                written by Karen McCullah Lutz & Kirsten Smith\n          \n              based on \'Taming of the Shrew" by William Shakespeare\n          \n          Revision November 12, 1997\n          \n          \n          PADUA HIGH SCHOOL - DAY\n          \n          Welcome to Padua High School,, your typical urban-suburban \n          high school in Portland, Oregon.  Smarties, Skids, Preppies, \n          Granolas. Loners, Lovers, the In and the Out Crowd rub sleep \n          out of their eyes and head for the main building.\n          \n          PADUA HIGH PARKING LOT - DAY\n          \n          KAT STRATFORD, eighteen, pretty -- but trying hard not to be \n          -- in a baggy granny dress and glasses, balances a cup of \n          coffee and a backpack as she climbs out of her battered, \n          baby blue \'75 D

In [5]:
# number of lines in one script (the script at index 34), computed with num_lines


num_lines(raw_scripts[34])

1101

# spacy feature utils examples

In [6]:
from ScriptFeaturizer import scripts_to_tfidf, df_to_stats
from FeatureUtils import load_scripts, make_doc_df, series_to_doc

In [7]:
# datetime is used by the timing code in the following cell.
from datetime import datetime
# NOTE(review): this is the same load_scripts call as in the first section; it
# re-reads the corpus and rebinds raw_scripts/titles — presumably kept so this
# section can be run on its own. Confirm whether the reload is still needed.
raw_scripts, titles = load_scripts(r'data/scraping/texts/')

In [8]:
# Wall-clock timing of make_doc_df on a single script (the first one).
nw = datetime.now()
df = make_doc_df(raw_scripts[0])
# Prints the elapsed time as a timedelta.
print(datetime.now() - nw)

0:00:04.015120


In [9]:
# Display the frame built by make_doc_df (one row per token, per the output below).
df

Unnamed: 0,text,lemma,pos,tag,dep,shape,is_alpha,is_stop,ner_obj
0,,,SPACE,_SP,,,False,False,
1,ten,ten,NUM,CD,nummod,xxx,True,True,TIME
2,things,thing,NOUN,NNS,ROOT,xxxx,True,False,
3,i,i,PRON,PRP,nsubj,x,True,True,
4,hate,hate,VERB,VBP,relcl,xxxx,True,False,
...,...,...,...,...,...,...,...,...,...
19169,user,user,NOUN,NN,compound,xxxx,True,False,
19170,comments,comment,NOUN,NNS,dobj,xxxx,True,False,
19171,back,back,ADV,RB,advmod,xxxx,True,True,
19172,to,to,ADP,IN,prep,xx,True,True,


In [10]:
# Summarize the 'ner_obj' column; the result shown below is a pair of arrays
# (integers and the distinct labels) — presumably per-label counts, confirm in
# ScriptFeaturizer.
df_to_stats(df, 'ner_obj')

(array([ 169,  692,  688,   37,    8,    5,   20,    9,  242, 2757,   35,
           8,  239,    1]),
 array(['CARDINAL', 'DATE', 'FAC', 'GPE', 'LANGUAGE', 'LOC', 'NORP',
        'ORDINAL', 'ORG', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME',
        'WORK_OF_ART'], dtype=object))

In [11]:


# Turn the lemma column back into a single text string.
lemmatized_text = series_to_doc(df.lemma)
# Sanity-check the first 100 characters of the lemmatized text.
print(lemmatized_text[:100])
# Run TF-IDF on a one-document corpus made of the lemmatized script.
test = scripts_to_tfidf([lemmatized_text])
# Second element of the result: show its first 10 entries (term strings, per the output).
test[1][:10]

  ten thing i hate about -PRON- by karen mccullah lutz kirsten smith ten thing i hate about -PRON- w


  'stop_words.' % sorted(inconsistent))


['10', '11', '12', '141', '16th', '1995', '1997', '400', '75', '90210']

# word embeddings example

In [12]:
from gensim.models import Word2Vec
from ScriptFeaturizer import scripts_to_tfidf, scripts_to_embeddings
from FeatureUtils import tokenize_script, load_scripts

In [13]:
# datetime is used by the timing code in the Word2Vec training cell further down.
from datetime import datetime
# NOTE(review): same load_scripts call as in the earlier sections; it re-reads the
# corpus and rebinds raw_scripts/titles — presumably kept so this section can be
# run on its own. Confirm whether the reload is still needed.
raw_scripts, titles = load_scripts(r'data/scraping/texts/')

In [14]:
# Tokenize every script into one entry of docs. stop_words=True is passed straight
# to tokenize_script — presumably it enables stop-word removal; confirm in FeatureUtils.
docs = [tokenize_script(script, stop_words=True) for script in raw_scripts]

In [19]:
# Train a small Word2Vec model on the tokenized scripts and time the run.
# Parameter meanings below follow the gensim Word2Vec documentation:
#   min_count      - ignore words whose total frequency is below this value
#   size           - dimensionality of the word vectors
#   window         - maximum distance between the current and predicted word
#   max_vocab_size - cap on unique words kept while building the vocabulary
#                    (infrequent words are pruned once the cap is exceeded)
# NOTE(review): `size` was renamed to `vector_size` in gensim 4.0; this call only
# works as written on gensim 3.x — update the keyword if the dependency is upgraded.
from datetime import datetime
nw = datetime.now()
model = Word2Vec(docs, 
                 min_count=100, 
                 size=100, 
                 window=5, 
                 max_vocab_size=2000
                )
# Prints the elapsed training time as a timedelta.
print(datetime.now() - nw)

0:00:31.135198


In [27]:
import os
import pickle

# Persist the trained Word2Vec model so it can be reused without retraining.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# back from a trusted location.
model_path = r'models/word2vec.pkl'

# Create the output directory first: open(..., 'wb') raises FileNotFoundError
# when the parent directory is missing (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, 'wb') as f:
    pickle.dump(model, f)
    


# what's done
- length of script `num_lines`
- `FeatureUtils` for feature engineering helper functions
  - master spacy df `make_doc_df`
  - `series_to_doc` for lemmatizing and other word transformations
- `ScriptFeaturizer` updates to include spaCy summary features via pandas methods
  - stats feature vectors `df_to_stats`
- word2vec skeleton framework & small model (won't scale)

# todo
- thematic counts?
  - swear words
  - sex words
- word2vec second model