In [1]:
import turicreate as tc

In [2]:
doc = tc.SFrame('age_geo.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [3]:
doc

age
age of plants
age at biopsy
age at diagnosis (y)
age (at diagnosis)
sac age (wks)
age (1#pfu_1)
taqman pool by age
age at op
age of epilepsy duration (yr) ...
age of rat


In [4]:
keys = ['age', 'tissue', 'cell line']

In [5]:
# Remove stopwords and convert to bag of words
doc = tc.text_analytics.count_words(doc['age'])
doc = doc.dict_trim_by_keys(tc.text_analytics.stopwords(), exclude=True)

In [6]:
# Learn topic model
model = tc.topic_model.create(doc)

In [7]:
out = (model.get_topics())

In [8]:
model

Class                          : TopicModel

Schema
------
Vocabulary Size                : 273

Settings
--------
Number of Topics               : 10
alpha                          : 5.0
beta                           : 0.1
Iterations                     : 10
Training time                  : 1.0043
Verbose                        : True

Accessible fields             : 
m.topics                      : An SFrame containing the topics.
m.vocabulary                  : An SArray containing the words in the vocabulary.
Useful methods                : 
m.get_topics()                : Get the most probable words per topic.
m.predict(new_docs)           : Make predictions for new documents.

In [9]:
model.vocabulary

dtype: str
Rows: 273
['plants', 'age', 'biopsy', '(y)', 'diagnosis', 'diagnosis)', '(at', '(wks)', 'sac', '(1#pfu_1)', 'pool', 'taqman', 'op', 'duration', '(yr)', 'epilepsy', 'rat', '(years)', 'ffpe', 'block', 'mouse', 'examination', 'age-at-examination', '(1#pvu_1)', 'trees', 'primary', 'age-at-biopsy-years', 'tissue', '(1#pfu_2)', 'sampling', 'onset', '(wk)', 'mice', '(yr/mo)', 'ipi', "mother's", '(year)', 'maternal', '(yrs)', 'host', '65', '≥', 'gestation', 'age-at-onset', 'days)', '(postnatal', 'death', 'withdrawal', 'blood', 'vitro', 'years)', '(in', 'collection', 'age_at_diagnosis', 'group', '(month)', 'age@dx', '(pup)', 'average', 'age_diag', '(days)', 'harvested', 'birth)', '(after', 'sacrifice', 'description', 'category', '(months)', 'sample', 'commenced', 'high', 'salt', 'diet', 'culture', 'age/gender', 'diagonosis', 'age(yrs.)', 'seedlings', 'age.brain', 'clone', 'age.blood', 'drawing', 'gestation)', '(weeks', 'surgery', 'age(years)', 'unit', 'developmental', '(years', 'surg

In [10]:
table = model.get_topics()

In [11]:
list_words = table.groupby(key_column_names='word',operations={'sum_scores': tc.aggregate.SUM('score')})

In [12]:
list_words

word,sum_scores
culture,0.0741190765492
sample,0.0475086906141
(range),0.0105465004794
start,0.0510589941025
(years),0.157148514157
draw,0.0201342281879
ffpe,0.0218068535826
(yrs),0.097286028121
onset,0.0838934655389
gestational,0.0769230769231


In [13]:
list_words.print_rows(38,2)

+-------------+------------------+
|     word    |    sum_scores    |
+-------------+------------------+
|   culture   | 0.0741190765492  |
|    sample   | 0.0475086906141  |
|   (range)   | 0.0105465004794  |
|    start    | 0.0510589941025  |
|   (years)   |  0.157148514157  |
|     draw    | 0.0201342281879  |
|     ffpe    | 0.0218068535826  |
|    (yrs)    |  0.097286028121  |
|    onset    | 0.0838934655389  |
| gestational | 0.0769230769231  |
|   sampling  | 0.0255164034022  |
|    (year)   | 0.0359212050985  |
|   epilepsy  | 0.0183727034121  |
|   (months)  | 0.0643127364439  |
|     cell    | 0.0255164034022  |
|  treatment  | 0.0218068535826  |
|     diet    | 0.0318602261048  |
|   maternal  | 0.0271216097988  |
|     age     |  2.38158357612   |
|     (y)     | 0.0271668822768  |
|  withdrawal | 0.0101569713758  |
|     (in     | 0.0359212050985  |
|     (yr)    | 0.0271216097988  |
|  diagnosis  |  0.152198454487  |
|    birth)   | 0.0188679245283  |
|     day)    | 0.02

In [14]:
topics_features = list_words.sort('sum_scores', ascending= False)

In [15]:
topics_features.print_rows(38,2)

+-------------+------------------+
|     word    |    sum_scores    |
+-------------+------------------+
|     age     |  2.38158357612   |
|   (years)   |  0.157148514157  |
|  diagnosis  |  0.152198454487  |
|    (yrs)    |  0.097286028121  |
|    onset    | 0.0838934655389  |
| gestational | 0.0769230769231  |
|   culture   | 0.0741190765492  |
|   (months)  | 0.0643127364439  |
|     days    | 0.0590961761298  |
|   surgery   | 0.0530401034929  |
|    blood    | 0.0524152106886  |
|    start    | 0.0510589941025  |
|    sample   | 0.0475086906141  |
|     time    | 0.0401034928849  |
|    (days)   |  0.03683737646   |
|    (year)   | 0.0359212050985  |
|     (in     | 0.0359212050985  |
|     diet    | 0.0318602261048  |
|    death    | 0.0318602261048  |
|    tissue   | 0.0297219558965  |
|    group    | 0.0286241920591  |
|    donor    | 0.0286241920591  |
|     (y)     | 0.0271668822768  |
|   maternal  | 0.0271216097988  |
|     (yr)    | 0.0271216097988  |
|     cell    | 0.02

In [16]:
#vector
all_features = [i for i in topics_features['word']]

#drop the keys
features = set(all_features) - set(keys)
features_nokeys = [k for k in features]

#include the scores

In [17]:
features_nokeys

['epilepsy',
 'sampling',
 'sample',
 'culture',
 '(range)',
 'birth)',
 '(years)',
 'maternal',
 'surgery',
 '(in',
 'gestational',
 'onset',
 'death',
 'group',
 'cell',
 'start',
 'experiment',
 'treatment',
 'day)',
 'donor',
 'draw',
 'patient',
 'ffpe',
 'time',
 '(yrs)',
 'diet',
 '(year)',
 'pig',
 '(months)',
 '(y)',
 '(days)',
 'days',
 'diagnosis',
 'withdrawal',
 '(yr)',
 'blood']

In [24]:
type(features)

set

In [15]:
topics_features.print_rows(37,2)

+-------------+-----------------+
|     word    |    sum_scores   |
+-------------+-----------------+
|     age     |  2.38472097976  |
|   (years)   |  0.16329733227  |
|  diagnosis  |  0.150684931507 |
|    death    | 0.0681520314548 |
|    (yrs)    | 0.0668414154653 |
|    start    | 0.0668127053669 |
|   culture   | 0.0636759537059 |
|    onset    |  0.059513830679 |
|    blood    | 0.0540882259973 |
|   patient   | 0.0537352555701 |
|   (months)  | 0.0537352555701 |
|     days    | 0.0449069003286 |
|  collection | 0.0449069003286 |
|     (yr)    |  0.040629095675 |
|    years    |  0.040629095675 |
|     cell    |  0.040629095675 |
|     (in     |  0.040629095675 |
|    group    | 0.0386052303861 |
|    sample   | 0.0343671416597 |
|   surgery   | 0.0339539978094 |
|    (days)   | 0.0339539978094 |
|     (at     | 0.0300096805421 |
|   sampling  | 0.0300096805421 |
|    tissue   | 0.0286241920591 |
| gestational | 0.0286241920591 |
|   current   | 0.0261519302615 |
|    days)    

In [7]:
out.print_rows(50,3)

+-------+-------------+-----------------+
| topic |     word    |      score      |
+-------+-------------+-----------------+
|   0   |     age     |  0.107947805457 |
|   0   |   patient   | 0.0486358244365 |
|   0   |     (at     | 0.0367734282325 |
|   0   |     diet    | 0.0367734282325 |
|   0   |   sampling  | 0.0249110320285 |
|   1   |     age     |  0.512154233026 |
|   1   |  collection | 0.0259849119866 |
|   1   |  operation  | 0.0176026823135 |
|   1   |  (postnatal | 0.0176026823135 |
|   1   |     (yr)    | 0.0176026823135 |
|   2   |   (years)   | 0.0668414154653 |
|   2   | gestational | 0.0668414154653 |
|   2   |  commenced  | 0.0275229357798 |
|   2   |     time    | 0.0275229357798 |
|   2   |   (weeks)   | 0.0275229357798 |
|   3   |    tissue   | 0.0492196878751 |
|   3   |     days    | 0.0492196878751 |
|   3   |   surgery   | 0.0372148859544 |
|   3   |    start    | 0.0372148859544 |
|   3   |   maternal  | 0.0372148859544 |
|   4   |     age     |  0.3887349

In [8]:
topics = (model.get_topics(output_type='topic_words'))

In [9]:
topics

words
"[age, patient, (at, diet, sampling] ..."
"[age, collection, operation, (yr), ..."
"[(years), gestational, commenced, (weeks), t ..."
"[tissue, days, start, surgery, maternal] ..."
"[age, (yrs), gestational, donor, start] ..."
"[age, cell, (days), group, culture] ..."
"[age, (yrs), culture, agenotype, sacrifice] ..."
"[age, diagnosis, onset, years, treatment] ..."
"[age, (months), (years), sample, biopsy] ..."
"[(years), death, blood, onset, draw] ..."


In [80]:
topics.export_csv('topics.csv')

In [81]:
model

Class                          : TopicModel

Schema
------
Vocabulary Size                : 273

Settings
--------
Number of Topics               : 10
alpha                          : 5.0
beta                           : 0.1
Iterations                     : 10
Training time                  : 0.0116
Verbose                        : True

Accessible fields             : 
m.topics                      : An SFrame containing the topics.
m.vocabulary                  : An SArray containing the words in the vocabulary.
Useful methods                : 
m.get_topics()                : Get the most probable words per topic.
m.predict(new_docs)           : Make predictions for new documents.

In [82]:
model.vocabulary

dtype: str
Rows: 273
['plants', 'age', 'biopsy', '(y)', 'diagnosis', 'diagnosis)', '(at', '(wks)', 'sac', '(1#pfu_1)', 'pool', 'taqman', 'op', 'duration', '(yr)', 'epilepsy', 'rat', '(years)', 'ffpe', 'block', 'mouse', 'examination', 'age-at-examination', '(1#pvu_1)', 'trees', 'primary', 'age-at-biopsy-years', 'tissue', '(1#pfu_2)', 'sampling', 'onset', '(wk)', 'mice', '(yr/mo)', 'ipi', "mother's", '(year)', 'maternal', '(yrs)', 'host', '65', '≥', 'gestation', 'age-at-onset', 'days)', '(postnatal', 'death', 'withdrawal', 'blood', 'vitro', 'years)', '(in', 'collection', 'age_at_diagnosis', 'group', '(month)', 'age@dx', '(pup)', 'average', 'age_diag', '(days)', 'harvested', 'birth)', '(after', 'sacrifice', 'description', 'category', '(months)', 'sample', 'commenced', 'high', 'salt', 'diet', 'culture', 'age/gender', 'diagonosis', 'age(yrs.)', 'seedlings', 'age.brain', 'clone', 'age.blood', 'drawing', 'gestation)', '(weeks', 'surgery', 'age(years)', 'unit', 'developmental', '(years', 'surg

In [83]:
pred = model.predict(doc)

In [84]:
for i in pred:
    print(i)

1
7
8
8
0
7
9
6
9
9
7
4
7
3
4
9
6
7
2
3
7
9
7
9
7
8
5
7
6
4
7
8
4
8
9
9
7
9
1
7
6
6
9
2
9
4
1
8
5
5
9
7
9
6
8
9
6
9
8
8
6
9
9
5
8
5
0
3
8
6
2
8
4
1
6
9
6
9
7
0
1
9
0
8
6
6
6
3
4
7
3
1
6
4
8
4
9
6
6
3
7
8
9
7
7
9
2
4
4
5
8
9
9
7
0
9
1
4
6
5
1
5
9
1
9
3
9
8
9
8
0
8
1
7
5
8
4
5
8
6
9
3
8
6
8
9
3
5
4
4
8
3
2
2
6
0
9
3
6
7
8
4
5
8
4
9
4
8
4
8
8
5
5
2
1
2
4
8
8
8
3
9
5
8
8
4
6
6
9
9
8
7
8
2
4
6
0
9
7
7
0
8
4
4
7
3
9
9
8
7
7
8
9
3
6
7
7
5
0
7
7
4
4
5
9
9
5
8
8
6
6
9
9
5
7
0
5
9
7
7
8
6
9
7
0
8
6
3
9
8
0
8
7
8
8
6
8
7
8
8
4
7
6
4
9
6
8
1
5
4
8
3
6
4
9
6
3
8
6
5
8
8
7
8
8
5
6
8
8
9
7
5
8
8
8
8
2


In [85]:
prob = model.predict(doc, output_type = 'probability')

In [86]:
for j in prob:
    print(j)

array('d', [0.11538461538461539, 0.09615384615384616, 0.09615384615384616, 0.09615384615384616, 0.09615384615384616, 0.09615384615384616, 0.09615384615384616, 0.09615384615384616, 0.11538461538461539, 0.09615384615384616])
array('d', [0.11538461538461539, 0.09615384615384616, 0.09615384615384616, 0.09615384615384616, 0.11538461538461539, 0.09615384615384616, 0.09615384615384616, 0.09615384615384616, 0.09615384615384616, 0.09615384615384616])
array('d', [0.09433962264150944, 0.11320754716981132, 0.11320754716981132, 0.09433962264150944, 0.09433962264150944, 0.09433962264150944, 0.09433962264150944, 0.09433962264150944, 0.09433962264150944, 0.11320754716981132])
array('d', [0.11320754716981132, 0.11320754716981132, 0.09433962264150944, 0.09433962264150944, 0.09433962264150944, 0.09433962264150944, 0.11320754716981132, 0.09433962264150944, 0.09433962264150944, 0.09433962264150944])
array('d', [0.11320754716981132, 0.09433962264150944, 0.09433962264150944, 0.09433962264150944, 0.0943396226