## Extracting Features using text mining

In [1]:
import turicreate as tc

In [2]:
# Load the key -> key_class lookup table from CSV; column types are inferred on read
# (both columns come back as str, per the inference log printed below).
sf_keys = tc.SFrame('../datasets/geo.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [3]:
sf_keys

key,key_class
age at diagnosis,age
age at diagnosis (years),age
age unit,age
age at sample (months),age
age group,age
age_years,age
patient age (yrs),age
age (y),age
age in years,age
donor_age,age


In [4]:
# Distinct key classes present in the lookup table, materialised as a plain Python list
key_classes = list(sf_keys['key_class'].unique())
key_classes

['disease', 'cell line', 'treatment', 'age', 'strain', 'tissue']

We want one subset per key class, so we filter the table by `key_class`.

In [5]:
def create_subsets(df, column_category):
    """Split ``df`` into one sub-table per distinct value of ``column_category``.

    Parameters
    ----------
    df
        Table supporting ``df[col].unique()`` and boolean-mask row selection
        (an SFrame in this notebook).
    column_category : str
        Name of the column whose distinct values define the subsets.

    Returns
    -------
    dict
        Maps each distinct category value to the rows of ``df`` whose
        ``column_category`` equals that value.
    """
    categories = df[column_category]
    tables = {
        label: df[categories == label]
        for label in categories.unique()
    }
    # Echo the category labels that were found
    print(tables.keys())
    return tables

In [6]:
geo_tables = create_subsets(sf_keys, 'key_class')

dict_keys(['strain', 'cell line', 'age', 'disease', 'treatment', 'tissue'])


In [7]:
geo_tables['tissue']

key,key_class
tissuer type,tissue
tissue_detail,tissue
tissuetype,tissue
cell line source tissue,tissue
cell/tissue type,tissue
tissue,tissue
tissue-type,tissue
original tissue,tissue
tissue/cell lines,tissue
primary tissue,tissue


In [8]:
# Bag of words for every key in the 'disease' class, with stop words removed
disease_keys = geo_tables['disease']['key']
stop_words = tc.text_analytics.stopwords()
doc = tc.text_analytics.count_words(disease_keys).dict_trim_by_keys(stop_words, exclude=True)

In [72]:
# Learn a topic model from the bag-of-words `doc`.
# num_topics is not passed here (an explicit `num_topics = 3` argument is left commented out).
model = tc.topic_model.create(doc)

Compare the unique words that appear in the learned topics with the model's whole vocabulary.

In [73]:
model.get_topics()['word'].unique()

dtype: str
Rows: 37
['meibomian', 'interval', 'clinical', 'disease/treatment', 'state', 'specific', 'disease_specific_survival_event', 'exposure', 'primary', "subject's", 'disease', 'phase', '(dfs)', '(years)', 'severity', '(event)', 'type', 'disease-free', 'status', 'survival', 'event', '(months)', 'free', 'state)', '(inss)', 'stage', 'time', 'months', '(days)', 'advanced', 'disease_status', 'characteristic/disease', 'diseased', 'progression', 'disease_state', 'patient', 'disease/cell']

In [74]:
model.vocabulary

dtype: str
Rows: 55
['disease', 'primary', 'exposure', 'type', 'disease/cell', 'subtype', 'model', 'disease-state', 'state', 'gland', 'meibomian', "subject's", 'status', 'disease/treatment', 'patient', 'stage', 'progression', 'characteristic/disease', 'clinical', 'development', 'diseasestatus', 'outcome', 'duration', 'disease_status', 'advanced', '(host)', 'disease_state', 'severity', 'state)', '(disease', 'histology', '(inss)', 'phase', 'diseased', 'extent', 'disease_specific_survival_years', '(dfs)', 'survival', 'disease-free', 'disease_free_survival_years', '(months)', 'interval', '(days)', 'free', '(years)', 'specific', 'onset', 'age', 'months', 'time', 'event', '(event)', 'disease_specific_survival_event', 'relapse', 'disease_free_survival_event']

In [75]:
sf_topics = model.get_topics()
sf_topics

topic,word,score
0,disease,0.4303030303030304
0,advanced,0.1272727272727273
0,diseased,0.0666666666666667
0,state),0.0666666666666667
0,disease/cell,0.006060606060606
1,disease,0.2827586206896554
1,disease_status,0.0758620689655173
1,clinical,0.0758620689655173
1,disease/treatment,0.0758620689655173
1,meibomian,0.0758620689655173


In [76]:
# Aggregate per unique word: add up the word's score over every topic it appears in
word_totals = sf_topics.groupby(
    key_column_names='word',
    operations={'sum_scores': tc.aggregate.SUM('score')},
)
# Rank by total score (highest first), then drop words that are themselves key-class names
ranked_words = word_totals.sort('sum_scores', ascending=False)
sf_words = ranked_words.filter_by(key_classes, 'word', exclude=True)

In [77]:
sf_words

word,sum_scores
status,0.2912466843501328
stage,0.2827586206896554
survival,0.2484848484848485
free,0.2270815811606393
state,0.2266666666666668
(months),0.1743589743589744
interval,0.1565811965811966
disease-free,0.1497435897435898
patient,0.1448275862068966
advanced,0.1272727272727273


In [78]:
#sf_words.export_csv('features_words.csv')

### Constructing the matrix to train

In [79]:
# Candidate feature words for this key class, in ranked order, as a plain Python list
features = list(sf_words['word'])

In [80]:
features

['status',
 'stage',
 'survival',
 'free',
 'state',
 '(months)',
 'interval',
 'disease-free',
 'patient',
 'advanced',
 'progression',
 'disease/cell',
 'type',
 'characteristic/disease',
 'event',
 'phase',
 '(event)',
 '(dfs)',
 'clinical',
 'disease_status',
 'disease/treatment',
 'meibomian',
 'severity',
 '(inss)',
 'diseased',
 'state)',
 'disease_state',
 '(days)',
 'specific',
 'months',
 'time',
 'disease_specific_survival_event',
 'exposure',
 "subject's",
 'primary',
 '(years)']

In [81]:
# One binary column per feature word: 1 when the word occurs as a substring of the key, else 0.
# NOTE(review): `features` was derived from the 'disease' keys, but the rows here are the
# 'tissue' keys — confirm this cross-class pairing is intended.
# NOTE(review): matching is by substring (e.g. 'type' matches 'tissuetype'), not by token — confirm.
tissue_keys = geo_tables['tissue']['key']
sf_features = tc.SFrame({
    word: [int(word in key) for key in tissue_keys]
    for word in features
})

In [84]:
sf_features

(days),(dfs),(event),(inss),(months),(years),advanced,characteristic/disease,clinical,disease-free
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0

disease/cell,disease/treatment,disease_specific_survival _event ...,disease_state,disease_status,diseased,event,exposure
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0

free,interval,meibomian,months,patient,phase,primary,progression,severity,specific,stage,state
0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,1,0,0,0,0,0

state),status,subject's,survival,time,type
0,0,0,0,0,1
0,0,0,0,0,0
0,0,0,0,0,1
0,0,0,0,0,0
0,0,0,0,0,1
0,0,0,0,0,0
0,0,0,0,0,1
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
