## Extracting Features using text mining

In [1]:
import turicreate as tc

In [6]:
# Load the table of metadata keys and their class labels (columns: key, key_class).
sf_keys = tc.SFrame('../datasets/geo1.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument


------------------------------------------------------


In [7]:
sf_keys

key,key_class
# of tissue = 36 tissue,tissue
4 months tissue,tissue
prostate tissue,tissue
# of tissue = 1 tissue,tissue
# of tissue = 40 tissue,tissue
# of tissue = 42 tissue,tissue
fetal vs adult tissue,tissue
# of tissue = 38 tissue,tissue
a.thaliana tissue,tissue
age of ffpe tissue,tissue


In [8]:
len(sf_keys)

1645

In [10]:
# Distinct key classes present in the table, as a plain Python list
key_classes = list(sf_keys['key_class'].unique())
key_classes

['gender',
 'cell line',
 'genotype',
 'sex',
 'treatment',
 'age',
 'cell type',
 'strain',
 'time',
 'disease',
 'tissue']

We want one subset per key class, so we filter the table once for each class: rows that belong to the class keep their label, and every other row is relabelled `no <class>`.

In [11]:
def create_subsets(df, column_category):
    """Build a one-vs-rest table for every distinct value of ``column_category``.

    For each category, the rows carrying that category are kept unchanged and
    every other row gets its ``column_category`` value overwritten with
    ``'no ' + category``. The matching rows come first in the resulting table.

    Parameters
    ----------
    df : table supporting mask indexing, column assignment and ``append``
        (the notebook passes a ``turicreate.SFrame``).
    column_category : str
        Name of the column holding the category labels.

    Returns
    -------
    dict
        Maps each category to its one-vs-rest table. The dictionary keys are
        also printed as a side effect.
    """
    tables = {}
    for category in df[column_category].unique():
        is_member = df[column_category] == category
        members = df[is_member]
        others = df[df[column_category] != category]
        # Collapse every other class into a single negative label
        others[column_category] = 'no ' + category
        tables[category] = members.append(others)

    print(tables.keys())
    return tables

In [12]:
# One one-vs-rest table per key class, keyed by the class name
geo_tables = create_subsets(sf_keys, 'key_class')

['genotype', 'gender', 'age', 'cell line', 'disease', 'sex', 'strain', 'tissue', 'treatment', 'time', 'cell type']


In [13]:
geo_tables['cell line'].print_rows(num_rows=355, num_columns=2)

+--------------------------------+--------------+
|              key               |  key_class   |
+--------------------------------+--------------+
|        human cell line         |  cell line   |
|    dendritic cell lineages     |  cell line   |
|          or cell line          |  cell line   |
|         huh7 cell line         |  cell line   |
|        hybrid cell line        |  cell line   |
|        tumor cell line         |  cell line   |
|         atcc cell line         |  cell line   |
|         host cell line         |  cell line   |
|        donor cell line         |  cell line   |
|     fibrosarcoma cell line     |  cell line   |
|        insect cell line        |  cell line   |
|       es cell line type        |  cell line   |
|      responder cell line       |  cell line   |
|        stable cell line        |  cell line   |
|      background cell line      |  cell line   |
|        cell line/clone         |  cell line   |
|      reference cell lines      |  cell line   |


---
## Topic Modeling for Feature Extraction

In [9]:
disease_df = geo_tables['disease']

In [10]:
disease_df

key,key_class
disease,disease
primary disease,disease
disease exposure,disease
disease/cell type,disease
disease type,disease
disease subtype,disease
disease model,disease
disease-state,disease
meibomian gland disease state ...,disease
subject's disease state,disease


In [12]:
# Turn every key into a bag of words, then drop the stopwords from each bag
doc = tc.text_analytics.count_words(disease_df['key']).dict_trim_by_keys(
    tc.text_analytics.stopwords(), exclude=True)

In [13]:
# Learn a topic model from the bags of words.
# num_topics is not passed, so the library default applies (e.g. num_topics=3 would override it).
model = tc.topic_model.create(doc)

Compare the number of unique words that appear in the topics with the size of the whole vocabulary.

In [14]:
# Count the distinct words that show up in the topics' word lists
topic_words = model.get_topics()['word']
print('Unique keys {}'.format(len(topic_words.unique())))
# For comparison, the whole vocabulary can be inspected via model.vocabulary

Unique keys 34


In [15]:
# Aggregate the scores of each distinct word across all topics
sf_topics = model.get_topics()
sf_words = sf_topics.groupby(key_column_names='word', operations={'sum_scores': tc.aggregate.SUM('score')})
# Drop the words that are themselves key classes, then rank by summed score.
# Sorting is the last step so the ranking does not depend on filter_by
# preserving the row order of its input.
sf_words = sf_words.filter_by(key_classes, 'word', exclude=True).sort('sum_scores', ascending=False)

In [18]:
sf_words#.print_rows(3,2)

word,sum_scores
cell,0.264415156507
line,0.253564018764
type,0.134301505903
source,0.130717497568
stage,0.128177966102
time,0.100543478261
sample,0.0608311391658
patient,0.0466179159049
background,0.0466179159049
status,0.0461956521739


In [62]:
#sf_words.export_csv('features_words.csv')

### Constructing the matrix to train

In [19]:
# Use the first ten words of sf_words as features (the cutoff can be changed)
features = list(sf_words['word'])[0:10]

In [20]:
# One indicator column per feature word: 1 when the word occurs in the key
# (substring match), 0 otherwise
sf_features = tc.SFrame({
    word: [1 if (word in key) else 0 for key in disease_df['key']]
    for word in features
})

In [21]:
sf_features

background,cell,line,patient,sample,source,stage,status,time,type
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0,0,1
0,0,0,0,0,0,0,0,0,1
0,0,0,0,0,0,0,0,0,1
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0


In [22]:
# Give both tables a shared row-number column ('id') and join on it.
# The numbered tables get their own names so that disease_df and sf_features
# stay unchanged: re-running this cell (or numbering the same table again
# later) would otherwise hit an already existing 'id' column.
disease_indexed = disease_df.add_row_number()
features_indexed = sf_features.add_row_number()
disease_indexed.join(features_indexed, on='id', how='left')

id,key,key_class,background,cell,line,patient,sample,source,stage,status,time
0,disease,disease,0,0,0,0,0,0,0,0,0
1,primary disease,disease,0,0,0,0,0,0,0,0,0
2,disease exposure,disease,0,0,0,0,0,0,0,0,0
3,disease/cell type,disease,0,1,0,0,0,0,0,0,0
4,disease type,disease,0,0,0,0,0,0,0,0,0
5,disease subtype,disease,0,0,0,0,0,0,0,0,0
6,disease model,disease,0,0,0,0,0,0,0,0,0
7,disease-state,disease,0,0,0,0,0,0,0,0,0
8,meibomian gland disease state ...,disease,0,0,0,0,0,0,0,0,0
9,subject's disease state,disease,0,0,0,0,0,0,0,0,0

type
0
0
0
1
1
1
0
0
0
0


---
### Function for appending features into the input matrix

In [34]:
def create_feature_matrix(category_df, n_features=10):
    """Append binary word-feature columns to a key-class table.

    A topic model is learned on the ``key`` column. The scores of the words
    returned by ``model.get_topics()`` are summed per word, words that are
    themselves key classes are dropped, and the first ``n_features`` words by
    summed score become indicator columns.

    Relies on the notebook-level ``key_classes`` list and on ``tc``
    (turicreate) being imported.

    Parameters
    ----------
    category_df : turicreate.SFrame
        Table with a ``key`` column. It must not already contain an ``id``
        column, because a row-number column of that name is added.
    n_features : int, optional
        Maximum number of feature words to keep (default 10).

    Returns
    -------
    turicreate.SFrame
        ``category_df`` with an ``id`` row-number column plus one 0/1 column
        per selected word (1 when the word occurs in the key as a substring).
    """
    # Remove stopwords and convert to bag of words
    doc = tc.text_analytics.count_words(category_df['key'])
    doc = doc.dict_trim_by_keys(tc.text_analytics.stopwords(), exclude=True)
    # Learn topic model
    model = tc.topic_model.create(doc)
    # Aggregate the scores of each distinct word across all topics
    sf_topics = model.get_topics()
    sf_words = sf_topics.groupby(key_column_names='word',
                                 operations={'sum_scores': tc.aggregate.SUM('score')})
    # Filter out the key-class words first and sort last, so the ranking does
    # not depend on filter_by preserving the row order of its input
    sf_words = sf_words.filter_by(key_classes, 'word', exclude=True)
    sf_words = sf_words.sort('sum_scores', ascending=False)
    features = list(sf_words['word'])[0:n_features]

    # Note: `word in key` is a substring test, so 'line' also matches 'lineages'
    sf_features = tc.SFrame({
        word: [1 if (word in key) else 0 for key in category_df['key']]
        for word in features
    })

    # Concatenate the features with the category table through a shared row number
    category_df = category_df.add_row_number()
    sf_features = sf_features.add_row_number()
    # The joined table must be returned: join() builds a new SFrame and leaves
    # category_df itself without the feature columns
    return category_df.join(sf_features, on='id', how='left')

---
### Function for creating feature matrix


In [14]:
tissue_df = geo_tables['tissue']

In [15]:
def create_features(category_df, n_features = 10):
    """Rank candidate feature words for a key-class table with a topic model.

    A topic model is learned on the ``key`` column. The scores of the words
    returned by ``model.get_topics()`` are summed per word, and words that are
    themselves key classes are dropped.

    Relies on the notebook-level ``key_classes`` list and on ``tc``
    (turicreate) being imported.

    Parameters
    ----------
    category_df : turicreate.SFrame
        Table with a ``key`` column.
    n_features : int, optional
        Maximum number of feature words to return (default 10).

    Returns
    -------
    (turicreate.SFrame, list)
        ``sf_words`` with columns ``word`` and ``sum_scores`` in descending
        score order, and the list of its first ``n_features`` words.
    """
    # Remove stopwords and convert to bag of words
    doc = tc.text_analytics.count_words(category_df['key'])
    doc = doc.dict_trim_by_keys(tc.text_analytics.stopwords(), exclude=True)
    # Learn topic model
    model = tc.topic_model.create(doc)
    # Aggregate the scores of each distinct word across all topics
    sf_topics = model.get_topics()
    sf_words = sf_topics.groupby(key_column_names='word',
                                 operations={'sum_scores': tc.aggregate.SUM('score')})
    # Filter out the key-class words first and sort last, so the ranking does
    # not depend on filter_by preserving the row order of its input
    sf_words = sf_words.filter_by(key_classes, 'word', exclude=True)
    sf_words = sf_words.sort('sum_scores', ascending=False)
    # Keep the leading words as features
    features = list(sf_words['word'])[0:n_features]
    return sf_words, features

In [16]:
sf_words, features = create_features(tissue_df)
sf_words.print_rows(num_rows=3, num_columns=2)

+------+-----------------+
| word |    sum_scores   |
+------+-----------------+
| cell |  0.228652028746 |
| line |  0.139723220759 |
| type | 0.0914357302942 |
+------+-----------------+
[21 rows x 2 columns]



---
### Function to create the final input matrix including weights (from topic model scores)

In [17]:
def get_input_matrix(features, category_df, sf_words):
    """Build the weighted input matrix for one key class.

    Every word in ``features`` becomes a column whose value is ``1.0 + score``
    for rows whose ``key`` contains the word (substring match) and ``0.0``
    otherwise. ``score`` is the word's ``sum_scores`` entry in ``sf_words``.

    Parameters
    ----------
    features : list
        Feature words; each must appear in ``sf_words['word']``.
    category_df : turicreate.SFrame
        Table with a ``key`` column. A row-number column ``id`` is added.
    sf_words : turicreate.SFrame
        Table with ``word`` and ``sum_scores`` columns.

    Returns
    -------
    turicreate.SFrame
        ``category_df`` with the ``id`` column and one weighted column per
        feature word.
    """
    keys = category_df['key']
    indicator_columns = {}
    for word in features:
        indicator_columns[word] = [1 if (word in key) else 0 for key in keys]

    # Align the indicator columns with the input rows via a shared row number
    indexed_rows = category_df.add_row_number()
    indexed_indicators = tc.SFrame(indicator_columns).add_row_number()
    final_table = indexed_rows.join(indexed_indicators, on='id', how='left')

    # Scale each indicator by (1 + summed topic score) of its word
    for word in features:
        column = str(word)
        matching = sf_words[sf_words['word'] == column]
        score = matching['sum_scores'].astype(float)[0]
        weight = 1.0 + score
        final_table[column] = [weight * flag for flag in final_table[column]]

    return final_table

In [18]:
my_input = get_input_matrix(features, tissue_df, sf_words)

In [19]:
my_input.print_rows(num_rows=355, num_columns=13)

+-----+--------------------------------+-----------+---------------+
|  id |              key               | key_class |    (months)   |
+-----+--------------------------------+-----------+---------------+
|  0  |    # of tissue = 36 tissue     |   tissue  |      0.0      |
|  1  |        4 months tissue         |   tissue  |      0.0      |
|  2  |        prostate tissue         |   tissue  |      0.0      |
|  3  |     # of tissue = 1 tissue     |   tissue  |      0.0      |
|  4  |    # of tissue = 40 tissue     |   tissue  |      0.0      |
|  5  |    # of tissue = 42 tissue     |   tissue  |      0.0      |
|  6  |     fetal vs adult tissue      |   tissue  |      0.0      |
|  7  |    # of tissue = 38 tissue     |   tissue  |      0.0      |
|  8  |       a.thaliana tissue        |   tissue  |      0.0      |
|  9  |       age of ffpe tissue       |   tissue  |      0.0      |
|  10 |    day of tissue dissection    |   tissue  |      0.0      |
|  11 |         age and tissue    