# Load 'bcl2' sentences data

In [1]:
# Read the tab-separated article file, letting Spark infer the column types,
# then name the two resulting columns 'id' and 'sentences'.
bcl2 = (
    spark.read
    .csv('raw_data/article-content.txt', sep='\t', inferSchema=True)
    .toDF('id', 'sentences')
)

In [2]:
# Preview the first five rows of the loaded DataFrame.
bcl2.show(5)

+--------+--------------------+
|      id|           sentences|
+--------+--------------------+
|28386116| we experimentall...|
|28386116| to evaluate the ...|
|28386116|it has been well ...|
|28386116| our bioinformati...|
|28386116| it has been prev...|
+--------+--------------------+
only showing top 5 rows



# Lemmatizing words

In [3]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [48]:
from nltk.stem import WordNetLemmatizer
# One shared lemmatizer instance: lemmatizer() reads it as a global, and it is
# used again later to lemmatize the gene-regulation vocabulary.
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(s):
    """Split a sentence on whitespace and lemmatize every token as a verb.

    Args:
        s: Sentence text, or None when the source cell is null.

    Returns:
        A list with one lemma per whitespace-separated token, in the original
        order. None, empty and whitespace-only input all give an empty list.
    """
    # A null sentence reaches the function as None; calling .split() on it
    # would raise inside the UDF, so treat it as "no tokens" instead.
    if s is None:
        return []
    # 'v' is passed as the part-of-speech argument so tokens are lemmatized as
    # verbs. Tokens are split on whitespace only, so attached punctuation
    # (e.g. 'proteins,') stays part of the token.
    return [wordnet_lemmatizer.lemmatize(w, 'v') for w in s.split()]
# Wrap lemmatizer() as a Spark UDF whose result column is an array of strings.
lemmatizer_udf = udf(lemmatizer, returnType=ArrayType(StringType()))

In [115]:
# Keep the original id and sentence text and add the lemmatized tokens as a
# third column, 'lemm_words'.
bcl2_lemm = bcl2.select(
    bcl2['id'],
    bcl2['sentences'],
    lemmatizer_udf(bcl2['sentences']).alias('lemm_words'),
)
# Collect the result into a pandas DataFrame on the driver and spot-check the
# lemmas of one row (positional index 155).
pd_df = bcl2_lemm.toPandas()
pd_df['lemm_words'].iloc[155]

[u'the',
 u'cell',
 u'fate',
 u'decision',
 u'on',
 u'tna-nanoenvironment',
 u'have',
 u'be',
 u'report',
 u'to',
 u'possibly',
 u'regulate',
 u'proliferative',
 u'activities',
 u'via',
 u'expression',
 u'of',
 u'p27',
 u'and',
 u'bcl2',
 u'tumor',
 u'suppressor',
 u'proteins,',
 u'cogent',
 u'with',
 u'skp2',
 u'and',
 u'bcl2',
 u'oncogenic',
 u'proteins',
 u'suppression']

# Load gene regulation vocabulary

In [116]:
import pandas as pd

In [117]:
# Load the vocabulary CSV into a column named 'word'. header=None spells out
# what passing names= alone already implies: the first line of the file is
# read as data, not as a header row.
gr_df = pd.read_csv(
    'raw_data/bcl2_regulation_vocab.csv',
    header=None,
    names=['word'],
)

In [118]:
# Lemmatize every vocabulary entry as a verb ('v'), the same call that
# lemmatizer() applies to sentence tokens, so both sides are compared in the
# same form.
gr_lemm_words = [
    wordnet_lemmatizer.lemmatize(vocab_word, 'v')
    for vocab_word in gr_df['word']
]

# Filter sentences that have 'bcl2' and any word from the `gr_lemm_words` list

In [119]:
from pyspark.sql.functions import udf

In [120]:
def filter_sentences(l):
    """Return the vocabulary words that occur among a sentence's lemmas.

    Args:
        l: List of lemmatized tokens for one sentence, or None.

    Returns:
        The distinct tokens that also appear in the global ``gr_lemm_words``,
        sorted in ascending order, or None when nothing matches (including
        when ``l`` is None or empty).
    """
    # NOTE(review): only vocabulary membership is checked here; the token
    # 'bcl2' itself is not required. Presumably the input file is already
    # restricted to bcl2 sentences -- confirm.
    # A null or empty token list cannot match anything; returning early also
    # avoids set(None) raising inside the UDF.
    if not l:
        return None
    # intersection() accepts any iterable, so the vocabulary list does not
    # have to be copied into a new set for every row.
    common_words = set(l).intersection(gr_lemm_words)
    # Set iteration order is arbitrary; sort so the same sentence always
    # produces the same array.
    return sorted(common_words) if common_words else None
# Wrap filter_sentences() as a Spark UDF returning an array of strings; a
# None result from the function becomes a null cell.
filter_sentences_udf = udf(filter_sentences, returnType=ArrayType(StringType()))

In [121]:
# Tag each sentence with the vocabulary words found among its lemmas; the
# 'filter_sentence' column is null for sentences with no match.
bcl2_regulation_df = bcl2_lemm.select(
    bcl2_lemm['id'],
    bcl2_lemm['sentences'],
    filter_sentences_udf(bcl2_lemm['lemm_words']).alias('filter_sentence'),
)

In [123]:
# Keep only the sentences that matched at least one vocabulary word.
bcl2_regulation_df = bcl2_regulation_df.where(
    bcl2_regulation_df['filter_sentence'].isNotNull()
)

In [124]:
# Display the matching sentences with full, untruncated column contents.
bcl2_regulation_df.show(truncate=False)

+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------+
|id      |sentences                                                                                                                                                                                                                                                                                                                                  |filter_sentence                              |
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------