# Load 'bcl2' sentences data

In [1]:
bcl2 = spark.read.csv('raw_data/article-content.txt', sep='\t', inferSchema=True).toDF('id', 'sentences')

In [2]:
bcl2.show(5)

+--------+--------------------+
|      id|           sentences|
+--------+--------------------+
|28386116| we experimentall...|
|28386116| to evaluate the ...|
|28386116|it has been well ...|
|28386116| our bioinformati...|
|28386116| it has been prev...|
+--------+--------------------+
only showing top 5 rows



# Lemmatizing words

In [3]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [48]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(s):
    words_list = [wordnet_lemmatizer.lemmatize(w, 'v') for w in s.split()]
    return(words_list)
lemmatizer_udf = udf(lemmatizer, ArrayType(StringType()))

In [204]:
bcl2_lemm = bcl2.select(bcl2.id, bcl2.sentences, lemmatizer_udf(bcl2.sentences).alias('lemm_words'))

# Load gene regulation vocabulary

In [177]:
import pandas as pd

In [178]:
gr_df = pd.read_csv('raw_data/bcl2_regulation.csv', names=['word'])

In [185]:
gr_lemm_words = [wordnet_lemmatizer.lemmatize(w.lower(), 'v') for w in gr_df['word'] ]

# Filter sentences that has 'bcl2' and any words from the `gr_lemm_words` list

In [186]:
from pyspark.sql.functions import udf

In [187]:
def filter_bcl2_regulation(l):
    set0 = set(gr_lemm_words)
    set1 = set(l)
    common_words = list(set0.intersection(set1))
    if len(common_words) > 0:
        return(common_words)
    else:
        return(None)
filter_bcl2_regulation_udf = udf(filter_bcl2_regulation, ArrayType(StringType()))

In [206]:
bcl2_regulation_df = bcl2_lemm.select(bcl2_lemm.id, bcl2_lemm.sentences, bcl2_lemm.lemm_words, filter_bcl2_regulation_udf(bcl2_lemm.lemm_words).alias('bcl2_regulation'))

In [207]:
bcl2_regulation_df = bcl2_regulation_df.filter(bcl2_regulation_df.bcl2_regulation.isNotNull() )

In [208]:
bcl2_regulation_df.show()

+--------+--------------------+--------------------+--------------------+
|      id|           sentences|          lemm_words|     bcl2_regulation|
+--------+--------------------+--------------------+--------------------+
|28386116|it has been well ...|[it, have, be, we...|    [proto-oncogene]|
|28386116| compared to the ...|[compare, to, the...|          [increase]|
|28386116|bcl2 is a human p...|[bcl2, be, a, hum...|    [proto-oncogene]|
|28386116| many examples ex...|[many, examples, ...|           [elevate]|
|28386116| several mechanis...|[several, mechani...|    [overexpression]|
|28386116| we observed ~75%...|[we, observe, ~75...|          [increase]|
|28386116| taken together, ...|[take, together,,...|    [proto-oncogene]|
|28382141| however, the exp...|[however,, the, e...|    [overexpression]|
|28382141| in univariate su...|[in, univariate, ...|          [survival]|
|28382141| enktl, an ebv-as...|[enktl,, an, ebv-...|[overexpression, ...|
|28382141| however, the exp...|[howeve

# Load bcl2 family genes

In [209]:
bcl2_family = pd.read_csv('raw_data/bcl2_family_genes.csv', names=['genes'])
bcl2_family_lemm = [wordnet_lemmatizer.lemmatize(w.lower(), 'v') for w in bcl2_family['genes'] ]

# Filter sentences that has bcl2, regulation word and bcl2 family genes

In [210]:
def filter_bcl2_family(l):
    set0 = set(bcl2_family_lemm)
    set1 = set(l)
    common_words = list(set0.intersection(set1))
    if len(common_words) > 0:
        return(common_words)
    else:
        return(None)
filter_bcl2_family_udf = udf(filter_bcl2_family, ArrayType(StringType()))

In [216]:
# bcl2_family_df = bcl2_regulation_df.select(bcl2_regulation_df.id, bcl2_regulation_df.sentences, bcl2_regulation_df.bcl2_regulation, filter_bcl2_family_udf(bcl2_regulation_df.sentences).alias('bcl2_family'))
bcl2_family_df = bcl2_regulation_df.select(bcl2_regulation_df.id, 
                                           bcl2_regulation_df.sentences, 
                                           bcl2_regulation_df.lemm_words, 
                                           bcl2_regulation_df.bcl2_regulation, 
                                           filter_bcl2_family_udf(bcl2_regulation_df.lemm_words).alias('bcl2_family'))

In [217]:
bcl2_family_df = bcl2_family_df.filter(bcl2_family_df.bcl2_family.isNotNull())

In [218]:
bcl2_family_df.show()

+--------+--------------------+--------------------+--------------------+-----------+
|      id|           sentences|          lemm_words|     bcl2_regulation|bcl2_family|
+--------+--------------------+--------------------+--------------------+-----------+
|28369145| albicans or go-p...|[albicans, or, go...|          [increase]|      [bax]|
|28369145| (f) increased ra...|[(f), increase, r...|[increase, apopto...|      [bax]|
|28367088| the addition of ...|[the, addition, o...|  [increase, reduce]|      [bax]|
|28350842| it was demonstra...|[it, be, demonstr...|[anti-apoptotic, ...|      [bax]|
|28334048| also, there was ...|[also,, there, be...|[pro-apoptotic, i...|      [bax]|
|28157696|tramp cells expos...|[tramp, cells, ex...|             [alter]|      [bax]|
|28105423|seminal mirna-122...|[seminal, mirna-1...|  [increase, reduce]|      [bax]|
|28076382|g004to assess whe...|[g004to, assess, ...|[pro-apoptotic, a...|      [bax]|
|28073348| bcl2 protein is ...|[bcl2, protein, b...|[p

In [219]:
bcl2_family_df.orderBy('bcl2_family').show()

+--------+--------------------+--------------------+--------------------+------------------+
|      id|           sentences|          lemm_words|     bcl2_regulation|       bcl2_family|
+--------+--------------------+--------------------+--------------------+------------------+
|27433938| to further explo...|[to, further, exp...|    [overexpression]|        [a1, mcl1]|
|27014180| the latter activ...|[the, latter, act...|[trigger, inhibit...|             [bad]|
|27014180| bad promotes cel...|[bad, promote, ce...|[inhibit, death, ...|             [bad]|
|27529753| subsequently, we...|[subsequently,, w...|          [increase]|        [bad, bax]|
|27014180| normally, bcl2 i...|[normally,, bcl2,...|[prevent, interac...|        [bad, bax]|
|27529753| the expression o...|[the, expression,...|          [suppress]|        [bad, bax]|
|27776559| the histograms a...|[the, histograms,...|[increase, pro-ap...|[bad, bcl-xl, bax]|
|27990281| because the bad ...|[because, the, ba...|[bind, anti-apopt.

In [224]:
# explode by bcl2 family genes
from pyspark.sql.functions import explode
bcl2_family_explode = bcl2_family_df.select(bcl2_family_df.id, 
                                            bcl2_family_df.sentences,
                                            bcl2_family_df.lemm_words,
                                            bcl2_family_df.bcl2_regulation,
                                            explode(bcl2_family_df.bcl2_family).alias('bcl2_family'))
bcl2_family_explode.show(5)

+--------+--------------------+--------------------+--------------------+-----------+
|      id|           sentences|          lemm_words|     bcl2_regulation|bcl2_family|
+--------+--------------------+--------------------+--------------------+-----------+
|28369145| albicans or go-p...|[albicans, or, go...|          [increase]|        bax|
|28369145| (f) increased ra...|[(f), increase, r...|[increase, apopto...|        bax|
|28367088| the addition of ...|[the, addition, o...|  [increase, reduce]|        bax|
|28350842| it was demonstra...|[it, be, demonstr...|[anti-apoptotic, ...|        bax|
|28334048| also, there was ...|[also,, there, be...|[pro-apoptotic, i...|        bax|
+--------+--------------------+--------------------+--------------------+-----------+
only showing top 5 rows



In [226]:
# explode by bcl2 regulation vocabulary
bcl2_regulation_explode = bcl2_family_explode.select(bcl2_family_explode.id,
                                                     bcl2_family_explode.sentences,
                                                     bcl2_family_explode.lemm_words,
                                                     explode(bcl2_family_explode.bcl2_regulation).alias('bcl2_regulation'),
                                                     bcl2_family_explode.bcl2_family)
bcl2_regulation_explode.show(5)

+--------+--------------------+--------------------+---------------+-----------+
|      id|           sentences|          lemm_words|bcl2_regulation|bcl2_family|
+--------+--------------------+--------------------+---------------+-----------+
|28369145| albicans or go-p...|[albicans, or, go...|       increase|        bax|
|28369145| (f) increased ra...|[(f), increase, r...|       increase|        bax|
|28369145| (f) increased ra...|[(f), increase, r...|      apoptosis|        bax|
|28369145| (f) increased ra...|[(f), increase, r...|     activation|        bax|
|28367088| the addition of ...|[the, addition, o...|       increase|        bax|
+--------+--------------------+--------------------+---------------+-----------+
only showing top 5 rows



In [229]:
bcl2_regulation_explode.orderBy('bcl2_regulation').show()

+--------+--------------------+--------------------+---------------+-----------+
|      id|           sentences|          lemm_words|bcl2_regulation|bcl2_family|
+--------+--------------------+--------------------+---------------+-----------+
|27990281| in this model, b...|[in, this, model,...|       activate|        bak|
|27990281|in model 1 (left)...|[in, model, 1, (l...|       activate|        bim|
|27990281|in model 1 (left)...|[in, model, 1, (l...|       activate|        bax|
|27990281| the indirect act...|[the, indirect, a...|       activate|        bax|
|27990281| to the extent th...|[to, the, extent,...|       activate|        bax|
|27990281| based on these o...|[base, on, these,...|       activate|        bak|
|27777645| puma binds and a...|[puma, bind, and,...|       activate|       puma|
|27777645| puma binds and a...|[puma, bind, and,...|       activate|        bax|
|27014180| the latter activ...|[the, latter, act...|       activate|        bad|
|27990281| in model 2 (righ.

In [249]:
from pyspark.sql.functions import concat, concat_ws
bcl2_final_df = bcl2_regulation_explode.select(bcl2_regulation_explode.id,  
                                               concat_ws('_', bcl2_regulation_explode.bcl2_regulation, bcl2_regulation_explode.bcl2_family).alias('bcl2_regulates_family'),
                                               bcl2_regulation_explode.sentences)

In [250]:
bcl2_final_df.orderBy('bcl2_regulates_family').show(5)

+--------+---------------------+--------------------+
|      id|bcl2_regulates_family|           sentences|
+--------+---------------------+--------------------+
|27014180|         activate_bad| the latter activ...|
|27990281|         activate_bak| the indirect act...|
|27990281|         activate_bak| to the extent th...|
|27990281|         activate_bak| in this model, b...|
|27990281|         activate_bak| in model 2 (righ...|
+--------+---------------------+--------------------+
only showing top 5 rows



In [253]:
bcl2_final_df.orderBy('bcl2_regulates_family')\
    .coalesce(1)\
    .write.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .save("bcl2_final_results")