In [2]:
# create entry points to spark
try:
    sc.stop()
except:
    pass
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
sc = SparkContext()
spark = SparkSession(sparkContext=sc)

# Load 'bcl2' sentences data

In [4]:
bcl2 = spark.read.csv('raw_data/article-content.txt', sep='\t', inferSchema=True).toDF('id', 'sentences')
bcl2.show()

+--------+--------------------+
|      id|           sentences|
+--------+--------------------+
|28386116| we experimentall...|
|28386116| to evaluate the ...|
|28386116|it has been well ...|
|28386116| our bioinformati...|
|28386116| it has been prev...|
|28386116| to assess to wha...|
|28386116| however, rnafold...|
|28386116|a point mutation ...|
|28386116| wild-type constr...|
|28386116| (b) cd spectra o...|
|28386116| (c) cd melting c...|
|28386116| cd spectra of wi...|
|28386116| cd melting at 26...|
|28386116|luciferase report...|
|28386116| three constructs...|
|28386116| 3d), which suppo...|
|28386116| compared to the ...|
|28386116|although in silic...|
|28386116| these observatio...|
|28386116|further character...|
+--------+--------------------+
only showing top 20 rows



# Split sentences into words and lemmatizing words

We need to define a `udf` function to deal with each row of the bcl2 dataframe.

In [5]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [12]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(s):
    words_list = [wordnet_lemmatizer.lemmatize(w, 'v') for w in s.split()]
    return(words_list)
lemmatizer_udf = udf(lemmatizer, ArrayType(StringType()))

In [8]:
bcl2_lemm = bcl2.select(bcl2.id, bcl2.sentences, lemmatizer_udf(bcl2.sentences).alias('lemm_words'))
bcl2_lemm.show()

+--------+--------------------+--------------------+
|      id|           sentences|          lemm_words|
+--------+--------------------+--------------------+
|28386116| we experimentall...|[we, experimental...|
|28386116| to evaluate the ...|[to, evaluate, th...|
|28386116|it has been well ...|[it, have, be, we...|
|28386116| our bioinformati...|[our, bioinformat...|
|28386116| it has been prev...|[it, have, be, pr...|
|28386116| to assess to wha...|[to, assess, to, ...|
|28386116| however, rnafold...|[however,, rnafol...|
|28386116|a point mutation ...|[a, point, mutati...|
|28386116| wild-type constr...|[wild-type, const...|
|28386116| (b) cd spectra o...|[(b), cd, spectra...|
|28386116| (c) cd melting c...|[(c), cd, melt, c...|
|28386116| cd spectra of wi...|[cd, spectra, of,...|
|28386116| cd melting at 26...|[cd, melt, at, 26...|
|28386116|luciferase report...|[luciferase, repo...|
|28386116| three constructs...|[three, construct...|
|28386116| 3d), which suppo...|[3d),, which, s

# bcl2 and gene regulation words

* **We collect bcl2 regulation words and lemmatize all the words**

In [10]:
import pandas as pd

In [17]:
gr_df = pd.read_csv('raw_data/bcl2_regulation.csv', names=['word'])
gr_df.head()

Unnamed: 0,word
0,activate
1,activation
2,activator
3,alter
4,antagonizing


Lemmatize regulation words

In [19]:
gr_lemm_words = [wordnet_lemmatizer.lemmatize(w.lower(), 'v') for w in gr_df['word'] ]
gr_lemm_words[:10]

['activate',
 'activation',
 'activator',
 'alter',
 'antagonize',
 'anti-apoptotic',
 'anti-death',
 'antitumor',
 'apoptosis',
 'apoptosis']

* **Filter sentences that has 'bcl2' and any gene regulation words**

In [None]:
from pyspark.sql.functions import udf

Define a `udf` function to to check if a row has 'bcl2'

In [21]:
def filter_bcl2_regulation(l):
    set0 = set(gr_lemm_words)
    set1 = set(l)
    common_words = list(set0.intersection(set1))
    if len(common_words) > 0:
        return(common_words)
    else:
        return(None)
filter_bcl2_regulation_udf = udf(filter_bcl2_regulation, ArrayType(StringType()))

In [22]:
bcl2_regulation_df = bcl2_lemm.select(bcl2_lemm.id, bcl2_lemm.sentences, bcl2_lemm.lemm_words, filter_bcl2_regulation_udf(bcl2_lemm.lemm_words).alias('bcl2_regulation'))

In [23]:
bcl2_regulation_df = bcl2_regulation_df.filter(bcl2_regulation_df.bcl2_regulation.isNotNull() )

In [24]:
bcl2_regulation_df.show(5)

+--------+--------------------+--------------------+----------------+
|      id|           sentences|          lemm_words| bcl2_regulation|
+--------+--------------------+--------------------+----------------+
|28386116|it has been well ...|[it, have, be, we...|[proto-oncogene]|
|28386116| compared to the ...|[compare, to, the...|      [increase]|
|28386116|bcl2 is a human p...|[bcl2, be, a, hum...|[proto-oncogene]|
|28386116| many examples ex...|[many, examples, ...|       [elevate]|
|28386116| several mechanis...|[several, mechani...|[overexpression]|
+--------+--------------------+--------------------+----------------+
only showing top 5 rows



# bcl2, gene regulation words and bcl2 family genes

* **Collect all bcl2 family genes and lemmatize all the words**

In [27]:
# import collected family genes
bcl2_family = pd.read_csv('raw_data/bcl2_family_genes.csv', names=['genes'])
# lemmatize family genes
bcl2_family_lemm = [wordnet_lemmatizer.lemmatize(w.lower(), 'v') for w in bcl2_family['genes'] ]
bcl2_family_lemm

['a1',
 'bad',
 'bak',
 'bax',
 'bcl-2a1',
 'bcl-b',
 'bcl-w',
 'bcl-xl',
 'bcl-xs',
 'bfl-1',
 'bid',
 'bik',
 'bim',
 'bmf',
 'bok',
 'ced-9',
 'diva',
 'egl-1',
 'hrk',
 'mcl-1',
 'mcl1',
 'noxa',
 'puma']

* **Filter sentences that has bcl2 and bcl2 family genes from the previous filtering step. The final sentences should contain bcl2, at least one gene regulation word and at least one bcl2 family gene.**

In [28]:
def filter_bcl2_family(l):
    set0 = set(bcl2_family_lemm)
    set1 = set(l)
    common_words = list(set0.intersection(set1))
    if len(common_words) > 0:
        return(common_words)
    else:
        return(None)
filter_bcl2_family_udf = udf(filter_bcl2_family, ArrayType(StringType()))

In [29]:
# bcl2_family_df = bcl2_regulation_df.select(bcl2_regulation_df.id, bcl2_regulation_df.sentences, bcl2_regulation_df.bcl2_regulation, filter_bcl2_family_udf(bcl2_regulation_df.sentences).alias('bcl2_family'))
bcl2_family_df = bcl2_regulation_df.select(bcl2_regulation_df.id, 
                                           bcl2_regulation_df.sentences, 
                                           bcl2_regulation_df.lemm_words, 
                                           bcl2_regulation_df.bcl2_regulation, 
                                           filter_bcl2_family_udf(bcl2_regulation_df.lemm_words).alias('bcl2_family'))

In [30]:
bcl2_family_df = bcl2_family_df.filter(bcl2_family_df.bcl2_family.isNotNull())

In [31]:
bcl2_family_df.show(5)

+--------+--------------------+--------------------+--------------------+-----------+
|      id|           sentences|          lemm_words|     bcl2_regulation|bcl2_family|
+--------+--------------------+--------------------+--------------------+-----------+
|28369145| albicans or go-p...|[albicans, or, go...|          [increase]|      [bax]|
|28369145| (f) increased ra...|[(f), increase, r...|[activation, apop...|      [bax]|
|28367088| the addition of ...|[the, addition, o...|  [reduce, increase]|      [bax]|
|28350842| it was demonstra...|[it, be, demonstr...|[promote, anti-ap...|      [bax]|
|28334048| also, there was ...|[also,, there, be...|[anti-apoptotic, ...|      [bax]|
+--------+--------------------+--------------------+--------------------+-----------+
only showing top 5 rows



# Explode the data frame

* **At this step, we explode the data frame so that each row should on single gene regulation word and one single bcl2 family gene.**

## Explode by bcl2 family genes

Each row may have more than one family genes. We want to spread them so that each row has only one familty genes.

In [36]:
from pyspark.sql.functions import explode
bcl2_family_explode = bcl2_family_df.select(bcl2_family_df.id, 
                                            bcl2_family_df.sentences,
                                            bcl2_family_df.lemm_words,
                                            bcl2_family_df.bcl2_regulation,
                                            explode(bcl2_family_df.bcl2_family).alias('bcl2_family'))
bcl2_family_explode.show(5)

+--------+--------------------+--------------------+--------------------+-----------+
|      id|           sentences|          lemm_words|     bcl2_regulation|bcl2_family|
+--------+--------------------+--------------------+--------------------+-----------+
|28369145| albicans or go-p...|[albicans, or, go...|          [increase]|        bax|
|28369145| (f) increased ra...|[(f), increase, r...|[activation, apop...|        bax|
|28367088| the addition of ...|[the, addition, o...|  [reduce, increase]|        bax|
|28350842| it was demonstra...|[it, be, demonstr...|[promote, anti-ap...|        bax|
|28334048| also, there was ...|[also,, there, be...|[anti-apoptotic, ...|        bax|
+--------+--------------------+--------------------+--------------------+-----------+
only showing top 5 rows



## Explode by bcl2 regulation vocabulary

In [37]:
bcl2_regulation_explode = bcl2_family_explode.select(bcl2_family_explode.id,
                                                     bcl2_family_explode.sentences,
                                                     bcl2_family_explode.lemm_words,
                                                     explode(bcl2_family_explode.bcl2_regulation).alias('bcl2_regulation'),
                                                     bcl2_family_explode.bcl2_family)
bcl2_regulation_explode.show(5)

+--------+--------------------+--------------------+---------------+-----------+
|      id|           sentences|          lemm_words|bcl2_regulation|bcl2_family|
+--------+--------------------+--------------------+---------------+-----------+
|28369145| albicans or go-p...|[albicans, or, go...|       increase|        bax|
|28369145| (f) increased ra...|[(f), increase, r...|     activation|        bax|
|28369145| (f) increased ra...|[(f), increase, r...|      apoptosis|        bax|
|28369145| (f) increased ra...|[(f), increase, r...|       increase|        bax|
|28367088| the addition of ...|[the, addition, o...|         reduce|        bax|
+--------+--------------------+--------------------+---------------+-----------+
only showing top 5 rows



# Concatenate regulation and gene columns

In [38]:
from pyspark.sql.functions import concat, concat_ws
bcl2_final_df = bcl2_regulation_explode.select(bcl2_regulation_explode.id,  
                                               concat_ws('_', bcl2_regulation_explode.bcl2_regulation, bcl2_regulation_explode.bcl2_family).alias('bcl2_regulates_family'),
                                               bcl2_regulation_explode.sentences)

In [41]:
bcl2_final_df.show(n=)

+--------+---------------------+--------------------+
|      id|bcl2_regulates_family|           sentences|
+--------+---------------------+--------------------+
|28369145|         increase_bax| albicans or go-p...|
|28369145|       activation_bax| (f) increased ra...|
|28369145|        apoptosis_bax| (f) increased ra...|
|28369145|         increase_bax| (f) increased ra...|
|28367088|           reduce_bax| the addition of ...|
|28367088|         increase_bax| the addition of ...|
|28350842|          promote_bax| it was demonstra...|
|28350842|   anti-apoptotic_bax| it was demonstra...|
|28350842|          inhibit_bax| it was demonstra...|
|28350842|     upregulation_bax| it was demonstra...|
|28334048|   anti-apoptotic_bax| also, there was ...|
|28334048|    pro-apoptotic_bax| also, there was ...|
|28334048|         increase_bax| also, there was ...|
|28157696|            alter_bax|tramp cells expos...|
|28105423|           reduce_bax|seminal mirna-122...|
|28105423|         increase_

# Save results into csv file

In [None]:
bcl2_final_df.orderBy('bcl2_regulates_family')\
    .coalesce(1)\
    .write.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .save("bcl2_final_results")