# Load 'bcl2' sentences data

In [1]:
# Read the tab-separated article file and name its two columns.
bcl2 = (
    spark.read
    .csv('raw_data/article-content.txt', sep='\t', inferSchema=True)
    .toDF('id', 'sentences')
)

# Split sentences into words and lemmatize the words

In [2]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [3]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(s):
    """Split a sentence on whitespace and verb-lemmatize each lower-cased token.

    Parameters
    ----------
    s : str or None
        Raw sentence text. A null cell in the DataFrame arrives here as ``None``.

    Returns
    -------
    list of str
        One lemma per whitespace-separated token, in sentence order. A null
        sentence yields an empty list so later set operations on the result
        still work.
    """
    if s is None:
        return []
    # Lower-case before lemmatizing: the word lists these tokens are compared
    # against are lower-cased the same way, so a capitalised token would
    # otherwise never match.
    return [wordnet_lemmatizer.lemmatize(w.lower(), 'v') for w in s.split()]

lemmatizer_udf = udf(lemmatizer, ArrayType(StringType()))

In [4]:
# Keep the original columns and add the lemmatized token list of each sentence.
bcl2_lemm = bcl2.select(
    bcl2['id'],
    bcl2['sentences'],
    lemmatizer_udf(bcl2['sentences']).alias('lemm_words'),
)

# bcl2 and gene regulation words

* **We collect bcl2 regulation words and lemmatize all the words**

In [5]:
import pandas as pd

  from pkg_resources import resource_stream


In [None]:
# Read the regulation vocabulary as a headerless file with one column named 'word'.
gr_df = pd.read_csv(
    'raw_data/bcl2_regulation.csv',
    names=['word'],
)

In [None]:
# Lower-case each regulation word, then reduce it to its verb lemma.
gr_lemm_words = [
    wordnet_lemmatizer.lemmatize(regulation_word.lower(), 'v')
    for regulation_word in gr_df['word']
]

* **Filter sentences that have 'bcl2' and any gene regulation words**

In [None]:
from pyspark.sql.functions import udf

In [None]:
def filter_bcl2_regulation(l):
    """Return the gene-regulation words that occur in a lemmatized sentence.

    Parameters
    ----------
    l : list of str or None
        Lemmatized tokens of one sentence.

    Returns
    -------
    list of str or None
        The distinct tokens that also appear in ``gr_lemm_words``, sorted
        alphabetically. ``None`` when there is no match (or no tokens), so the
        row can be dropped with an ``isNotNull()`` filter.
    """
    if not l:
        # Null or empty token list: nothing can match.
        return None
    # ``intersection`` accepts any iterable, so the vocabulary list does not
    # need to be turned into a set on every call. Sorting gives a stable
    # order instead of relying on set iteration order.
    matches = sorted(set(l).intersection(gr_lemm_words))
    return matches or None

filter_bcl2_regulation_udf = udf(filter_bcl2_regulation, ArrayType(StringType()))

In [None]:
# Add a column holding the regulation words found in each sentence (null when none).
bcl2_regulation_df = bcl2_lemm.select(
    bcl2_lemm['id'],
    bcl2_lemm['sentences'],
    bcl2_lemm['lemm_words'],
    filter_bcl2_regulation_udf(bcl2_lemm['lemm_words']).alias('bcl2_regulation'),
)

In [None]:
# Keep only the sentences in which at least one regulation word was found.
bcl2_regulation_df = bcl2_regulation_df.where(
    bcl2_regulation_df['bcl2_regulation'].isNotNull()
)

In [None]:
# Preview the first five matching rows.
bcl2_regulation_df.show(n=5)

# bcl2, gene regulation words and bcl2 family genes

* **Collect all bcl2 family genes and lemmatize all the words**

In [None]:
# Read the family gene list as a headerless file with one column named 'genes'.
bcl2_family = pd.read_csv(
    'raw_data/bcl2_family_genes.csv',
    names=['genes'],
)
# Lower-case each gene name, then pass it through the verb lemmatizer.
bcl2_family_lemm = [
    wordnet_lemmatizer.lemmatize(gene.lower(), 'v')
    for gene in bcl2_family['genes']
]

* **Filter sentences that have bcl2 and bcl2 family genes from the previous filtering step. The final sentences should contain bcl2, at least one gene regulation word and at least one bcl2 family gene.**

In [None]:
def filter_bcl2_family(l):
    """Return the bcl2 family genes that occur in a lemmatized sentence.

    Parameters
    ----------
    l : list of str or None
        Lemmatized tokens of one sentence.

    Returns
    -------
    list of str or None
        The distinct tokens that also appear in ``bcl2_family_lemm``, sorted
        alphabetically. ``None`` when there is no match (or no tokens), so the
        row can be dropped with an ``isNotNull()`` filter.
    """
    if not l:
        # Null or empty token list: nothing can match.
        return None
    # ``intersection`` accepts any iterable, so the gene list does not need to
    # be turned into a set on every call. Sorting gives a stable order
    # instead of relying on set iteration order.
    matches = sorted(set(l).intersection(bcl2_family_lemm))
    return matches or None

filter_bcl2_family_udf = udf(filter_bcl2_family, ArrayType(StringType()))

In [None]:
# The family-gene UDF is applied to the token list ('lemm_words'), not to the
# raw sentence string, because it matches on whole tokens.
bcl2_family_df = bcl2_regulation_df.select(
    bcl2_regulation_df['id'],
    bcl2_regulation_df['sentences'],
    bcl2_regulation_df['lemm_words'],
    bcl2_regulation_df['bcl2_regulation'],
    filter_bcl2_family_udf(bcl2_regulation_df['lemm_words']).alias('bcl2_family'),
)

In [None]:
# Keep only the sentences in which at least one family gene was found.
bcl2_family_df = bcl2_family_df.where(
    bcl2_family_df['bcl2_family'].isNotNull()
)

In [None]:
# Preview the first five rows that matched both word lists.
bcl2_family_df.show(n=5)

# Explode the data frame

* **At this step, we explode the data frame so that each row contains a single gene regulation word and a single bcl2 family gene.**

## explode by bcl2 family genes

In [None]:
from pyspark.sql.functions import explode

# One output row per element of the 'bcl2_family' array; other columns repeat.
bcl2_family_explode = bcl2_family_df.select(
    bcl2_family_df['id'],
    bcl2_family_df['sentences'],
    bcl2_family_df['lemm_words'],
    bcl2_family_df['bcl2_regulation'],
    explode(bcl2_family_df['bcl2_family']).alias('bcl2_family'),
)
bcl2_family_explode.show(n=5)

## explode by bcl2 regulation vocabulary

In [None]:
# One output row per element of the 'bcl2_regulation' array; other columns repeat.
bcl2_regulation_explode = bcl2_family_explode.select(
    bcl2_family_explode['id'],
    bcl2_family_explode['sentences'],
    bcl2_family_explode['lemm_words'],
    explode(bcl2_family_explode['bcl2_regulation']).alias('bcl2_regulation'),
    bcl2_family_explode['bcl2_family'],
)
bcl2_regulation_explode.show(n=5)

# Concatenate regulation and gene columns

In [None]:
from pyspark.sql.functions import concat, concat_ws

# Join the regulation word and the family gene with '_' into a single label
# column, and keep the id and the original sentence alongside it.
bcl2_final_df = bcl2_regulation_explode.select(
    bcl2_regulation_explode['id'],
    concat_ws(
        '_',
        bcl2_regulation_explode['bcl2_regulation'],
        bcl2_regulation_explode['bcl2_family'],
    ).alias('bcl2_regulates_family'),
    bcl2_regulation_explode['sentences'],
)

# Save results into csv file

In [None]:
# Sort by the combined label, collapse to a single partition so one part file
# is written, and save with a header row using Spark's built-in CSV writer
# (the same built-in CSV support the notebook already uses for reading).
# NOTE: with the default save mode this raises if "bcl2_final_results" already
# exists, so remove the directory before re-running the notebook.
(
    bcl2_final_df
    .orderBy('bcl2_regulates_family')
    .coalesce(1)
    .write
    .option("header", "true")
    .csv("bcl2_final_results")
)