In [84]:
import os
import pandas as pd
import csv
import re
import sys
import math
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [3]:
## Retrieve current working directory (`cwd`)
cwd = os.getcwd()
cwd
## Change directory 
current_file=os.chdir("C:\\treckchallenge")

# Reading the Data

In [28]:
# Load the data from excel in csv format
data_with_features= pd.read_csv("C:\\treckchallenge\\20180622processedGoldStandardXMLTXT.tsv", sep="\t", encoding="utf-8", usecols=["pm_rel_desc", "title", "abstract", "trec_topic_disease"])
data_with_features.shape

(22642, 4)

# Getting Stop Words

In [62]:
#download stopwords
nltk.download('stopwords')
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Arpita.Kappattanavar\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package stopwords is already up-to-date!


# Creating PM dataset

In [29]:
pmSet = data_with_features[data_with_features['pm_rel_desc'].str.contains('Human PM|Animal PM', regex=True)]
pmSet.shape

(9274, 4)

# Something

In [93]:
nltk.download('punkt')

# Convert the data to lower case 
PM_abstract_title = pmSet['title'].str.lower() + ' ' + pmSet['abstract'].str.lower()


# Removed stop words and punctuations
PM_no_stopWords = []
for text in PM_abstract_title:
    words = word_tokenize(text)
    for word in words:
        if word not in stopwords.words('english') and word not in string.punctuation:
            PM_no_stopWords.append(word)

PM_no_stopWords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Arpita.Kappattanavar\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package punkt is already up-to-date!


['case',
 'metastatic',
 'liposarcoma',
 'originating',
 'retroperitoneum',
 'successfully',
 'treated',
 'combination',
 'chemotherapy',
 'reported',
 '36-year-old',
 'woman',
 'metastatic',
 'liposarcoma',
 'originating',
 'retroperitoneum',
 'responded',
 'well',
 'adjuvant',
 'chemotherapy',
 'primary',
 'tumor',
 'removed',
 'surgery',
 'two',
 'months',
 'later',
 'patient',
 'developed',
 'metastasis',
 'brain',
 'lung',
 'four',
 'months',
 'later',
 'metastatic',
 'liposarcomas',
 'brain',
 'generally',
 'extremely',
 'rare',
 'patient',
 'treated',
 'combination',
 'chemotherapy',
 'using',
 'cyclophosphamide',
 'vincristine',
 'adriamycin',
 'dacarbazine',
 'cyvadic',
 'examined',
 'former',
 'two',
 'drugs',
 'alternated',
 'vindesine',
 'ifosfamide',
 'another',
 'regimen',
 'cisplatin',
 'etoposide',
 'given',
 'three-week',
 'interval',
 'result',
 'metastases',
 'totally',
 'disappeared',
 'recurrent',
 'lesion',
 'noted',
 'two',
 'years',
 'although',
 'role',
 'chemo

In [288]:
# filter the stopwords from title
text3 = text2
text3 = ' '.join([word for word in text3.split() if word not in (stopwords.words('english'))])

In [289]:
# filter the stopwords from abstract
text4 = text1
text4 = ' '.join([word for word in text4.split() if word not in (stopwords.words('english'))])
print(text4)

0 Cancer pancreas. Palliative operation, Whipple procedure, total pancreatectomy? 1 Rauwolfia derivatives breast cancer hypertensive women. 2 Repeated light- electron microscopic studies small-bowel mucosa Whipple's disease. 3 alpha-Lactalbumin human subhuman primate normal mammary tissue human breast cancer marker prolactin activity. 4 Dynamics neoplastic development carcinogen-exposed tracheal mucosa. 5 [Some epidemiological factors incidence female genital cancer Azerbaijan SSR]. 6 Role hypertension ischemic heart disease cerebral vascular disease cynomolgus monkey coarctation aorta. 7 [Pancreatic diseases]. 8 Infant human pancreas. A potential source islet tissue transplantation. 9 The many faces islet cell tumors. 10 Analysis human tumors human malignant cell lines BK virus-specific DNA sequences. 11 [Insulinoma]. 12 [Nesidioblastoma]. 13 Aetiology breast cancer: brief review. 14 Adjuvant systemic therapy lung cancer. 15 Twelve year old caucasian male asymptomatic hypertension. 16

In [290]:
#combine abstract and title
text=[text4,text3]
print(text)

["0 Cancer pancreas. Palliative operation, Whipple procedure, total pancreatectomy? 1 Rauwolfia derivatives breast cancer hypertensive women. 2 Repeated light- electron microscopic studies small-bowel mucosa Whipple's disease. 3 alpha-Lactalbumin human subhuman primate normal mammary tissue human breast cancer marker prolactin activity. 4 Dynamics neoplastic development carcinogen-exposed tracheal mucosa. 5 [Some epidemiological factors incidence female genital cancer Azerbaijan SSR]. 6 Role hypertension ischemic heart disease cerebral vascular disease cynomolgus monkey coarctation aorta. 7 [Pancreatic diseases]. 8 Infant human pancreas. A potential source islet tissue transplantation. 9 The many faces islet cell tumors. 10 Analysis human tumors human malignant cell lines BK virus-specific DNA sequences. 11 [Insulinoma]. 12 [Nesidioblastoma]. 13 Aetiology breast cancer: brief review. 14 Adjuvant systemic therapy lung cancer. 15 Twelve year old caucasian male asymptomatic hypertension. 

In [291]:
#Cell to find top_words

def tokenize(text):
    tokens = word_tokenize(text)
    stems = []
    for item in tokens: stems.append(PorterStemmer().stem(item))
    return stems


# word tokenize and stem
text = [" ".join(tokenize(txt.lower())) for txt in text]
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(text).todense()
# transform the matrix to a pandas df
matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names())
# sum over each title and abstract (axis=0)
top_words = matrix.sum(axis=0).sort_values(ascending=False)
print(top_words)

patient              0.640964
mutat                0.639770
cancer               0.602994
lung                 0.475828
egfr                 0.394871
cell                 0.393319
kra                  0.285974
tumor                0.269258
the                  0.231884
alk                  0.208361
braf                 0.199048
diseas               0.195824
pt                   0.194510
studi                0.188898
carcinoma            0.178391
amplif               0.163704
fgfr1                0.156778
breast               0.155226
adenocarcinoma       0.148301
human                0.146271
therapi              0.133256
melanoma             0.132062
non                  0.123942
result               0.120599
pik3ca               0.120241
treatment            0.117375
nsclc                0.117017
tki                  0.110091
inhibitor            0.108897
respons              0.105558
                       ...   
dramat               0.002853
drive                0.002853
emphas    

# Creating not PM dataset

In [30]:
notPmSet = data_with_features[data_with_features['pm_rel_desc'].str.contains('Not PM', regex=True)]
notPmSet.shape

(13368, 4)