# Cancer Blog Decision Support

In [1]:
import csv
import nltk
import re

In [2]:
stopwords = nltk.corpus.stopwords.words('english')
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [3]:
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent) if word not in stopwords]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)

    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [4]:
doc_list = []
with open('text.csv', 'r',encoding = 'latin-1') as f:
    reader = csv.reader(f)
    for i in reader:
        doc_list.append(i[0])

print(tokenize_and_stem(doc_list[0]))
doc_list[0]
doc_list

['are', 'american', 'afraid', 'cancer', 'actual', 'fear', 'lead', 'lifestyl', 'choic', 'thing', 'end', 'one', 'get', 'cancer', 'that', "'s", 'conclus', 'recent', 'studi', 'said', 'mani', 'american', "n't", 'believ', 'anyth', 'protect', 'cancer', 'henc', 'creation', 'self-fulfil', 'prophecy.in', 'word', 'believ', 'someth', 'even', 'without', 'proof', 'like', 'somehow', 'find', 'way', 'make', 'happen', 'the', 'research', 'look', 'peopl', 'percent', 'conclud', 'near', 'everyth', 'caus', 'cancer', 'while', "'s", 'even', 'close', 'true', 'close', 'that', "'s", 'imposs', 'answer', 'i', 'think.th', 'best', 'live', 'healthi', 'possibl', 'within', 'mean', "n't", 'engag', 'lifestyl', 'choic', 'may', 'futur', 'negat', 'consequ', 'good', 'start', 'not', 'smoke', 'eat', 'plenti', 'fruit', 'veget', 'comment', 'american', 'train', 'virtual', 'birth', 'make', 'feel', 'better', 'consum', 'thing', 'the', 'worri', 'indulge.regard', 'richard', 'day', 'gore']


['Are Americans so afraid of cancer that the actual fear leads to lifestyle choices and other things that end up in one getting cancer? That\'s the conclusions from a recent study that said so many Americans don\'t believe they can do anything to protect themselves from cancer. Hence, the creation of a self-fulfilling prophecy.In other words, the more you believe something (even without proof), the more likely you are to somehow find a way to make it happen. The research here looked at 6,000 people; and 47 percent of them concluded that "nearly everything causes cancer." While that\'s not even close to being true, just how close it is? That\'s impossible to answer I think.The best you can do is to live as healthy as possible (within your means) and don\'t engage in lifestyle choices that may have future negative consequences. Good starts? Not smoking and eating plenty of fruits and vegetables. Comments: 1. Americans are trained virtually from birth to make themselves feel better by con

In [5]:
from gensim.models import doc2vec
from collections import namedtuple



In [8]:
docs = []
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
for i, text in enumerate(doc_list):
    words = tokenize_and_stem(text)
    tags = [i]
    docs.append(analyzedDocument(words, tags))

In [9]:
print(docs[1])

AnalyzedDocument(words=['a', 'new', 'studi', 'find', 'chronic', 'gum', 'diseas', 'could', 'lead', 'elev', 'futur', 'tongu', 'cancer', 'risk', 'men', 'so', 'instead', 'brush', 'teeth', 'sure', 'massag', 'clean', 'gum', 'oh', 'i', 'forget', 'mention', 'floss', 'periodont', 'aka', 'gum', 'diseas', 'look', 'studi', 'insofar', 'studi', 'amount', 'bone', 'loss', 'tooth', 'caviti', 'conclus', 'men', 'tongu', 'cancer', 'signific', 'bone', 'loss', 'tooth', 'caviti', 'without', 'tongu', 'cancer.aft', 'result', 'studi', 'research', 'believ', 'certain', 'periodont', 'bacteria', 'may', 'toxic', 'lead', 'tongu', 'oral', 'cancer', 'moral', 'stori', 'everyon', 'make', 'sure', 'treat', 'mouth', 'good', 'clean', 'time', 'day', 'brush', 'floss', 'comment'], tags=[1])


In [10]:
from gensim.models import doc2vec
import random

alpha_val = 0.025        # Initial learning rate
min_alpha_val = 1e-4     # Minimum for linear learning rate decay

model = doc2vec.Doc2Vec( vector_size=300, window=10, min_count=5, workers=11,alpha=0.025, min_alpha=0.025)

model.build_vocab(docs) # Building vocabulary


model.train(docs, total_examples=model.corpus_count, epochs=model.epochs, start_alpha = alpha_val, end_alpha = min_alpha_val)
print(model.epochs)
    # Logs


5


In [12]:
def search(text,doc_num):
    tokens = tokenize_and_stem(text)
    new_vector = model.infer_vector(tokens)
    sims = model.docvecs.most_similar([new_vector], topn = doc_num)
    print("Top ", doc_num, "most relevant blogs are: ")
    for i in sims:
        print(doc_list[i[0]])
        print('\n')
        
    
search("breast cancer cure", 2)

Top  2 most relevant blogs are: 
Right now -- at this very moment -- my two boys have turned our living room into a mess of blankets and pillows and stuffed animals. They put on their jammies and closed all the blinds and are pretending it's bedtime. But it's actually lunch time, so they have spread out paper plates and plastic silverware and bags of chips and boxes of crackers all over the floor -- on top of all their bedding. I delivered them their lunch platters and lemonade and there they sit, in the room next to me -- chattering away, stuffing their little mouths, full of life. And I am in awe -- of the simple joy that comes from a living room camp-out and picnic, of the beauty these children bring into my life. I am mostly in awe of the fact that no matter what cancer takes from me -- my hair, moments of health, my innocence -- it cannot ever take this very moment from me. And that makes today a happy day. Comments: 


The Lung Cancer Alliance -- the only national non-profit orga