In [1]:
# Naive Bayes
import pandas as pd, os, sklearn, numpy as np, nltk, re
from sklearn import datasets
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.metrics import auc

In [2]:
# Set the working directory to the folder where your data is stored
os.chdir("P:\\7670\Common\\2. Ad hoc requests (Task 2F )\Star Ratings\Cross-site\Marketing scan\August 2016\\99_Idea Lab\Work\Week 11\Training Datasets")
# See what files are in the folder.
os.listdir()

['archive',
 'Naive Bayes.ipynb',
 'all_training.xlsx',
 'test.xlsx',
 'population_dataset_11.7.16.xlsx',
 'train.xlsx',
 'validate.xlsx']

In [3]:
#load training data
train=pd.read_excel('train.xlsx', header=0, \
                    delimiter="\t", quoting=3)
train[:5]

Unnamed: 0,id,Provider Type,root url,scraped url,Visible Text,page_MM
0,82,HC,dunessurgicalhospital.com,http://dunessurgicalhospital.com/error/mainten...,Skip to main content Home Careers About Us Eve...,0
1,146,HHC,porchlighthomecare.org,http://www.porchlighthomecare.org/the-gr8t-a-s...,Join our Facebook Group Follow us on Twitter O...,0
2,167,NHC,libertyhealthcareandrehab.com,http://www.libertyhealthcareandrehab.com/capit...,Call us at 1-919-231-6045 end #header Home Dir...,1
3,192,NHC,troynursinghome.com,http://www.troynursinghome.com/,"Troy, AL Nursing Home Troy Health And Rehabili...",0
4,117,HHC,interimhealthcare.com,http://www.interimhealthcare.com/eriepa/about-us,"2206 West 15th St., Erie, Pennsylvania 16505 G...",1


In [4]:
#set up model and outcome variable
model=GaussianNB()
outcome=np.array(train['page_MM'])

In [5]:
def MM_to_words( raw_MM ):
    # Function to convert a raw CMS MM to a string of words
    # The input is a single string (a raw MM text), and 
    # the output is a single string (a preprocessed MM text)
    #
    # 1. Remove HTML
    MM_text = BeautifulSoup(raw_MM,"lxml").get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", MM_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 6. Join the words back into one string separated by space, 
    #    to make the output easier to use in our Bag of Words,
    #    then return the result. 
    return( " ".join( words ))

In [6]:
# Now let's loop through and clean all of the training set at once

# Get the number of MMs based on the dataframe column size
num_MM = train["Visible Text"].size

# Initialize an empty list to hold the clean MMs
clean_train_MM = []

# Loop over each MM; create an index i that goes from 0 to the length
# of theMM list 
for i in range( 0, num_MM ):
    # Call our function for each one, and add the result to the list of
    # clean MMs
    clean_train_MM.append( MM_to_words( train["Visible Text"][i] ) )

# This next bit is optional, and it will give you status updates for every 1,000 rows.
print ("Cleaning and parsing the training set CMS MMs...\n")
clean_train_MM = []
for i in range( 0, num_MM ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print ("MM %d of %d\n" % ( i+1, num_MM ))                                                                    
    clean_train_MM.append( MM_to_words( train["Visible Text"][i] ))

Cleaning and parsing the training set CMS MMs...



In [7]:
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,
                             ngram_range=(1, 3),
                             max_features = 10000) # This limits to the 10,000 most frequent words
train_data_features = vectorizer.fit_transform(clean_train_MM)
trainpredictors=train_data_features.toarray()

In [8]:
#run model
model.fit(trainpredictors, outcome)

GaussianNB()

In [9]:
expected=outcome
predicted_train=model2.predict(trainpredictors)

In [10]:
print(metrics.classification_report(expected, predicted_train))
print(metrics.confusion_matrix(expected, predicted_train))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        64
          1       1.00      1.00      1.00        63

avg / total       1.00      1.00      1.00       127

[[64  0]
 [ 0 63]]


In [11]:
#load test data
test=pd.read_excel('test.xlsx', header=0, \
                    delimiter="\t", quoting=3)
test[:5]

Unnamed: 0,id,Provider Type,root url,scraped url,Visible Text,page_MM
0,135,HHC,celestialhhc.com,http://www.celestialhhc.com/home-health-care-c...,wrapper wrapper-canvas banner nav Home About U...,0
1,22,HC,northwestspecialtyhospital.com,http://www.northwestspecialtyhospital.com/abou...,Our specialty is YOU. Menu About Mission Stat...,1
2,131,HHC,healthsouthwichitafalls.com,http://www.healthsouthwichitafalls.com/en/our-...,Share This Print English Espanol Decrease Font...,0
3,12,DFC,pskc.net,http://www.pskc.net/about-us/,DONATE FOR PATIENTS Find a Kidney Doctor About...,0
4,20,HC,hancockregionalhospital.org,http://www.hancockregionalhospital.org/about-u...,About Us | Careers | Wellness & Education ...,1


In [12]:
# Now let's loop through and clean all of the training set at once

# Get the number of MMs based on the dataframe column size
num_MM = test["Visible Text"].size

# Initialize an empty list to hold the clean MMs
clean_test_MM = []

# Loop over each MM; create an index i that goes from 0 to the length
# of theMM list 
for i in range( 0, num_MM ):
    # Call our function for each one, and add the result to the list of
    # clean MMs
    clean_test_MM.append( MM_to_words( test["Visible Text"][i] ) )

# This next bit is optional, and it will give you status updates for every 1,000 rows.
print ("Cleaning and parsing the training set CMS MMs...\n")
clean_test_MM = []
for i in range( 0, num_MM ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print ("MM %d of %d\n" % ( i+1, num_MM ))                                                                    
    clean_test_MM.append( MM_to_words( test["Visible Text"][i] ))

Cleaning and parsing the training set CMS MMs...



In [13]:
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,
                             ngram_range=(1, 3),
                             max_features = 10000) # This limits to the 10,000 most frequent words
test_data_features = vectorizer.fit_transform(clean_test_MM)
testpredictors=test_data_features.toarray()

In [14]:
outcometest=np.array(test['page_MM'])
expectedtest=outcometest
predicted_test=model2.predict(testpredictors)
print(metrics.classification_report(expectedtest, predicted_test))
print(metrics.confusion_matrix(expectedtest, predicted_test))

             precision    recall  f1-score   support

          0       0.30      0.15      0.20        20
          1       0.45      0.67      0.54        21

avg / total       0.38      0.41      0.37        41

[[ 3 17]
 [ 7 14]]


In [15]:
cm=ConfusionMatrix(expectedtest, predicted_test)

In [17]:
cm.print_stats()

population: 41
P: 21
N: 20
PositiveTest: 31
NegativeTest: 10
TP: 14
TN: 3
FP: 17
FN: 7
TPR: 0.666666666667
TNR: 0.15
PPV: 0.451612903226
NPV: 0.3
FPR: 0.85
FDR: 0.548387096774
FNR: 0.333333333333
ACC: 0.414634146341
F1_score: 0.538461538462
MCC: -0.213395488257
informedness: -0.183333333333
markedness: -0.248387096774
prevalence: 0.512195121951
LRP: 0.78431372549
LRN: 2.22222222222
DOR: 0.352941176471
FOR: 0.7
