In [2]:
import os, pandas as pd, bs4, re, nltk, sklearn, numpy as np, xgboost
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.metrics import auc

In [3]:
# Set the working directory to the folder where your data is stored.
# A raw string keeps every Windows backslash literal; the previous literal
# depended on invalid escape sequences such as "\C" and "\S", which Python
# warns about and plans to reject.
DATA_DIR = r"P:\7670\Common\2. Ad hoc requests (Task 2F )\Star Ratings\Cross-site\Marketing scan\August 2016\99_Idea Lab\Work\Week 11\Training Datasets"
os.chdir(DATA_DIR)
# See what files are in the folder.
os.listdir()

['model_test_11.1.16_v4.csv',
 'archive',
 'R_training_subset_11.1.16.csv',
 'provider type files',
 'model_test_11.1.16_v3.csv',
 'training_subset_11.1.16.xlsx',
 'clean_model_text.csv',
 'BoW-RF_test1_paradata_11.1.16.xlsx',
 'train_v.csv',
 'test_v.csv',
 'test_results_T1.csv',
 'R_model_test_11.1.16_v2.csv',
 'model_test_11.1.16_v3.xlsx',
 'model_test_11.1.16_v4.xlsx',
 'model_test_11.1.16.xlsx']

In [7]:
# Read the training data from the Excel workbook.
#   "header=0" indicates that the first row of the sheet contains column names.
# NOTE: "delimiter" and "quoting" are pd.read_csv options and do not apply to
# Excel workbooks, so they are not passed to pd.read_excel here.
train = pd.read_excel("model_test_11.1.16_v4.xlsx", header=0)

# Preview the first five rows
train.head()

Unnamed: 0,id,p_type,root,url,text,coded_page_mm,coded_site_mm
0,1,HC,okheart.com,https://www.okheart.com/physicians/jerome-p-ma...,skip main content menu pay bill give careers l...,0,0
1,2,HC,abbevilleareamc.com,http://www.abbevilleareamc.com/volunteers/appl...,abbeville area medical center slider volunteer...,0,1
2,3,HC,cch-inc.com,https://www.cch-inc.com/health-wellness/educat...,body navigation logo columbus community hospit...,0,0
3,4,HHC,homecareofcommonwealth.com,http://www.homecareofcommonwealth.com/northeas...,careers contact us espanol call us today us ho...,0,1
4,5,HC,bellevuehospital.com,http://www.bellevuehospital.com/Hospital%20Ser...,1400 west main street bellevue ohio 44811 us p...,0,0


In [5]:
# Cache for the NLTK stop-word set so the corpus is read once,
# not once per document.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def MM_to_words( raw_MM ):
    """Convert one raw MM text into a cleaned, space-separated string of words.

    Parameters
    ----------
    raw_MM : str
        A single raw text, possibly containing HTML markup. ``None`` and NaN
        (how pandas represents an empty spreadsheet cell) are treated as an
        empty document; any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        Lower-cased words with HTML, non-letter characters and English stop
        words removed, joined by single spaces.
    """
    if not isinstance(raw_MM, str):
        if raw_MM is None or pd.isna(raw_MM):
            return ""
        raw_MM = str(raw_MM)
    #
    # 1. Remove HTML
    MM_text = BeautifulSoup(raw_MM, "lxml").get_text()
    #
    # 2. Replace every non-letter character with a space
    letters_only = re.sub("[^a-zA-Z]", " ", MM_text)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    #
    # 4. Searching a set is much faster than searching a list; the set is
    #    built once and reused across calls
    stops = _english_stopwords()
    #
    # 5. Remove stop words
    meaningful_words = [w for w in words if w not in stops]
    #
    # 6. Join the words back into one string separated by space,
    #    to make the output easier to use in our Bag of Words,
    #    then return the result.
    return " ".join(meaningful_words)

In [8]:
# Now let's loop through and clean all of the training set at once.
# The set is cleaned in a single pass; previously it was cleaned twice and
# the first result was thrown away.

# Get the number of MMs based on the dataframe column size
num_MM = train["text"].size

print ("Cleaning and parsing the training set CMS MMs...\n")

# Collect the clean MMs in order. Iterating over the column values is
# positional, so it does not depend on the DataFrame's index labels.
clean_train_MM = []
for i, raw_text in enumerate(train["text"], start=1):
    # Status update every 1,000 rows
    if i % 1000 == 0:
        print ("MM %d of %d\n" % ( i, num_MM ))
    clean_train_MM.append( MM_to_words( raw_text ) )

Cleaning and parsing the training set CMS MMs...

MM 1000 of 27239

MM 2000 of 27239

MM 3000 of 27239

MM 4000 of 27239

MM 5000 of 27239

MM 6000 of 27239

MM 7000 of 27239

MM 8000 of 27239

MM 9000 of 27239

MM 10000 of 27239

MM 11000 of 27239

MM 12000 of 27239

MM 13000 of 27239

MM 14000 of 27239

MM 15000 of 27239

MM 16000 of 27239

MM 17000 of 27239

MM 18000 of 27239

MM 19000 of 27239

MM 20000 of 27239

MM 21000 of 27239

MM 22000 of 27239

MM 23000 of 27239

MM 24000 of 27239

MM 25000 of 27239

MM 26000 of 27239

MM 27000 of 27239



In [9]:
print ("Creating the bag of words...\n")

# If you are using Anaconda, scikit-learn should already be installed.
# Otherwise you will have to install it.
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

# Configure the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
#
# NOTE: CountVectorizer comes with its own options to automatically do
# preprocessing, tokenization, and stop word removal -- for each of these,
# instead of specifying None, we could have used a built-in method or
# specified our own function to use. However, we wanted to write our own
# function for data cleaning in this tutorial to show you how it's done
# step by step.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    ngram_range=(1, 3),
    max_features=10000,  # keep only the 10,000 most frequent terms (1- to 3-word n-grams)
)

# fit_transform() does two things: first, it fits the model and learns the
# vocabulary; second, it transforms our training data into feature vectors.
# The input to fit_transform should be a list of strings.
# Numpy arrays are easy to work with, so the result is converted to a dense
# array straight away.
train_data_features = vectorizer.fit_transform(clean_train_MM).toarray()

Creating the bag of words...



In [11]:
# Fit an XGBoost classifier on the bag-of-words features and report how well
# it reproduces the training labels. These are in-sample metrics: they show
# how closely the model fits the training data, not how well it generalizes.
model = xgboost.XGBClassifier()

# train_data_features was already converted to a dense numpy array in the
# bag-of-words cell, so it is used as-is (an ndarray has no .toarray()).
trainpredictors = train_data_features

# Page-level label; the column is named "coded_page_mm" in the training sheet.
outcome = np.array(train['coded_page_mm'])

model.fit(trainpredictors, outcome)

expected = outcome
predicted_train = model.predict(trainpredictors)
print(sklearn.metrics.classification_report(expected, predicted_train))
print(sklearn.metrics.confusion_matrix(expected, predicted_train))

Training the random forest...


In [16]:
# Note that when we use the Bag of Words for the test set we only call "transform",
#  not "fit_transform" as we did for the training set.
# In machine learning, you shouldn't use the test set to fit your model,
# otherwise you run the risk of overfitting.
# For this reason, we keep the test set off-limits until we are ready to make predictions.
#
# NOTE(review): the training workbook is "model_test_11.1.16_v4.xlsx" and this
# one is "model_test_11.1.16.xlsx" -- confirm the two files do not share rows,
# otherwise the test metrics will be optimistic.

# Read the test data ("delimiter" and "quoting" are pd.read_csv options and
# do not apply to Excel workbooks, so they are not passed here).
test = pd.read_excel("model_test_11.1.16.xlsx", header=0)

In [17]:
# Create an empty list and append the clean MMs one by one
num_MM = len(test["text"])
clean_test_MM = []

print ("Cleaning and parsing the test set MMs...\n")
# Iterate over the column values positionally so the loop does not depend on
# the DataFrame's index labels.
for i, raw_text in enumerate(test["text"], start=1):
    # Status update every 1,000 rows
    if i % 1000 == 0:
        print ("CMS MM %d of %d\n" % (i, num_MM))
    clean_test_MM.append( MM_to_words( raw_text ) )

Cleaning and parsing the test set MMs...

CMS MM 1000 of 27322

CMS MM 2000 of 27322

CMS MM 3000 of 27322

CMS MM 4000 of 27322

CMS MM 5000 of 27322

CMS MM 6000 of 27322

CMS MM 7000 of 27322

CMS MM 8000 of 27322

CMS MM 9000 of 27322

CMS MM 10000 of 27322

CMS MM 11000 of 27322

CMS MM 12000 of 27322

CMS MM 13000 of 27322

CMS MM 14000 of 27322

CMS MM 15000 of 27322

CMS MM 16000 of 27322

CMS MM 17000 of 27322

CMS MM 18000 of 27322

CMS MM 19000 of 27322

CMS MM 20000 of 27322

CMS MM 21000 of 27322

CMS MM 22000 of 27322

CMS MM 23000 of 27322

CMS MM 24000 of 27322

CMS MM 25000 of 27322

CMS MM 26000 of 27322

CMS MM 27000 of 27322



In [18]:
# Evaluate the already-trained model on the held-out test set.

# Turn the cleaned test texts into features using the vocabulary learned from
# the training set: "transform" only, never "fit_transform", on test data.
test_data_features = vectorizer.transform(clean_test_MM)
testpredictors = test_data_features.toarray()

# The training sheet names the page-level label "coded_page_mm"; fall back to
# "page_MM" in case the test workbook uses the older name.
# NOTE(review): confirm the label column name in the test workbook.
label_column = 'coded_page_mm' if 'coded_page_mm' in test.columns else 'page_MM'
outcome = np.array(test[label_column])

# The model is NOT refit here: fitting on the test set would make these
# metrics a measure of training fit rather than of generalization.
expected = outcome
predicted_test = model.predict(testpredictors)
print(sklearn.metrics.classification_report(expected, predicted_test))
print(sklearn.metrics.confusion_matrix(expected, predicted_test))