## Linear Regression Model

In [1]:
# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import os
import pandas as pd
import numpy as np

In [2]:
# Set the working directory to the folder where your data is stored
os.chdir("C:\\Users\lickfett-john\Documents\Python Scripts\small_sample")
# See what files are in the folder.
os.listdir()

['results_train-test_LRM.csv', 'test.xlsx', 'train.xlsx', 'validate.xlsx']

In [3]:
# Read the training data
train = pd.read_excel("train.xlsx", header=0, \
                    delimiter="\t", quoting=3)

#   "header=0" indicates that the first line of the file contains column names, 
#   "delimiter=\t" indicates that the fields are separated by tabs,
#   quoting=3 tells Python to ignore doubled quotes, otherwise you may encounter errors trying to read the file.

In [4]:
# Import BeautifulSoup into your workspace
import bs4
from bs4 import BeautifulSoup             
import nltk, re
from nltk.corpus import stopwords # Import the stop word list

In [5]:
def MM_to_words( raw_MM ):
    # Function to convert a raw CMS MM to a string of words
    # The input is a single string (a raw MM text), and 
    # the output is a single string (a preprocessed MM text)
    #
    # 1. Remove HTML
    MM_text = BeautifulSoup(raw_MM,"lxml").get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z,4,5,&,-]", " ", MM_text)
    #letters_only = re.sub("[^a-zA-Z,4,5,&,-]", " ", MM_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #meaningful_words = [w for w in words]  #If you don't want stops, use this instead.
    #
    # 6. Join the words back into one string separated by space, 
    #    to make the output easier to use in our Bag of Words,
    #    then return the result. 
    return( " ".join( meaningful_words ))

In [6]:
# Now let's loop through and clean all of the training set at once

# Get the number of MMs based on the dataframe column size
num_MM = train["Visible Text"].size

# Initialize an empty list to hold the clean MMs
clean_train_MM = []

# Loop over each MM; create an index i that goes from 0 to the length
# of theMM list 
for i in range( 0, num_MM ):
    # Call our function for each one, and add the result to the list of
    # clean MMs
    clean_train_MM.append( MM_to_words( train["Visible Text"][i] ) )

# This next bit is optional, and it will give you status updates for every 1,000 rows.
print ("Cleaning and parsing the training set CMS MMs...\n")
clean_train_MM = []
for i in range( 0, num_MM ):
    # If the index is evenly divisible by 100, print a message
    if( (i+1)%100 == 0 ):
        print ("MM %d of %d\n" % ( i+1, num_MM ))                                                                    
    clean_train_MM.append( MM_to_words( train["Visible Text"][i] ))

Cleaning and parsing the training set CMS MMs...

MM 100 of 127



### Bag of Words and TF-IDF Frequencies

In [7]:
print ("Creating the bag of words...\n")

import sklearn
# If you are using Anaconda, Sci-Kit Learn should already be installed. Otherwise you will have to install it.
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             ngram_range=(1, 6),
                             max_features = 9750) # This limits to the 10,000 most frequent words

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
vtrain_data_features = vectorizer.fit_transform(clean_train_MM)

# Numpy arrays are easy to work with, so convert the result to an \

# array
train_data_features = vtrain_data_features.toarray()

# NOTE: CountVectorizer comes with its own options to automatically do preprocessing,
# tokenization, and stop word removal -- for each of these, instead of specifying "None",
# we could have used a built-in method or specified our own function to use.
# However, we wanted to write our own function for data cleaning in this tutorial 
# to show you how it's done step by step.

Creating the bag of words...



In [8]:
# Now let's import the Python package named scikit-learn, which has a TF-IDF vectorizer
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
### This step is optional ###
# Instead of normal countvectors, we can try using TF-IDF vectors
##TF-IDF stands for “term frequency / inverse document frequency” and is a method 
##for emphasizing words that occur frequently in a given document, while at the same time de-emphasising words that occur frequently in many documents.

TFvectorizer = TfidfVectorizer(#decode_error=u'strict',
                             #strip_accents=None, 
                             #lowercase=True,
                             #preprocessor=None,
                             #tokenizer=None,
                             #analyzer=u'word',
                             #stop_words=None,
                             #token_pattern=u'(?u)\b\w\w+\b',
                             ngram_range=(1, 6),
                             #max_df=1.0,
                             #min_df=1,
                             max_features=9750,
                             #vocabulary=vocab,
                             binary=False,
                             norm=u'l2',
                             use_idf=True,
                             smooth_idf=True,
                             sublinear_tf=False)

train_data_features = TFvectorizer.fit_transform(clean_train_MM)
trainpredictors=train_data_features.toarray()

## Linear Regression Model

In [10]:
# We will import the linear regression model from scikit-learn
from sklearn.linear_model import LinearRegression

In [11]:
# Here we specify the parameters of the model
Lmodel = LinearRegression(fit_intercept=True,
                          normalize=False,
                          copy_X=True,
                          n_jobs=1)

# This fits the linear regression model to the training dataset
Lmodel = Lmodel.fit( train_data_features, train["page_MM"] )

In [15]:
###Import the test data###
# Note that when we use the Bag of Words for the test set we only call "transform", 
# not "fit_transform" as we did for the training set. 
# In machine learning, you shouldn't use the test set to fit your model, 
# otherwise you run the risk of overfitting. 
# For this reason, we keep the test set off-limits until we are ready to make predictions.

# Read the test data
test = pd.read_excel("population_dataset_11.7.16.xlsx", header=0, delimiter="\t", \
                   quoting=3 )

In [16]:
# Create an empty list and append the clean MMs one by one
num_MM = len(test["Visible Text"])
clean_test_MM = [] 

print ("Cleaning and parsing the test set MMs...\n")
for i in range(0,num_MM):
    if( (i+1) % 100 == 0 ):
        print ("CMS MM %d of %d\n" % (i+1, num_MM))
    clean_MM = MM_to_words( test["Visible Text"][i] )
    clean_test_MM.append( clean_MM )

Cleaning and parsing the test set MMs...

CMS MM 100 of 54559

CMS MM 200 of 54559

CMS MM 300 of 54559

CMS MM 400 of 54559

CMS MM 500 of 54559

CMS MM 600 of 54559

CMS MM 700 of 54559

CMS MM 800 of 54559

CMS MM 900 of 54559

CMS MM 1000 of 54559

CMS MM 1100 of 54559

CMS MM 1200 of 54559

CMS MM 1300 of 54559

CMS MM 1400 of 54559

CMS MM 1500 of 54559

CMS MM 1600 of 54559

CMS MM 1700 of 54559

CMS MM 1800 of 54559

CMS MM 1900 of 54559

CMS MM 2000 of 54559

CMS MM 2100 of 54559

CMS MM 2200 of 54559

CMS MM 2300 of 54559

CMS MM 2400 of 54559

CMS MM 2500 of 54559

CMS MM 2600 of 54559

CMS MM 2700 of 54559

CMS MM 2800 of 54559

CMS MM 2900 of 54559

CMS MM 3000 of 54559

CMS MM 3100 of 54559

CMS MM 3200 of 54559

CMS MM 3300 of 54559

CMS MM 3400 of 54559

CMS MM 3500 of 54559

CMS MM 3600 of 54559

CMS MM 3700 of 54559

CMS MM 3800 of 54559

CMS MM 3900 of 54559

CMS MM 4000 of 54559

CMS MM 4100 of 54559

CMS MM 4200 of 54559

CMS MM 4300 of 54559

CMS MM 4400 of 54559


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup

CMS MM 36600 of 54559

CMS MM 36700 of 54559



  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup

CMS MM 36800 of 54559

CMS MM 36900 of 54559

CMS MM 37000 of 54559



  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup

CMS MM 37100 of 54559

CMS MM 37200 of 54559

CMS MM 37300 of 54559



  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup

CMS MM 37400 of 54559

CMS MM 37500 of 54559

CMS MM 37600 of 54559

CMS MM 37700 of 54559

CMS MM 37800 of 54559

CMS MM 37900 of 54559

CMS MM 38000 of 54559

CMS MM 38100 of 54559

CMS MM 38200 of 54559

CMS MM 38300 of 54559

CMS MM 38400 of 54559

CMS MM 38500 of 54559

CMS MM 38600 of 54559

CMS MM 38700 of 54559

CMS MM 38800 of 54559

CMS MM 38900 of 54559

CMS MM 39000 of 54559

CMS MM 39100 of 54559

CMS MM 39200 of 54559

CMS MM 39300 of 54559

CMS MM 39400 of 54559

CMS MM 39500 of 54559

CMS MM 39600 of 54559

CMS MM 39700 of 54559

CMS MM 39800 of 54559

CMS MM 39900 of 54559

CMS MM 40000 of 54559

CMS MM 40100 of 54559

CMS MM 40200 of 54559

CMS MM 40300 of 54559

CMS MM 40400 of 54559

CMS MM 40500 of 54559

CMS MM 40600 of 54559

CMS MM 40700 of 54559

CMS MM 40800 of 54559

CMS MM 40900 of 54559

CMS MM 41000 of 54559

CMS MM 41100 of 54559

CMS MM 41200 of 54559

CMS MM 41300 of 54559

CMS MM 41400 of 54559

CMS MM 41500 of 54559

CMS MM 41600 of 54559

CMS MM 4170

In [17]:
# Get a bag of words for the test set, and convert to a numpy array
vtest_data_features = vectorizer.transform(clean_test_MM)
test_data_features = vtest_data_features.toarray()

In [18]:
###OPTIONAL###
# If using TF-IDF, turn the test dataset vectors into TF-IDF vectors

test_data_features = TFvectorizer.transform(clean_test_MM)
testpredictors=test_data_features.toarray()

In [19]:
# Use the linear regression model to make CMS MM predictions
result = Lmodel.predict(test_data_features)
#result   #enable this to see what the predicted scores look like

In [20]:
###OPTIONAL###
# Copy the results to a pandas dataframe with an "PrimaryKey" column and
# a "MMExist" column
output = pd.DataFrame( data={"id":test["id"], "mm_predict":result} )

# Use pandas to write the comma-separated output file
output.to_csv( "results_train-POPtest_LRM.csv", index=False, quoting=3 )

In [21]:
# An accuracy metric showing how well the linear model predicts on the test dataset
from sklearn.metrics import roc_auc_score
y_true = np.array(test['page_MM'])
y_scores = result
roc_auc_score(y_true, y_scores)

KeyError: 'page_MM'

### Plot the ROC Curve

In [29]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import random

In [30]:
expectedtest = np.array(test["page_MM"])
predicted_test = result

In [31]:
#false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, result)
false_positive_rate, true_positive_rate, thresholds = roc_curve(expectedtest, predicted_test)
roc_auc = auc(false_positive_rate, true_positive_rate)

In [32]:
# There is a tradeoff betwen the TPR and FPR as we move the threshold of the classifier.
# When the test is more accurate the roc curve is closer to the left top borders
# A useless classifier is one that has its ROC curve exactly aligned with the diagonal.


plt.title('Receiver Operating Characteristic (ROC)')
plt.plot(false_positive_rate, true_positive_rate, 'b',
label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Concerning the AUC, a simple rule of thumb to evaluate a classifier based on this summary value is the following:

# .90-1 = very good (A)
# .80-.90 = good (B)
# .70-.80 = not so good (C)
# .60-.70 = poor (D)
# .50-.60 = fail (F)