In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk import bigrams
from nltk import word_tokenize
from sklearn.linear_model import LogisticRegression



In [2]:
## change to the appropriate path
df = pd.read_csv('structured_data.csv',encoding='latin1',sep='\t') # seperator is \t as file is tab-delimited
df.head()

Unnamed: 0.1,Unnamed: 0,report status,intro text,Date of inspection visit:,Date of publication:,Inspection report,Summary of findings,Is the service safe?,Is the service effective?,Is the service caring?,...,Is the service safe?.1,Our findings,Is the service effective?.1,Our findings.1,Is the service caring?.1,Our findings.2,Is the service responsive?.1,Our findings.3,Is the service well-led?.1,Our findings.4
0,0,outstanding,Embrace (Geffen) Limited Nunthorpe Oaks Inspe...,21 February 2017 07 March 2017,24 May 2017 Outstanding Good Good ...,24 May 2017,Overall summary We inspected Nunthorpe Oaks...,The service was safe. People told us they f...,The service was effective. Staff had the k...,This service was extremely caring People w...,...,Good,People and their relatives told us they fel...,Good,People who used the service told us staff p...,Outstanding,"People, relatives and professionals praised...",Outstanding,People who used the service and relatives c...,Outstanding,The service was exceptionally well-led. It ...
1,1,outstanding,Insight Specialist Behavioural Service Ltd Asp...,04 December 2017,09 February 2018 Outstanding Good ...,09 February 2018,Overall summary Aspley House is a residenti...,The service remains Good,The service remains good.,The service was exceptionally caring. The p...,...,Good,The safety of everyone at the service both ...,Good,"Health professionals told us, ""I am always ...",Outstanding,A health and social care professional told ...,Outstanding,"People lived active and meaningful lives, t...",Outstanding,"Staff told us, ""The philosophy of the provi..."
2,2,outstanding,Miss S G Howard Victoria Lodge Care Home,26 January 2017,22 March 2017 Outstanding Good Goo...,11 Victoria Road Acocks Green Birmingham Wes...,Overall summary This unannounced inspection...,The service was safe People told us they fe...,The service was effective. There were arra...,The service was very caring. People receive...,...,Good,People and relatives all gave us very posit...,Good,People and relatives we spoke with told us ...,Outstanding,People living in Victoria Lodge were happy ...,Good,Relatives told us that the supportive and r...,Outstanding,The registered manager and provider promote...
3,3,outstanding,Hale Place Care Homes Limited Hale Place Farmh...,04 September 2017 07 September 2017,10 November 2017 Outstanding Good ...,79 Old Road East Peckham Tonbridge Kent TN12...,Overall summary This inspection took place ...,Hale Place Farmhouse was safe. People were ...,Hale Place Farmhouse was effective. Staff r...,Outstanding The service was outstanding i...,...,Good,People told us that they felt safe living a...,Good,People and their relatives spoke positively...,Outstanding,People and their relatives were consistentl...,Outstanding,People were receiving a person centred serv...,Outstanding,People and their relatives were consistentl...
4,4,outstanding,Healthcare Homes (LSC) Limited Sandown Park Ca...,09 January 2018 10 January 2018,27 February 2018 Outstanding Good ...,27 February 2018,Overall summary Our inspection took place o...,The service was safe. Effective systems wer...,"The service was effective. People's likes, ...",Outstanding 4 Sandown Park Care Home Insp...,...,Good,People and relatives provided positive feed...,Good,People and relatives told us they felt staf...,Outstanding,The service continued to provide outstandin...,Outstanding,The service continued to provide outstandin...,Outstanding,The service's management and care team were...


In [28]:
#define parameters of count vectorizer
vec = CountVectorizer(analyzer="word",stop_words='english',
                                   ngram_range=(1, 2),
                                   tokenizer=word_tokenize,
                                   max_features=10000)


lr=LogisticRegression()


#define a scikit learn pipeline
from sklearn.pipeline import Pipeline
bigram_clf = Pipeline([


    ('vectorizer', vec),
    
    ('classifier', lr)     # or any other classifier deemed useful!
   
])










In [29]:
ordered_status = ['good','inadequate','outstanding', 'requires_improvement' ]

In [30]:
#create labels that can be processed by the ML algorithm (i.e., Encode labels with value between 0 and (n_of_labels)-1.).

from sklearn.preprocessing import LabelEncoder

lb_make = LabelEncoder()
df['label'] = lb_make.fit_transform(df['report status'])

#check original column and label column

df[['report status', 'label']]




Unnamed: 0,report status,label
0,outstanding,2
1,outstanding,2
2,outstanding,2
3,outstanding,2
4,outstanding,2
5,requires_improvement,3
6,requires_improvement,3
7,requires_improvement,3
8,requires_improvement,3
9,requires_improvement,3


In [31]:
# Our goal is to train a model to predict the report status of each document 
# (using the newly-created numeric variable "label") from the feature "Our findings" (which represent...)


#define X as text from column 'Our findings'
X=df['Our findings'].values
#define y as text from column 'label' created above
y=df.label.values


In [32]:
# We took the .values because the ML algorythm wants ndarray (no pandas Series)

print(type(df['Our findings'].values))
print(type(df['Our findings']))


<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


In [33]:
# split data into train and test set randomly (but in a way that is repeatable with seed random_state)
# we set the test data to be 20% of the total data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
# fit the train data into our transformer/classifier pipeline

print(bigram_clf.fit(X_train, y_train))



Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
 ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])


In [35]:
# Let's predict the labels for the test data (that haven't been used to train the model)
label_pred = bigram_clf.predict(X_test)

In [36]:
# Let's see the predictions and the real values 
print('These are the predicted labels for the test data: ' + str(label_pred))
print('These are the original test labels hat we tried to predict: ' + str(y_test))

# The model did ok: it predicted 3 out of 4 labels correctly

These are the predicted labels for the test data: [2 1 1 0]
These are the original test labels hat we tried to predict: [2 1 1 2]


In [37]:
#score the test data

print(bigram_clf.score(X_test, y_test))

0.75


In [38]:
# let's calculate the AUC (the area under the ROC curve) to evaluate the model
from sklearn.metrics import roc_auc_score

label_pred_prob = bigram_clf.predict_proba(X_test)[:,1]
print(label_pred_prob)
#roc_auc_score(y_test, label_pred_prob)

[ 0.01083155  0.7853501   0.84921508  0.23187166]





### debug classifiers and explain their predictions

In [39]:
import eli5

ModuleNotFoundError: No module named 'eli5'

In [40]:
eli5.show_weights(lr, top=20,vec=vec,target_names=ordered_status) 

NameError: name 'eli5' is not defined

In [41]:
#explain prediction of a particular text
eli5.explain_prediction(lr, X_test[1],vec=vec, top=15,target_names=ordered_status)

NameError: name 'eli5' is not defined

In [42]:
#explaining the predictions of  ad hoc text. here a (kind of nonsensical) sentence based on tokens 
# more important in the 'outstanding' class
# change it to see how the classifier reacts.

eli5.explain_prediction(lr, "a person told us that having service safety actions rated as great",
                        vec=vec, top=15,target_names=ordered_status)

NameError: name 'eli5' is not defined