##  England NHS GP Reviews Sentiment Analysis
source: https://huggingface.co/datasets/janduplessis886/england-nhs-gp-reviews/viewer/default/train

In [11]:
import pandas as pd

# Load the GP reviews dataset; the relative path is resolved against the current working directory.
df = pd.read_csv('england-nhs-gp-reviews.csv')
# Preview the first rows to sanity-check the columns.
df.head()

Unnamed: 0,ode,surgeryname,url,title,star_rating,comment,visited_date
0,E81050,asplands-medical-centre,https://www.nhs.uk/services/gp-surgery/aspland...,What's changed?,3,Have been with this practice for a number of y...,August 2022
1,E81050,asplands-medical-centre,https://www.nhs.uk/services/gp-surgery/aspland...,Woburn surgery,5,I have been a patient at this practice for man...,July 2022
2,E81050,asplands-medical-centre,https://www.nhs.uk/services/gp-surgery/aspland...,Don't waste your time GPs never available,1,"Visited my gp, over resistant hypertension. Gr...",June 2022
3,E81050,asplands-medical-centre,https://www.nhs.uk/services/gp-surgery/aspland...,Great practice,5,I contacted the surgery by telephone for a non...,June 2022
4,K82064,fishermead-medical-centre,https://www.nhs.uk/services/gp-surgery/fisherm...,Welcoming and supportive,5,I have great respect for the staff at Fisherme...,July 2023


In [2]:
# df.shape is (rows, columns); report both dimensions.
print("Number of rows: ", df.shape[0],"\nNumber of Columns: ", df.shape[1])

Number of rows:  61955 
Number of Columns:  7


#### Check for missing data

In [3]:
# Per-column dtype and non-null count; a non-null count below the row count would indicate missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61955 entries, 0 to 61954
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ode           61955 non-null  object
 1   surgeryname   61955 non-null  object
 2   url           61955 non-null  object
 3   title         61955 non-null  object
 4   star_rating   61955 non-null  int64 
 5   comment       61955 non-null  object
 6   visited_date  61955 non-null  object
dtypes: int64(1), object(6)
memory usage: 3.3+ MB


No missing data: all 61955 rows are non-null in every column.

### Features (comment)

In [6]:
# Only the free-text review body is used as the feature for keyword extraction.
features = df['comment']


In [7]:
# Preview the raw comments before any text cleaning.
features.head()

0    Have been with this practice for a number of y...
1    I have been a patient at this practice for man...
2    Visited my gp, over resistant hypertension. Gr...
3    I contacted the surgery by telephone for a non...
4    I have great respect for the staff at Fisherme...
Name: comment, dtype: object

### WordNetLemmatizer function

In [5]:
import nltk
# One-off NLTK data downloads; each call is a no-op when the package is already up to date.
nltk.download('stopwords')
nltk.download('wordnet')
# word_tokenize() relies on the Punkt tokenizer data. Without it a fresh
# environment raises LookupError the first time lemmatizer() is called.
nltk.download('punkt')
# NOTE(review): newer NLTK releases look the same data up under 'punkt_tab';
# requesting both keeps the notebook runnable across versions — confirm against
# the installed NLTK version.
nltk.download('punkt_tab')

import string

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize

def lemmatizer(text):
    """Normalise free text into a space-separated string of lemmas.

    The text is tokenized with ``word_tokenize``, each token is lower-cased,
    English stop words are dropped, the remaining tokens are lemmatized with
    ``WordNetLemmatizer`` and tokens made up solely of punctuation are removed.

    Parameters
    ----------
    text : str
        Raw comment / feedback text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (empty string if nothing
        survives).
    """
    # Named `wordnet` so the local no longer shadows this function's own name.
    wordnet = WordNetLemmatizer()
    # Sets give constant-time membership tests; the stop-word list was
    # previously scanned linearly for every token.
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    lemmas = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        lemma = wordnet.lemmatize(lowered)
        # Drop tokens consisting only of punctuation characters. The previous
        # `token not in string.punctuation` was a substring test against one
        # long string, so multi-character tokens such as "...", "''" or "``"
        # slipped through.
        if all(char in punctuation_chars for char in lemma):
            continue
        lemmas.append(lemma)

    return ' '.join(lemmas)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kianm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kianm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Lemmatize features

In [10]:
# NOTE: this overwrites `features` with its lemmatized form, so the cell is not
# idempotent — re-running it lemmatizes already-lemmatized text.
features = features.apply(lemmatizer)

In [11]:
# Preview the comments after lemmatization.
features.head()

0    practice number year always found excellent ho...
1    patient practice many year always found helpfu...
2    visited gp resistant hypertension great appoin...
3    contacted surgery telephone non urgent appoint...
4    great respect staff fishermead medical centre ...
Name: comment, dtype: object

### Train the model

### Getting Keywords from comments

In [24]:
# `features` was already lemmatized in the "Lemmatize features" cell, so it is
# reused here instead of being run through lemmatizer() a second time. The extra
# pass was a redundant full scan of the dataset and made the training text
# (two passes) differ from inference-time feedback, which is lemmatized once.
featuresClean = features.copy()
featuresClean.head(10)

0    practice number year always found excellent ho...
1    patient practice many year always found helpfu...
2    visited gp resistant hypertension great appoin...
3    contacted surgery telephone non urgent appoint...
4    great respect staff fishermead medical centre ...
5    tried register go doctor list told catchment a...
6    staff rude talk cant get appointment love mone...
7    contact two member staff practice last week on...
8    ringing 2 day trying sort medication due 2 day...
9    queued outside practice 7:30 told appointment ...
Name: comment, dtype: object

### Text Featurization

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

# Fit the vectorizer on the cleaned comments; no array is produced here — the fitted vectorizer object is what gets stored.
tfidf_vecs_model = tfidf.fit(featuresClean)



### Export the fitted TF-IDF vectorizer model

In [28]:
import pickle

# Persist the fitted vectorizer so it can be reloaded without refitting on the full dataset.
with open('tfidfVec.pkl', 'wb') as file:
    pickle.dump(tfidf_vecs_model, file)


In [1]:
import pickle
# Open the pickle inside a context manager so the file handle is closed
# deterministically (the bare open() call previously leaked it).
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('tfidfVec.pkl', 'rb') as pkl_file:
    loaded_model = pickle.load(pkl_file)

#### Functions for loading the model and applying it to cleaned feedback

In [9]:
def load_model(model_path):
    """Unpickle and return the object stored at ``model_path``.

    Parameters
    ----------
    model_path : str or path-like
        Location of the pickle file to read.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    with open(model_path, "rb") as handle:
        return pickle.load(handle)

def keywords_df(preprocessedData, model):
    """Score one preprocessed document against a fitted vectorizer.

    Parameters
    ----------
    preprocessedData : str
        A single, already-cleaned document.
    model : object
        Fitted vectorizer exposing ``transform`` and ``get_feature_names_out``.

    Returns
    -------
    pandas.DataFrame
        One row of scores with one column per vocabulary term.
    """
    # transform() expects an iterable of documents, hence the one-element list.
    scores = model.transform([preprocessedData]).toarray()
    vocabulary = model.get_feature_names_out()
    return pd.DataFrame(data=scores, columns=vocabulary)

#### Testing the model

In [13]:
# End-to-end check on one hand-written review: clean it with the same
# lemmatizer() used on the training comments, reload the pickled vectorizer,
# and build the one-row score table.
feedback = "This service was terrible, but the doctor was professional and caring. I wish they could have more consistent service throughtout the entire staff"
clean_fdbk = lemmatizer(feedback)
model = load_model('tfidfVec.pkl')
df_keywords = keywords_df(clean_fdbk, model)
df_keywords.head(10)

Unnamed: 0,00,000,00a,00after,00am,00an,00h,00hrs,00p,00pm,...,zostavax,àgreed,àny,às,àssits,ààaaassssssssssssssssssssssssssssssssssssssssssddddddddddddddfdffffff,çomputer,în,îs,ın
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# One row (the single feedback) by one column per vocabulary term.
df_keywords.shape

(1, 26521)

#### Take the top 10 keywords by score

In [15]:
# Column-wise max collapses the frame to one score per vocabulary term.
term_scores = df_keywords.max()
# Keep only terms that actually occur in the feedback: without this filter,
# feedback with fewer than 10 known terms would be padded with zero-score
# vocabulary entries that are not keywords at all.
keywords_dict = term_scores[term_scores > 0].nlargest(10).to_dict()

In [16]:
# Show the selected keywords with their scores.
keywords_dict

{'consistent': 0.4930864880830275,
 'entire': 0.42060072286320466,
 'wish': 0.36111666796518493,
 'terrible': 0.35491226769692136,
 'service': 0.3369782987540155,
 'caring': 0.2559412825312889,
 'professional': 0.23501236886932628,
 'could': 0.21264279908679223,
 'staff': 0.15713095065391997,
 'doctor': 0.13926434733804982}

#### Get only the top 10 keywords without the score

In [17]:
# Iterating a dict yields its keys in insertion order, so this keeps the keywords ranked by score.
keywords_lst = list(keywords_dict)
keywords_lst

['consistent',
 'entire',
 'wish',
 'terrible',
 'service',
 'caring',
 'professional',
 'could',
 'staff',
 'doctor']