In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
df = pd.read_csv('healthcare_reviews_v1.csv')
df.head()

Unnamed: 0,Review_Text,Rating,Sentiment
0,I have mixed feelings about my experience.,4,2
1,The staff was caring and attentive. I couldn't...,5,2
2,I have mixed feelings about my experience.,5,2
3,I have mixed feelings about my experience.,5,2
4,The healthcare provider was excellent. I had a...,3,1


In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Parthi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
port_stem = PorterStemmer()

In [5]:
def stemming(review):
    stemmed_review = re.sub('[^a-zA-Z]',' ', review)
    stemmed_review = stemmed_review.lower()
    stemmed_review = stemmed_review.split()
    stemmed_review = [port_stem.stem(word) for word in stemmed_review if not word in stopwords.words('english')]
    stemmed_review = ' '.join(stemmed_review)

    return stemmed_review

In [6]:
df['stemmed_review'] = df['Review_Text'].apply(stemming)

In [7]:
print(df['Sentiment'].value_counts())

Sentiment
2    434
0    398
1    168
Name: count, dtype: int64


In [8]:
X= df['stemmed_review'].values
y = df['Sentiment'].values

In [9]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.20, stratify= y, random_state=2)

#### TFIDF vectorization for converting text to numeric 

In [10]:
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

#### SMOTE for overcome imbalance in target column values

In [11]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to oversample the minority classes
X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)

In [12]:
# Convert the resampled data back to a dense DataFrame

resampled_df = pd.DataFrame(X_resampled.todense()) 
resampled_df['Sentiment'] = y_resampled

# Check the distribution of the Sentiment column after SMOTE

print(resampled_df['Sentiment'].value_counts())


Sentiment
2    347
0    347
1    347
Name: count, dtype: int64


#### Splitting the model for train and test set

In [13]:
X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=2
)

#### Training the model for Support vector machine (SVM) model after SMOTE

In [14]:
from sklearn.svm import SVC

classifier = SVC(random_state=0)
classifier.fit(X_train_resampled,y_train_resampled)

In [15]:
y_pred = classifier.predict(X_test_resampled)


In [16]:
cm = confusion_matrix(y_test_resampled,y_pred)
print(cm)

[[29 18 14]
 [23 40 10]
 [35 26 14]]


In [17]:
test_data_accuracy = accuracy_score(y_test_resampled,y_pred)
print ('Accuracy score for test set:',test_data_accuracy)

Accuracy score for test set: 0.39712918660287083


#### Logistic Regression

In [18]:
model = LogisticRegression()
model.fit(X_train_resampled, y_train_resampled)

In [19]:
from sklearn.metrics import classification_report, accuracy_score

predictions = model.predict(X_test)

In [20]:
# Accuracy score
X_test_pred = model.predict(X_test_resampled)
test_data_accuracy = accuracy_score(y_test_resampled,X_test_pred)

In [21]:
print ('Accuracy score for test set:',test_data_accuracy)

Accuracy score for test set: 0.39712918660287083


In [22]:
print("Accuracy:", accuracy_score(y_test, predictions))
#print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.325
