In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import re

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the reviews CSV. The original machine-specific path is kept as
# the default; set the HEALTHCARE_REVIEWS_CSV environment variable to run the
# notebook on another machine without editing this cell.
import os

DATA_PATH = os.environ.get('HEALTHCARE_REVIEWS_CSV',
                           'C:/Users/USER/Downloads/healthcare_reviews.csv')
df = pd.read_csv(DATA_PATH)

In [None]:
# Dimensions of the loaded frame as (n_rows, n_columns).
df.shape

In [None]:
# Column names, dtypes and non-null counts of the raw data.
df.info()

In [None]:
# Summary statistics (pandas describes the numeric columns by default).
df.describe()

In [None]:
# Number of reviews for each distinct Rating value, as a small frame.
df_counts = df["Rating"].value_counts().reset_index()
df_counts.head()

In [None]:
# Missing-value count per column, before any cleaning.
df.isnull().sum()

In [None]:
# Five randomly chosen rows as a quick spot check (unseeded, so they change per run).
df.sample(5)

In [None]:
#sentiment model

In [None]:
import numpy as np

def create_sentiment(Rating):
    """Translate a star rating into a three-way sentiment label.

    Returns -1 for ratings 1 and 2 (negative), 1 for ratings 4 and 5
    (positive), and 0 for everything else (3, missing or unexpected values).
    """
    if Rating in (1, 2):
        return -1
    if Rating in (4, 5):
        return 1
    return 0

# Derive the Sentiment label (-1 / 0 / 1) for every row from its Rating.
df['Sentiment'] = df['Rating'].apply(create_sentiment)

In [None]:
# Preview the frame with the new Sentiment column. head() keeps the output
# short and lets the notebook render it as a table instead of plain text.
df.head()

In [None]:
#data preprocessing

In [None]:
# Drop every row that has a missing value in any column. Rebinding `df`
# instead of mutating in place keeps the cell chain-friendly and safe to
# re-run; the resulting rows and index are the same as with inplace=True.
df = df.dropna(axis=0, how="any")

In [None]:
# Re-check dtypes and non-null counts after the rows with missing values were dropped.
df.info()

In [None]:
# Confirm that no missing values remain in any column.
df.isnull().sum()

In [None]:
# Class balance: number of reviews per sentiment label.
df['Sentiment'].value_counts()

In [None]:
# Bar chart of the number of reviews in each sentiment class.
sns.countplot(x='Sentiment',data= df)
plt.show()

In [None]:
def preprocessor(text):
    """Return a cleaned, lower-cased version of ``text``.

    HTML tags are removed, every run of non-word characters is collapsed
    into a single space, and the emoticons found in the text (for example
    ``:-)``, ``;D``, ``=P``) are appended at the end with the ``-`` nose
    removed. Emoticons keep their original case because they are collected
    before the text is lower-cased.

    The patterns are raw strings: ``\\)``, ``\\(`` and ``\\W`` are invalid
    escape sequences in ordinary string literals and trigger a warning on
    current Python versions.
    """
    # Remove HTML markup
    text = re.sub(r'<[^>]*>', '', text)
    # Save emoticons for later appending (punctuation is stripped below)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Replace runs of non-word characters with one space, convert to lower
    # case, then append the emoticons without their nose for standardization
    text = (re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))

    return text

In [None]:
from nltk.stem import PorterStemmer

# One shared stemmer instance, reused for every token.
porter = PorterStemmer()

def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each piece as a list."""
    return [porter.stem(piece) for piece in text.split()]

In [None]:
#model evaluation

In [None]:
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk

from sklearn.model_selection import train_test_split

# Target labels and raw review text.
y = df['Sentiment']
X = df['Review_Text']

my_additional_stop_words = []

#stop = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)
# NOTE(review): this is spaCy's *German* stop-word list, while the tokenizer
# uses NLTK's PorterStemmer (an English stemmer). If the reviews are English,
# the commented-out ENGLISH_STOP_WORDS line above is probably what was
# intended -- confirm against the data before changing it.
from spacy.lang.de.stop_words import STOP_WORDS
tfidf = TfidfVectorizer(stop_words=list(STOP_WORDS),
                        tokenizer=tokenizer_porter,
                        preprocessor=preprocessor)

# Split the raw text BEFORE fitting the vectorizer. Fitting TF-IDF on the
# whole corpus lets the test reviews influence the vocabulary and IDF weights
# (data leakage) and inflates the test scores. With the same test_size and
# random_state the same rows end up in each split as before.
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.186, random_state=142)

# Learn vocabulary / IDF on the training reviews only, then apply to the test reviews.
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

# Full-corpus feature matrix under the train-fitted vocabulary; kept so any
# cell that still refers to X_tfidf keeps working.
X_tfidf = tfidf.transform(X)

In [None]:
# Inspect the training feature matrix (its repr shows shape and sparsity).
X_train 

In [None]:
# logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression with scikit-learn's default settings,
# trained on the TF-IDF features of the training split.
clf = LogisticRegression()

clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Evaluate the logistic-regression model on the held-out test split.
predictions = clf.predict(X_test)

# Overall accuracy, confusion matrix (rows: true class, columns: predicted
# class) and the per-class precision / recall / F1 report.
print('The accuracy score is:',accuracy_score(y_test,predictions))
print('The confusion matrix is:','\n',confusion_matrix(y_test,predictions))
print('The classification report is:','\n',classification_report(y_test,predictions))

In [None]:
#DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree with default hyper-parameters. random_state is fixed (same
# seed as the train/test split) so the reported metrics are reproducible
# across Restart & Run All.
dtc = DecisionTreeClassifier(random_state=142)
dtc.fit(X_train,y_train)

In [None]:
# Decision-tree predictions for the held-out test split.
dtc_predictions = dtc.predict(X_test)

In [None]:
# Test-set accuracy, confusion matrix and per-class report for the decision tree.
print('The accuracy score is:',accuracy_score(y_test,dtc_predictions))
print('The confusion matrix is:','\n',confusion_matrix(y_test,dtc_predictions))
print('The classification report is:','\n',classification_report(y_test,dtc_predictions))

In [None]:
#RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default hyper-parameters. random_state is fixed (same
# seed as the train/test split) so the reported metrics are reproducible.
rfc = RandomForestClassifier(random_state=142)
rfc.fit(X_train, y_train)

In [None]:
# Random-forest predictions for the held-out test split.
rfc_predictions = rfc.predict(X_test)

In [None]:
# Test-set accuracy, confusion matrix and per-class report for the default random forest.
print('The accuracy score is:',accuracy_score(y_test,rfc_predictions))
print('The confusion matrix is:','\n',confusion_matrix(y_test,rfc_predictions))
print('The classification report is:','\n',classification_report(y_test,rfc_predictions))

In [None]:
#K-fold cross validation should be used to tune the model
from sklearn.model_selection import cross_val_score

# Candidate forest sizes and the mean 9-fold CV score obtained for each.
n = [1 ,5 ,10, 15, 20, 30, 50, 100, 150, 200, 500]
val_results = []

for i in n:
    # Use a separate name so the logistic-regression model bound to `clf`
    # earlier is not overwritten by a random forest. The seed is fixed so the
    # comparison between forest sizes is reproducible.
    rfc_cv = RandomForestClassifier(n_estimators=i, random_state=142)
    val_results.append(cross_val_score(rfc_cv, X_train, y_train,cv=9).mean())

plt.scatter(n, val_results)
plt.xlabel('n_estimators')
plt.ylabel('Mean 9-fold CV score')
plt.title('Random forest: CV score vs. number of trees')
print(val_results)
plt.show()

In [None]:
# Random forest with 600 trees, scored on the held-out test split.
# NOTE(review): 600 is not one of the sizes tried in the cross-validation
# loop (largest value there is 500) -- confirm the choice is intentional.
# random_state is fixed so the reported metrics are reproducible.
final_rfc = RandomForestClassifier(n_estimators=600, random_state=142)

final_rfc.fit(X_train, y_train)
final_predictions = final_rfc.predict(X_test)

In [None]:
# Test-set accuracy, confusion matrix and per-class report for the tuned random forest.
print('The accuracy score is:',accuracy_score(y_test,final_predictions))
print('The confusion matrix is:','\n',confusion_matrix(y_test,final_predictions))
print('The classification report is:','\n',classification_report(y_test,final_predictions))

In [None]:
#KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
### Tuning using K-fold Cross Validation

from sklearn.model_selection import cross_val_score

# Validation error (1 - mean 5-fold CV score) for each candidate k.
val_error_rate = []
# Candidate neighbour counts: 1, 6, 11, ..., 496.
neighbors_range = range(1,500,5)

for i in neighbors_range:
    
    knn = KNeighborsClassifier(n_neighbors=i)
    
    # cross_val_score uses the estimator's default score; the error is its complement.
    val_error = 1 - cross_val_score(knn, X_train, y_train,cv=5).mean()
    val_error_rate.append(val_error)

# Plot settings: validation error against k, one x tick per candidate value.
plt.figure(figsize=(15,7))
plt.plot(neighbors_range, val_error_rate, color='orange', linestyle='dashed', marker='o',
         markerfacecolor='black', markersize=5, label='Validation Error')
plt.xticks(np.arange(neighbors_range.start, neighbors_range.stop, neighbors_range.step), rotation=60)
plt.grid()
plt.legend()
plt.title('Validation Error vs. K Value')
plt.xlabel('K')
plt.ylabel('Validation Error')
plt.show()

In [None]:
# k with the lowest validation error (the first one if several tie).
best_k = neighbors_range[val_error_rate.index(min(val_error_rate))]
best_k

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Refit k-NN with the k that minimised the cross-validated error and score it
# on the held-out test split.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
knn_predictions = knn.predict(X_test)

print('The accuracy score is:',accuracy_score(y_test,knn_predictions))
print('The classification report is:','\n',classification_report(y_test,knn_predictions))

# Confusion matrix as a heatmap. The cells are integer counts, so they are
# annotated with 'd' rather than as two-decimal floats.
matrix=confusion_matrix(y_test,knn_predictions)
plt.figure(figsize = (5,4))
sns.heatmap(matrix, annot=True, fmt = 'd')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small random forest (7 trees) scored on the held-out test split. This
# rebinds final_rfc / final_predictions from the earlier 600-tree run.
# NOTE(review): 7 is not one of the sizes tried in the cross-validation loop
# -- confirm the choice is intentional.
# random_state is fixed so the reported metrics are reproducible.
final_rfc = RandomForestClassifier(n_estimators=7, random_state=142)

final_rfc.fit(X_train, y_train)
final_predictions = final_rfc.predict(X_test)

print('The accuracy score is:',accuracy_score(y_test,final_predictions))
print('The confusion matrix is:','\n',confusion_matrix(y_test,final_predictions))
print('The classification report is:','\n',classification_report(y_test,final_predictions))

In [None]:
# Number of reviews in each sentiment class. The original barplot of Rating
# vs. Sentiment showed the mean label per rating, which did not match the
# "Sentiment Type" / "Total Count" axis labels; countplot shows the counts.
# Author's observation: the classes are roughly the same size, i.e. the data
# set is reasonably balanced.
sns.countplot(data=df, x='Sentiment')
plt.xlabel("Sentiment Type");
plt.ylabel("Total Count");
plt.title("Total Postive ,Negative & Neutral Reviews");

In [None]:
# Bivariate histogram: how many reviews fall into each (Rating, Sentiment) cell.
sns.histplot(data=df, x='Rating', y='Sentiment')

In [None]:
#svm

In [None]:
from sklearn.svm import SVC

In [None]:
from sklearn.model_selection import cross_val_score

# Validation error (1 - mean 9-fold CV score) for each candidate C.
val_rate = []
# Candidate regularisation strengths: 1, 21, 41, ..., 181.
c_range =  range(1,200,20)

for i in c_range:
    
    svm = SVC(C=i, kernel='linear')
    
    # cross_val_score uses the estimator's default score; the error is its complement.
    val_error = 1 - cross_val_score(svm, X_train, y_train,cv=9).mean()
    val_rate.append(val_error)


# Plot settings: validation error against C, one x tick per candidate value.
plt.figure(figsize=(15,7))
plt.plot(c_range, val_rate, color='orange', linestyle='dashed', marker='o',
         markerfacecolor='black', markersize=5, label='Validation Error')

plt.xticks(np.arange(c_range.start, c_range.stop, c_range.step), rotation=60)
plt.grid()
plt.legend()
plt.title('Validation Error vs. C Value')
plt.xlabel('C')
plt.ylabel('Validation Error')
plt.show()

In [None]:
# Linear-kernel SVM with C=9, created and fitted on the training split in one step.
# NOTE(review): C=9 is not one of the values in the tuning grid (1, 21, 41, ...) -- confirm.
svm = SVC(kernel='linear',C=9).fit(X_train, y_train)

In [None]:
# The model was already fitted when it was created in the previous cell, so
# display it here instead of training it a second time. fit() returns the
# estimator itself, so the cell output is unchanged.
svm

In [None]:
# SVM predictions for the held-out test split.
pred = svm.predict(X_test)

In [None]:
# Test-set accuracy, confusion matrix and per-class report for the current SVM.
print('The accuracy score is:',accuracy_score(y_test, pred))
print('The confusion matrix is:','\n',confusion_matrix(y_test, pred))
print('The classification report is:','\n',classification_report(y_test, pred))

In [None]:
# Second linear-kernel SVM, this time with C=42, created and fitted in one step.
# NOTE(review): C=42 is not one of the values in the tuning grid (1, 21, 41, ...) -- confirm.
svm = SVC(kernel='linear',C=42).fit(X_train, y_train)

In [None]:
# The model was already fitted when it was created in the previous cell, so
# display it here instead of training it a second time. fit() returns the
# estimator itself, so the cell output is unchanged.
svm

In [None]:
# SVM predictions for the held-out test split (overwrites the earlier `pred`).
pred = svm.predict(X_test)

In [None]:
# Test-set accuracy, confusion matrix and per-class report for the current SVM.
print('The accuracy score is:',accuracy_score(y_test, pred))
print('The confusion matrix is:','\n',confusion_matrix(y_test, pred))
print('The classification report is:','\n',classification_report(y_test, pred))

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.feature_selection import VarianceThreshold

# Keep the k features most associated with the label (chi-squared test).
# The selector is fitted on the training split only and the SAME selected
# columns are then taken from the test split. Fitting a second selector on
# X_test / y_test would use the test labels and could pick different columns,
# so the model would be scored on features it was not trained on.
# k is capped at the number of available features.
selector = SelectKBest(chi2, k=min(45000, X_train.shape[1]))
X_new_train = selector.fit_transform(X_train, y_train)
X_new_test = selector.transform(X_test)

# Build Logistic Regression Model and check accuracy.
# A fresh estimator is created here so this cell does not depend on whatever
# `clf` was last bound to in earlier cells.
clf = LogisticRegression()
clf.fit(X_new_train, y_train)

new_predictions = clf.predict(X_new_test)

print('The accuracy score is:',accuracy_score(y_test,new_predictions))
print('The confusion matrix is:','\n',confusion_matrix(y_test,new_predictions))
print('The classification report is:','\n',classification_report(y_test,new_predictions))