<a href="https://colab.research.google.com/github/JNii90/ML-project/blob/main/Patient's_Condition_Classification_Using_Drug_Reviews_%7C_NLP_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd


In [None]:
# Load the raw tab-separated drug review file from the Colab filesystem
raw_tsv_path = '/content/drugsComTest_raw.tsv'
df = pd.read_csv(raw_tsv_path, sep='\t')

df.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8.0,"May 17, 2009",17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4


In [None]:
# (rows, columns) of the loaded frame
df.shape

(53766, 7)

In [None]:
# Number of reviews per condition; `condition` is used as the classification target further down.
# NOTE(review): the counts include an HTML fragment ("83</span> users found this comment helpful.")
# as a condition value, so this column likely needs cleaning before modelling - confirm.
df['condition'].value_counts()

Birth Control                                  9648
Depression                                     3095
Pain                                           2100
Anxiety                                        1908
Acne                                           1847
                                               ... 
Gender Dysphoria                                  1
83</span> users found this comment helpful.       1
Meningococcal Meningitis Prophylaxis              1
Tympanostomy Tube Placement Surgery               1
Strongyloidiasis                                  1
Name: condition, Length: 708, dtype: int64

In [None]:
# How many reviews were given each rating value
df['rating'].value_counts()

10.0    17016
9.0      9177
1.0      7299
8.0      6156
7.0      3091
5.0      2710
2.0      2334
3.0      2205
6.0      2119
4.0      1659
Name: rating, dtype: int64

In [None]:
# Derive a coarse sentiment label from the rating: 5 and above counts as
# 'positive', anything else (including a missing rating) as 'negative'.
df['label'] = [
    'positive' if rating >= 5 else 'negative'
    for rating in df['rating']
]

df.head()


Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount,label
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22,positive
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8.0,"May 17, 2009",17,positive
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3,positive
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35,positive
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4,positive


In [None]:
# Class balance of the derived positive/negative label
df['label'].value_counts()

positive    40269
negative    13497
Name: label, dtype: int64

In [None]:
# Find the 10 conditions with the highest mean rating and, for each of them,
# the drug with the highest mean rating; show the result as a table.

mean_rating_by_condition = df.groupby('condition')['rating'].mean()
df_top_10 = mean_rating_by_condition.sort_values(ascending=False).head(10).reset_index()

# For every selected condition, rank its drugs by mean rating and keep the first one.
drug_names = [
    df[df['condition'] == condition]
    .groupby('drugName')['rating']
    .mean()
    .sort_values(ascending=False)
    .index[0]
    for condition in df_top_10['condition']
]

df_top_10['drugName'] = drug_names

df_top_10


Unnamed: 0,condition,rating,drugName
0,Parkinsonism,10.0,Kemadrin
1,Glioblastoma Multiforme,10.0,Avastin
2,Gastrointestinal Hemorrhage,10.0,Omeprazole / sodium bicarbonate
3,Immunosuppression,10.0,Methylprednisolone
4,Mononucleosis,10.0,Acyclovir
5,Gender Dysphoria,10.0,Testosterone
6,Giant Cell Tumor of Bone,10.0,Xgeva
7,Rhinorrhea,10.0,Atrovent Nasal
8,B12 Nutritional Deficiency,10.0,Cyanocobalamin
9,Dermatophytosis,10.0,Griseofulvin


Step 1: Import the necessary libraries

In [None]:
import html

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Fetch the NLTK resources used below: the stopword list and the Punkt
# tokenizer models that word_tokenize depends on.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases (3.9 and later) load Punkt from the 'punkt_tab' package
# instead of the pickled 'punkt' one; without it word_tokenize raises
# LookupError on a fresh install.
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Step 2: Preprocess the dataset

In [None]:
# Preprocess text data
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stopword list as a frozenset, loading it only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text):
    """Normalise one raw review into a space-separated string of content words.

    The review is HTML-unescaped, lower-cased and tokenized with NLTK's
    ``word_tokenize``; tokens that are not purely alphabetic or that are
    English stopwords are discarded.

    Args:
        text: Raw review text. Non-string values (e.g. ``None`` or the float
            ``NaN`` of a missing cell) are treated as an empty review.

    Returns:
        The surviving tokens joined by single spaces; ``''`` when nothing
        survives or when the input is not a string.
    """
    # A missing cell reaches this function as a float NaN, which has no .lower().
    if not isinstance(text, str):
        return ''
    # The raw reviews carry HTML entities (e.g. "I&#039;ve"); decode them first
    # so entity fragments are not tokenized as if they were words.
    decoded = html.unescape(text)
    tokens = word_tokenize(decoded.lower())
    # Cached lookup: reloading the stopword list for every review is wasted work.
    stop_words = _english_stop_words()
    return ' '.join(word for word in tokens if word.isalpha() and word not in stop_words)

# Run every raw review through preprocess_text and keep the result in a new column
raw_reviews = df['review']
df['clean_review'] = raw_reviews.apply(preprocess_text)

In [None]:
# Drop rows with missing values (mutates df in place).
# NOTE(review): preprocess_text always returns a str (possibly empty), so
# clean_review should never be NaN here; in practice only a missing
# `condition` removes rows. Reviews that cleaned down to '' are kept -
# confirm that is intended.
df.dropna(subset=['clean_review', 'condition'], inplace=True)

In [None]:
# Preview the first rows, now including the clean_review column
df.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount,label,clean_review
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22,positive,tried antidepressants years citalopram fluoxet...
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8.0,"May 17, 2009",17,positive,son crohn disease done well asacol complaints ...
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3,positive,quick reduction symptoms
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35,positive,contrave combines drugs used alcohol smoking o...
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4,positive,birth control one cycle reading reviews type s...


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
features = df['clean_review']
targets = df['condition']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

Step 3: Extract features using TF-IDF vectorization

In [None]:
# Initialize TF-IDF vectorizer (default settings)
tfidf_vectorizer = TfidfVectorizer()

# Fit on the training reviews only and turn them into TF-IDF features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the held-out reviews with the vectorizer fitted above (no refitting)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


Step 4: Train a logistic regression classifier

In [None]:
# Initialize and train the logistic regression classifier.
# With the default max_iter=100 the solver stops at its iteration limit on
# this data before converging (scikit-learn warns about it), so give it a
# larger iteration budget.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_tfidf, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Step 5: Evaluate the classifier

In [None]:
# Predict the condition for every held-out review
y_pred = classifier.predict(X_test_tfidf)

# Accuracy of the predictions against the true held-out conditions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.6136512388966807


In [None]:
# Libraries needed for the random forest comparison
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Build a seeded 100-tree forest and fit it on the training TF-IDF features
# (fit returns the estimator itself, so the chained call keeps the fitted model)
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_tfidf, y_train)

# Predict the held-out reviews and score the predictions
y_pred_rf = rf_classifier.predict(X_test_tfidf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)
