In [2]:
!pip install pandas nltk

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (first-time use)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the CSV
df = pd.read_csv(r"C:\Users\path")

# Keep only relevant columns
df = df[['conversations', 'disease']].dropna()

# Optional: display a few rows
df.head()





[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
[nltk_data] Downloading package punkt to C:\Users\haraz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\haraz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\haraz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,conversations,disease
0,User: I’ve been sneezing a lot today and my no...,allergy
1,User: I’ve developed a rash after eating some ...,allergy
2,"User: My eyes are swollen and itchy, and I can...",allergy
3,User: I’ve been getting headaches and a stuffy...,allergy
4,"User: Every time I eat nuts, my mouth itches. ...",allergy


In [4]:
# Clean HTML separators and text formatting issues
def clean_text(text):
    text = re.sub(r'</s>', ' ', text)         # remove </s>
    text = re.sub(r'[^\w\s]', '', text)       # remove punctuation
    text = text.lower()
    return text

df['cleaned'] = df['conversations'].apply(clean_text)




In [5]:
!pip install transformers
!pip install torch







[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load model and tokenizer
model_name = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Create NER pipeline
nlp_ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Function to extract entities
def extract_medical_entities(text):
    try:
        results = nlp_ner(text)
        return [entity['word'] for entity in results]
    except:
        return []

# Apply to your cleaned text column
df['entities'] = df['cleaned'].apply(extract_medical_entities)

# View some results
df[['cleaned', 'entities']].head()



Unnamed: 0,cleaned,entities
0,user ive been sneezing a lot today and my nose...,"[s, ##neezing, nose, cong, ##ested, bot, all, ..."
1,user ive developed a rash after eating some st...,"[rash, eating, strawberries, bot, allergic rea..."
2,user my eyes are swollen and itchy and i cant ...,"[eyes, swollen, it, ##chy, s, ##neezing, bot, ..."
3,user ive been getting headaches and a stuffy n...,"[headaches, stuffy nose, a few days, bot, alle..."
4,user every time i eat nuts my mouth itches b...,"[every, time, eat, nuts, mouth, it, ##ches, bo..."


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=2, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned'])

# LDA
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda_topics = lda.fit_transform(tfidf_matrix)

# Show LDA topics
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx + 1}: ", " | ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

display_topics(lda, tfidf_vectorizer.get_feature_names_out(), 10)


Topic 1:  veins | varicose | fever | malaria | yes | long | chills | swollen | legs | does
Topic 2:  pain | help | chest | doctor | ill | swelling | foods | stomach | eating | try
Topic 3:  asthma | rash | eyes | using | acne | fever | joints | yes | nosebleeds | nose
Topic 4:  typhoid | blood | sugar | diarrhea | diabetes | stomach | ive | pressure | feeling | doctor
Topic 5:  itchy | urine | blisters | patches | skin | yes | fever | psoriasis | theyre | yellow


In [8]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

kmeans = KMeans(n_clusters=5, random_state=42)
df['cluster'] = kmeans.fit_predict(tfidf_matrix)

# Evaluate with silhouette score
sil_score = silhouette_score(tfidf_matrix, df['cluster'])
print("Silhouette Score:", sil_score)


[WinError 2] The system cannot find the file specified
  File "c:\Users\haraz\anaconda3\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\haraz\anaconda3\lib\subprocess.py", line 505, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\haraz\anaconda3\lib\subprocess.py", line 951, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\haraz\anaconda3\lib\subprocess.py", line 1420, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


Silhouette Score: 0.040066299207263935


In [9]:
!pip install mlxtend





[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Remove empty lists
transactions = df['entities'].dropna().apply(lambda x: list(set(x)))  # remove duplicates

# One-hot encoding of transactions
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
symptom_df = pd.DataFrame(te_ary, columns=te.columns_)

# Apriori algorithm
frequent_itemsets = apriori(symptom_df, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]


Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(##chy),(bot),0.070833,1.000000,1.054945
1,(bot),(##chy),0.070833,0.074725,1.054945
2,(##chy),(it),0.070833,1.000000,4.637681
3,(it),(##chy),0.070833,0.328502,4.637681
4,(##chy),(yes),0.061458,0.867647,1.301471
...,...,...,...,...,...
791,"(worse, yes)","(user, bot)",0.055208,0.469027,1.786768
792,(user),"(worse, bot, yes)",0.055208,0.207031,1.774554
793,(bot),"(user, worse, yes)",0.055208,0.058242,1.054945
794,(worse),"(user, bot, yes)",0.055208,0.331250,1.528846


In [11]:
!pip install vaderSentiment





[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

vader = SentimentIntensityAnalyzer()
df['sentiment_score'] = df['cleaned'].apply(lambda x: vader.polarity_scores(x)['compound'])


In [13]:
from transformers import pipeline

emotion_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)

# Apply to a sample (you can batch this for speed)
df['emotions'] = df['cleaned'].apply(lambda x: emotion_classifier(x))


In [41]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize

# Tokenize text
df['tokens'] = df['cleaned'].apply(word_tokenize)
dictionary = Dictionary(df['tokens'])
corpus = [dictionary.doc2bow(text) for text in df['tokens']]

# Convert sklearn LDA to gensim
import gensim.models.ldamodel as gldamodel
gensim_lda = gldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=5,
    random_state=42
)

coherence_model = CoherenceModel(model=gensim_lda, texts=df['tokens'], dictionary=dictionary, coherence='c_v')
print("LDA Coherence Score:", coherence_model.get_coherence())


LDA Coherence Score: 0.32300021680170954


In [42]:
print("Silhouette Score for Clustering:", silhouette_score(tfidf_matrix, df['cluster']))


Silhouette Score for Clustering: 0.040066299207263935
