<a href="https://colab.research.google.com/github/Jiratpol-Techavutichai/Symptops_Recommendation/blob/main/Symptom_Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Symptom Recommendation Project

## Data Cleaning

In [1]:
import pandas as pd

# Import data
df = pd.read_excel('/content/drive/MyDrive/Data_set/Agnos.xlsx')

df.head()

Unnamed: 0,gender,age,summary,search_term
0,male,28,"{""diseases"": [], ""procedures"": [], ""no_symptom...","มีเสมหะ, ไอ"
1,male,27,"{""diseases"": [], ""procedures"": [], ""no_symptom...","ไอ, น้ำมูกไหล"
2,female,26,"{""diseases"": [], ""procedures"": [], ""no_symptom...",ปวดท้อง
3,male,42,"{""diseases"": [], ""procedures"": [], ""no_symptom...",น้ำมูกไหล
4,female,40,"{""diseases"": [], ""procedures"": [], ""no_symptom...",ตาแห้ง


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   gender       1000 non-null   object
 1   age          1000 non-null   int64 
 2   summary      1000 non-null   object
 3   search_term  1000 non-null   object
dtypes: int64(1), object(3)
memory usage: 31.4+ KB


In [3]:
df.isnull().sum()

Unnamed: 0,0
gender,0
age,0
summary,0
search_term,0


No missing and null values. Next, we will extract symptoms from the column called "summary" .

In [4]:
import json

# Extract symptoms from summary
def extract_yes_symptoms(summary_json):
    try:
        summary = json.loads(summary_json)
        symptoms = summary.get("yes_symptoms", [])
        return [s["text"].strip() for s in symptoms if "text" in s]
    except Exception as e:
        return []

df["symptoms_extracted"] = df["summary"].apply(extract_yes_symptoms)

# Clean up search_item column
df["search_symptoms"] = df["search_term"].fillna("").apply(
    lambda x: [s.strip() for s in str(x).split(",") if s.strip()]
)

# Filter unwanted symptoms
UNWANTED_SYMPTOMS = {"การรักษาก่อนหน้า", "ประวัติอุบัติเหตุ", "Previous treatment"}


# Combine both into
df["all_symptoms"] = df.apply(
    lambda row: [s for s in set(row["symptoms_extracted"] + row["search_symptoms"]) if s not in UNWANTED_SYMPTOMS],
    axis=1
)

# View cleaned data
#print(df[["gender", "age", "search_symptoms", "symptoms_extracted", "all_symptoms"]].head(10))
print(df[["all_symptoms"]].head(10))



                      all_symptoms
0             [ไอ, เสมหะ, มีเสมหะ]
1                  [ไอ, น้ำมูกไหล]
2                        [ปวดท้อง]
3                      [น้ำมูกไหล]
4                         [ตาแห้ง]
5                      [ปวดกระดูก]
6  [ไอ, คันจมูกจามบ่อย, น้ำมูกไหล]
7                        [ปวดท้อง]
8              [ไอ, เจ็บคอ, คันคอ]
9                             [ไอ]


Noticed minority English words, we will change some of them to Thai.

In [5]:
import re

# Function to detect if any word in the "all_symptoms" contains English letters
def contains_english(symptoms):
    return any(re.search(r'[a-zA-Z]', symptom) for symptom in symptoms)

# Create a new column to identify English containing rows
df["has_english"] = df["all_symptoms"].apply(contains_english)

# Display rows with English terms
english_rows = df[df["has_english"]]

print(len(df[df["has_english"]]))
print(english_rows[["gender", "age", "all_symptoms"]].head(10))


81
    gender  age                                       all_symptoms
13  female   37                                            [Fever]
18    male   34                                            [Fever]
34    male   67                                         [diarrhea]
50    male   68            [diarrhea, abdominal pain, stomachache]
52  female    1                                            [Fever]
60    male   56                             [straining to urinate]
87  female   30                                        [back pain]
92    male   59  [wheezing sound, Sore throat, nasal congestion...
93  female   23                                            [Dizzy]
95    male   33                   [coughLightheaded, Fever, cough]


In [6]:
from collections import Counter

english_symptoms = [
    symptom
    for symptoms in english_rows["all_symptoms"]
    for symptom in symptoms
    if re.search(r"[a-zA-Z]", symptom)
]

# Count frequency and list top 3
top_three = Counter(english_symptoms).most_common(3)

# Display
print(f"Top 3 English-containing symptoms:")
for symptom, count in top_three:
    print(f"{symptom}: {count} times")

Top 3 English-containing symptoms:
Fever: 20 times
cough: 12 times
Sore throat: 9 times


Found top three, including Fever, cough, and Sore throat

In [7]:
import pandas as pd
import re

def replace_english_with_thai(symptoms_list, thai_symptoms):
    replaced_symptoms = []
    for symptom in symptoms_list:
        for english_word, thai_word in thai_symptoms.items():
            symptom = re.sub(r'\b' + re.escape(english_word) + r'\b', thai_word, symptom)
        replaced_symptoms.append(symptom)
    return replaced_symptoms

# Use dictionary
thai_symptoms = {
    "Fever": "ไข้",
    "cough": "ไอ",
    "Sore throat": "เจ็บคอ",
    "sore throat": "เจ็บคอ"
}

# Function to place English words with Thai in 'all_symptoms'
df["all_symptoms_thai"] = df["all_symptoms"].apply(lambda x: replace_english_with_thai(x, thai_symptoms))

print(df[["all_symptoms_thai"]].head(15))


                                  all_symptoms_thai
0                              [ไอ, เสมหะ, มีเสมหะ]
1                                   [ไอ, น้ำมูกไหล]
2                                         [ปวดท้อง]
3                                       [น้ำมูกไหล]
4                                          [ตาแห้ง]
5                                       [ปวดกระดูก]
6                   [ไอ, คันจมูกจามบ่อย, น้ำมูกไหล]
7                                         [ปวดท้อง]
8                               [ไอ, เจ็บคอ, คันคอ]
9                                              [ไอ]
10                                        [อาเจียน]
11  [ปวดเมื่อยกล้ามเนื้อ, ปวดเมื่อยกล้ามเนื้อทั่วๆ]
12                   [เสมหะไหลลงคอ, เสมหะ, มีเสมหะ]
13                                            [ไข้]
14                             [คันจมูก, น้ำมูกไหล]


In [8]:
df.head()

Unnamed: 0,gender,age,summary,search_term,symptoms_extracted,search_symptoms,all_symptoms,has_english,all_symptoms_thai
0,male,28,"{""diseases"": [], ""procedures"": [], ""no_symptom...","มีเสมหะ, ไอ","[เสมหะ, ไอ, การรักษาก่อนหน้า]","[มีเสมหะ, ไอ]","[ไอ, เสมหะ, มีเสมหะ]",False,"[ไอ, เสมหะ, มีเสมหะ]"
1,male,27,"{""diseases"": [], ""procedures"": [], ""no_symptom...","ไอ, น้ำมูกไหล","[ไอ, น้ำมูกไหล, การรักษาก่อนหน้า]","[ไอ, น้ำมูกไหล]","[ไอ, น้ำมูกไหล]",False,"[ไอ, น้ำมูกไหล]"
2,female,26,"{""diseases"": [], ""procedures"": [], ""no_symptom...",ปวดท้อง,"[ปวดท้อง, การรักษาก่อนหน้า]",[ปวดท้อง],[ปวดท้อง],False,[ปวดท้อง]
3,male,42,"{""diseases"": [], ""procedures"": [], ""no_symptom...",น้ำมูกไหล,"[น้ำมูกไหล, การรักษาก่อนหน้า]",[น้ำมูกไหล],[น้ำมูกไหล],False,[น้ำมูกไหล]
4,female,40,"{""diseases"": [], ""procedures"": [], ""no_symptom...",ตาแห้ง,"[ตาแห้ง, การรักษาก่อนหน้า]",[ตาแห้ง],[ตาแห้ง],False,[ตาแห้ง]


## Builidng Model

Now, we are ready to proceed the training. Applying TF - IDF, common technique in Natural Language Processing , is perfectly suit to our problem.

In [9]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

# Use preprocessed Thai-only symptoms column
df["joined_symptoms"] = df["all_symptoms_thai"].apply(lambda x: " ".join(x) if isinstance(x, list) else "")

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["joined_symptoms"])

# Save TF-IDF matrix, vectorizer, and original data
with open("symptom_model.pkl", "wb") as f:
    pickle.dump({
        "vectorizer": vectorizer,
        "matrix": X,
        "data": df
    }, f)

## Predicting Result

In [10]:
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load model
with open("symptom_model.pkl", "rb") as f:
    model = pickle.load(f)

vectorizer = model["vectorizer"]
matrix = model["matrix"]
data = model["data"]

def recommend_symptoms(user_input, top_k=5):
    user_vec = vectorizer.transform([user_input])
    similarities = cosine_similarity(user_vec, matrix).flatten()
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    results = data.iloc[top_indices][["gender", "age", "all_symptoms_thai"]]
    results["similarity"] = similarities[top_indices]
    return results

# Example usage
if __name__ == "__main__":
    user_input = input("Enter symptoms (Thai, space-separated): ")
    recommendations = recommend_symptoms(user_input)
    print(recommendations)


Enter symptoms (Thai, space-separated): เจ็บคอ น้ำมูก
     gender  age    all_symptoms_thai  similarity
359  female   35  [เจ็บคอ, น้ำมูกไหล]    0.857009
817  female   45             [เจ็บคอ]    0.808661
25     male   34             [เจ็บคอ]    0.808661
32     male   65             [เจ็บคอ]    0.808661
26   female   34             [เจ็บคอ]    0.808661


## Implement FastAPI symptom recommender API using ngrok

In [12]:
!pip install fastapi uvicorn nest-asyncio pyngrok

Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.8-py3-none-any.whl.metadata (10 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Downloading fastapi-0.115.12-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading uvicorn-0.34.2-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.8-py3-none-any.whl (25 kB)
Downloading starlette-0.46.2-py3-none-any.whl (72 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uvicorn, pyngrok, st

In [13]:
!ngrok config add-authtoken 2x5InTeHiwUQCs7mBaiJgCjx4D5_55H25Kfg8U6frjHztvE1W

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [14]:
from fastapi import FastAPI
from pydantic import BaseModel
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from pyngrok import ngrok
import nest_asyncio
import uvicorn

# Load model
with open("symptom_model.pkl", "rb") as f:
    model = pickle.load(f)

vectorizer = model["vectorizer"]
matrix = model["matrix"]
data = model["data"]

# FastAPI app
app = FastAPI(title="Symptom Recommender API")

class SymptomRequest(BaseModel):
    symptoms: str
    top_k: int = 5

@app.get("/")
def home():
    return {"message": "Welcome to the Symptom Recommender API!"}

@app.post("/recommend")
def get_recommendations(req: SymptomRequest):
    user_vec = vectorizer.transform([req.symptoms])
    similarities = cosine_similarity(user_vec, matrix).flatten()
    top_indices = np.argsort(similarities)[-req.top_k:][::-1]
    results = data.iloc[top_indices].copy()
    results["similarity"] = similarities[top_indices]
    return results[["gender", "age", "all_symptoms_thai", "similarity"]].to_dict(orient="records")


In [15]:
# Apply nested async support
nest_asyncio.apply()

# Open tunnel on port 8000
public_url = ngrok.connect(8000)
print(f"🌐 Public URL: {public_url}")

# Run FastAPI app
uvicorn.run(app, host="0.0.0.0", port=8000)


🌐 Public URL: NgrokTunnel: "https://77ab-34-48-129-136.ngrok-free.app" -> "http://localhost:8000"


INFO:     Started server process [159]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     2001:fb1:15:3058:e4d8:76e7:3489:d14e:0 - "GET /docs HTTP/1.1" 200 OK
INFO:     2001:fb1:15:3058:e4d8:76e7:3489:d14e:0 - "GET /openapi.json HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [159]


Run the above cell to get the Public URL, then add /docs at the end to open Swagger UI, like https://77ab-34-48-129-136.ngrok-free.app/docs.


Then, input some sample at the \recommend, such as
{
  "symptoms": "ไข้ ปวดหัว",
  "top_k": 5
}