<a href="https://colab.research.google.com/github/Hatthaporn/Assignment/blob/main/Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **New tried**

In [20]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# --- 1. กำหนดชื่อไฟล์ ---
FILE_PATH = '/content/[CONFIDENTIAL] AI symptom picker data (Agnos candidate assignment).xlsx'
print(f"Attempting to load data from local file: {FILE_PATH}")

df_interactions = pd.DataFrame()
symptom_col_name = 'search_term'

try:
    df = pd.read_excel(FILE_PATH)
    print("Data loaded successfully.")

    if symptom_col_name in df.columns:
        df.rename(columns={symptom_col_name: 'Symptom_Combined'}, inplace=True)

        # --- NEW STEP: Symptom Tokenization ---
        # 1. แยกอาการย่อย (Tokenization) โดยใช้เครื่องหมายลูกน้ำ
        df_tokenized = df.assign(Symptom=df['Symptom_Combined'].str.split(',')).explode('Symptom')

        # 2. Trim whitespace from each token (e.g., " ไอ" -> "ไอ")
        df_tokenized['Symptom'] = df_tokenized['Symptom'].str.strip()

        # Generate 'Patient_ID'
        if 'Patient_ID' not in df_tokenized.columns:
            df_tokenized['Patient_ID'] = df_tokenized.index + 1

        # Filter for interaction data
        df_interactions = df_tokenized[['Patient_ID', 'Symptom']].dropna()

        # ... (Display results) ...
        print("\nTransformed Interaction Data Head (Tokenized Symptoms):")
        print(df_interactions.head())
        print(f"\nTotal unique patients: {df_interactions['Patient_ID'].nunique()}")
        print(f"Total unique symptoms (Tokens): {df_interactions['Symptom'].nunique()}") # Unique symptoms should decrease or become more granular

    else:
        print(f"\n[ERROR] Cannot find the specified symptom column: '{symptom_col_name}'. Model building is skipped.")

except Exception as e:
    print(f"\n--- FATAL ERROR ---")
    print(f"An unexpected error occurred during data loading or preprocessing: {e}")

# --- Step 3, 4, 5: Model Building and Testing ---
if not df_interactions.empty:

    # --- Step 3: Create the Patient-Symptom Matrix and Similarity Model ---
    patient_symptom_matrix = df_interactions.pivot_table(
        index='Patient_ID', columns='Symptom', aggfunc='size', fill_value=0
    )
    patient_symptom_matrix = (patient_symptom_matrix > 0).astype(int)
    symptom_patient_matrix = patient_symptom_matrix.T
    symptom_similarity_matrix = cosine_similarity(symptom_patient_matrix)
    symptom_similarity_df = pd.DataFrame(
        symptom_similarity_matrix, index=symptom_patient_matrix.index, columns=symptom_patient_matrix.index
    )
    print("\n--- Step 3: Symptom-Symptom Similarity Matrix Head (Tokenized) ---")
    print(symptom_similarity_df.iloc[:5, :5])

    # --- Step 4 & 5: Recommendation Function and Testing ---
    def recommend_symptoms(selected_symptom, top_n=5):
        if selected_symptom not in symptom_similarity_df.index: return []
        similarity_scores = symptom_similarity_df[selected_symptom]
        sorted_scores = similarity_scores.sort_values(ascending=False)
        recommended_symptoms = sorted_scores.index[1:top_n + 1].tolist()
        recommended_scores = sorted_scores.values[1:top_n + 1].tolist()
        return list(zip(recommended_symptoms, recommended_scores))

    # Test with the most frequent symptom
    test_symptom = df_interactions['Symptom'].value_counts().index[0]
    recommendations = recommend_symptoms(test_symptom, top_n=5)

    print(f"\n--- Step 5: Testing Recommendation System (Final Result) ---")
    print(f"Most frequent symptom used for test: '{test_symptom}'")
    print(f"\nSymptoms Recommended after selecting '{test_symptom}':")
    for symptom, score in recommendations:
        print(f"- {symptom} (Similarity Score: {score:.4f})")
else:
    print("\n[SKIP] Cannot proceed to build the model or test the function.")

Attempting to load data from local file: /content/[CONFIDENTIAL] AI symptom picker data (Agnos candidate assignment).xlsx
Data loaded successfully.

Transformed Interaction Data Head (Tokenized Symptoms):
   Patient_ID    Symptom
0           1    มีเสมหะ
0           1         ไอ
1           2         ไอ
1           2  น้ำมูกไหล
2           3    ปวดท้อง

Total unique patients: 1000
Total unique symptoms (Tokens): 269

--- Step 3: Symptom-Symptom Similarity Matrix Head (Tokenized) ---
Symptom                       Animal bite  Blurry vision  \
Symptom                                                    
                         1.0          0.0            0.0   
Animal bite              0.0          1.0            0.0   
Blurry vision            0.0          0.0            1.0   
Decreased stool caliber  0.0          0.0            0.0   
Dizzy                    0.0          0.0            0.0   

Symptom                  Decreased stool caliber  Dizzy  
Symptom                          

In [21]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# โหลดข้อมูล (สมมติว่า df_interactions และ symptom_similarity_df ถูกสร้างขึ้นสำเร็จแล้ว)

# --- Final Data Cleaning (Removing empty symptom strings) ---
# เนื่องจากเราพบว่ามี Symptom ที่เป็นช่องว่าง (' ') ถูกสร้างขึ้นจากการ Tokenization
df_interactions_clean = df_interactions[df_interactions['Symptom'].str.strip() != '']
df_interactions_clean = df_interactions_clean[df_interactions_clean['Symptom'].notna()]

print("Data Cleaning: Removed empty/whitespace symptoms.")
print(f"Total unique symptoms (After Clean): {df_interactions_clean['Symptom'].nunique()}")

# --- Re-build Model with Cleaned Data ---
patient_symptom_matrix_clean = df_interactions_clean.pivot_table(
    index='Patient_ID', columns='Symptom', aggfunc='size', fill_value=0
)
patient_symptom_matrix_clean = (patient_symptom_matrix_clean > 0).astype(int)
symptom_patient_matrix_clean = patient_symptom_matrix_clean.T
symptom_similarity_matrix_clean = cosine_similarity(symptom_patient_matrix_clean)
symptom_similarity_df_clean = pd.DataFrame(
    symptom_similarity_matrix_clean,
    index=symptom_patient_matrix_clean.index,
    columns=symptom_patient_matrix_clean.index
)

# --- Final Recommendation Function ---
def symptom_recommender_system(selected_symptom, top_n=5, similarity_df=symptom_similarity_df_clean):
    """
    Final function to recommend next possible symptoms using the Item-Based CF model.
    """
    if selected_symptom not in similarity_df.index:
        return f"Symptom '{selected_symptom}' not found in the model data.", []

    similarity_scores = similarity_df[selected_symptom]
    sorted_scores = similarity_scores.sort_values(ascending=False)

    # Filter out the input symptom itself (Similarity Score 1.0) and empty strings
    recommended_symptoms = [
        (symptom, score) for symptom, score in zip(sorted_scores.index, sorted_scores.values)
        if symptom != selected_symptom and str(symptom).strip() != ''
    ][:top_n]

    return "Success", recommended_symptoms

# --- Final Testing ---
test_symptom = df_interactions_clean['Symptom'].value_counts().index[0]
status, recommendations = symptom_recommender_system(test_symptom, top_n=5)

print(f"\n--- Final Model (Cleaned Data) ---")
print(f"Test Symptom: '{test_symptom}'")
print(f"Status: {status}")
print(f"\nTop 5 Recommended Symptoms:")
for symptom, score in recommendations:
    print(f"- {symptom} (Similarity Score: {score:.4f})")

# Test 2: Another common symptom, e.g., 'ปวดท้อง'
status_2, recommendations_2 = symptom_recommender_system('ปวดท้อง', top_n=3)
print(f"\nTest Symptom: 'ปวดท้อง'")
print(f"Status: {status_2}")
print(f"Top 3 Recommended Symptoms:")
for symptom, score in recommendations_2:
    print(f"- {symptom} (Similarity Score: {score:.4f})")

Data Cleaning: Removed empty/whitespace symptoms.
Total unique symptoms (After Clean): 268

--- Final Model (Cleaned Data) ---
Test Symptom: 'ไอ'
Status: Success

Top 5 Recommended Symptoms:
- มีเสมหะ (Similarity Score: 0.2609)
- น้ำมูกไหล (Similarity Score: 0.2296)
- มีเสมหะน้ำมูกไหล (Similarity Score: 0.1862)
- ไอกลางคืน (Similarity Score: 0.1437)
- น้ำมูกไหลมีเสมหะ (Similarity Score: 0.1267)

Test Symptom: 'ปวดท้อง'
Status: Success
Top 3 Recommended Symptoms:
- จุกแน่นท้อง (Similarity Score: 0.2582)
- คลื่นไส้อาเจียน (Similarity Score: 0.2020)
- อาเจียนคลื่นไส้ (Similarity Score: 0.2020)
