In [None]:
import sys
!{sys.executable} -m pip install numpy pandas scikit-learn matplotlib seaborn
import sys
print(sys.executable)

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# SentenceTransformer is the class used further down to load the
# text-embedding model that encodes the clinical descriptions.
from sentence_transformers import SentenceTransformer
# Reaching this line means the import above did not raise.
print("✅ All libraries imported successfully!")

In [None]:
# Free-text gait descriptions grouped by condition. Each key is a condition
# name and each value is the list of sentences describing how that condition
# shows up during walking; "healthy" holds the reference descriptions.
clinical_knowledge = {
    "osteoarthritis": [
        "Osteoarthritis causes reduced stride length and decreased hip extension angle during walking",
        "Patients with hip osteoarthritis show painful gait with shortened stance phase",
        "Knee osteoarthritis results in reduced walking speed and increased step width for stability",
        "Joint pain in osteoarthritis leads to antalgic gait with reduced weight bearing on affected limb",
    ],
    "scoliosis": [
        "Scoliosis patients exhibit asymmetric trunk sway and uneven shoulder alignment",
        "Spinal curvature in scoliosis causes compensatory pelvic tilt during gait",
        "Lateral trunk lean and rotational movement characterize scoliotic walking patterns",
        "Vertebral rotation in scoliosis affects balance and postural control during ambulation",
    ],
    "hip_dysplasia": [
        "Hip dysplasia presents with Trendelenburg gait showing pelvic drop on affected side",
        "Developmental hip dysplasia causes characteristic waddling gait pattern",
        "Limited hip abduction and reduced range of motion affect walking symmetry",
        "Shallow acetabulum in hip dysplasia leads to joint instability and altered gait mechanics",
    ],
    "parkinsons": [
        "Parkinson's disease shows reduced arm swing and shuffling steps with small stride length",
        "Festinating gait and freezing episodes are characteristic of Parkinsonian movement",
        "Stooped posture, slow cadence, and difficulty initiating steps define parkinsonian gait",
        "Bradykinesia and rigidity in Parkinson's cause reduced stride variability and turning difficulty",
    ],
    "healthy": [
        "Normal gait shows symmetric stride length with regular cadence between 100-120 steps per minute",
        "Healthy walking demonstrates balanced arm swing and smooth heel-to-toe progression",
        "Normal gait has symmetrical stance and swing phases with adequate hip extension",
        "Optimal gait efficiency includes coordinated limb movements and stable trunk control",
    ],
}

# Summarise what was collected: the number of conditions, then one line per
# condition with how many descriptions it has.
print(f"Clinical knowledge collected for {len(clinical_knowledge)} conditions")
for disease in clinical_knowledge:
    texts = clinical_knowledge[disease]
    print(f"  - {disease}: {len(texts)} descriptions")

In [None]:
# Build the sentence-embedding model from its name. Status messages bracket
# the constructor call so the reader can tell when loading starts and ends.
print("Loading embedding model... (this may take a moment)")
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("✅ Model loaded successfully!")
# Printing the model object shows its textual representation.
print(f"Model: {model}")

In [None]:
print("\nGenerating clinical embeddings...")

# Maps each condition name to a single vector: the element-wise mean of the
# embeddings of all of that condition's descriptions.
disease_embeddings = {}

for disease, descriptions in clinical_knowledge.items():
    print(f"Processing {disease}...")

    # Encode every description of this condition in one call, then collapse
    # the per-description vectors into one by averaging along axis 0.
    embeddings = model.encode(descriptions, show_progress_bar=False)
    avg_embedding = np.mean(embeddings, axis=0)
    disease_embeddings[disease] = avg_embedding

# Labels and vectors are read back from the dict, so both follow the same
# insertion order and row i of the matrix belongs to disease_labels[i].
disease_labels = list(disease_embeddings.keys())
all_embeddings_list = list(disease_embeddings.values())

# Stack the per-condition vectors into a 2-D array, one row per condition.
embedding_matrix = np.array(all_embeddings_list)

print(f"\n✅ Embeddings generated!")
print(f"Embedding dimension: {embedding_matrix.shape[1]}")
print(f"Number of diseases: {embedding_matrix.shape[0]}")
print(f"Diseases: {disease_labels}")


In [None]:
print("\nReducing dimensions for visualization...")

# Project the per-condition embedding vectors onto their first two principal
# components so that each condition can be drawn as a point on a plane.
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embedding_matrix)

# Explicit Figure/Axes objects keep every drawing call tied to this figure
# instead of relying on pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize=(12, 8))
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']

for i, disease in enumerate(disease_labels):
    # Cycle through the palette: with more conditions than colours the
    # colours repeat, where plain colors[i] would raise IndexError.
    point_color = colors[i % len(colors)]
    ax.scatter(embeddings_2d[i, 0], embeddings_2d[i, 1],
               s=300, alpha=0.6, c=point_color, edgecolors='black', linewidth=2)
    # Centre the condition name on its marker.
    ax.annotate(disease,
                (embeddings_2d[i, 0], embeddings_2d[i, 1]),
                fontsize=14,
                fontweight='bold',
                ha='center',
                va='center')

ax.set_title('Clinical Knowledge Embeddings\n(2D PCA Projection)', fontsize=16, fontweight='bold')
# Axis labels report the share of variance explained by each component.
ax.set_xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)', fontsize=12)
ax.set_ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)', fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Combined variance retained by the two plotted components.
print(f"✅ PCA preserves {sum(pca.explained_variance_ratio_)*100:.1f}% of variance")


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

print("\nCalculating disease similarities...")

# Cosine similarity between every pair of rows of embedding_matrix. The
# result is wrapped in a DataFrame whose rows and columns carry the
# condition names, so both the printout and the heatmap are labelled.
similarity_matrix = cosine_similarity(embedding_matrix)
similarity_df = pd.DataFrame(data=similarity_matrix, index=disease_labels, columns=disease_labels)

print("\nClinical Knowledge Similarity Matrix:")
print(similarity_df.round(3))

# Heatmap appearance: values annotated to three decimals, a diverging
# red-yellow-green colormap centred on 0.5, and square, outlined cells.
heatmap_options = dict(
    annot=True,
    fmt='.3f',
    cmap='RdYlGn',
    center=0.5,
    square=True,
    linewidths=2,
    cbar_kws={"shrink": 0.8, "label": "Cosine Similarity"},
)

plt.figure(figsize=(10, 8))
sns.heatmap(similarity_df, **heatmap_options)
plt.title(
    'Disease Similarity Based on Clinical Descriptions\n(Higher values = more similar symptoms)',
    fontsize=14,
    fontweight='bold',
)
plt.tight_layout()
plt.show()



In [None]:
print("\n📊 Key Insights:")

# One record per unordered pair of distinct conditions. Only positions with
# j > i are visited, so the diagonal (self-similarity) and mirrored
# duplicates are left out.
similarity_scores = [
    {
        'disease_1': first_label,
        'disease_2': disease_labels[j],
        'similarity': similarity_matrix[i, j],
    }
    for i, first_label in enumerate(disease_labels)
    for j in range(i + 1, len(disease_labels))
]

# Rank the pairs from most to least similar.
pairs_unsorted = pd.DataFrame(similarity_scores)
similarity_scores_df = pairs_unsorted.sort_values('similarity', ascending=False)

print("\nMost Similar Disease Pairs:")
print(similarity_scores_df.head(3))

print("\nLeast Similar Disease Pairs:")
print(similarity_scores_df.tail(3))

In [None]:
print("\n💾 Saving embeddings...")

# Binary NumPy dump of the embedding matrix (one row per condition).
np.save('clinical_embeddings.npy', embedding_matrix)

# Condition names, one per line, in the same order as the matrix rows.
# The encoding is given explicitly so the file is written the same way on
# every platform instead of depending on the locale's default encoding.
with open('disease_labels.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{disease}\n" for disease in disease_labels)

# Human-readable copy: the same matrix as CSV, with the condition names as
# the row index.
embeddings_df = pd.DataFrame(
    embedding_matrix,
    index=disease_labels
)
embeddings_df.to_csv('clinical_embeddings.csv')

print("✅ Embeddings saved successfully!")
print("   - clinical_embeddings.npy (numpy format)")
print("   - disease_labels.txt (text labels)")
print("   - clinical_embeddings.csv (readable format)")

In [None]:
# Summary statistics taken over every element of the embedding matrix.
print("\n📈 Embedding Statistics:")
print(f"Mean embedding value: {embedding_matrix.mean():.4f}")
print(f"Std deviation: {embedding_matrix.std():.4f}")
print(f"Min value: {embedding_matrix.min():.4f}")
print(f"Max value: {embedding_matrix.max():.4f}")

# Histogram of all embedding values pooled across conditions; the matrix is
# flattened to one dimension so every element counts as one observation.
flat_values = embedding_matrix.flatten()
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(flat_values, bins=50, alpha=0.7, edgecolor='black')
ax.set_xlabel('Embedding Value', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.set_title('Distribution of Clinical Embedding Values', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

print("\n✅ Clinical embeddings generation complete!")