In [1]:

import pandas as pd

# Load the pathology report export into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs on a machine where that file exists -- consider a configurable data dir.
csv_path = r"C:\Users\isult\Dropbox\AI\Data\pathology_1000_reports.csv"
df = pd.read_csv(csv_path)

# List the columns the export provides.
print("Column names:")
print(list(df.columns))


Column names:
['MRN', 'Document_Number', 'DOCUMENT_TYPE', 'Entry_Date', 'Visit', 'AUTHOR_DICTATOR', 'VISIT_LOCATION', 'SERVICE', 'SIGNATURE_DATE_TIME', 'SIGNED_BY', 'Parent_Number', 'Parent_Type', 'HOSPITAL_LOCATION', 'AUTHOR_ID', 'AUTHOR_SERVICE', 'Note', 'Visit_Number']


In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Optional, Dict, Any

class DiagnosisMapper:
    """Map free-text descriptions to ICD-O codes via TF-IDF cosine similarity.

    Two independent TF-IDF models (unigrams + bigrams) are fitted when the
    mapper is built: one on ``icdo_df['morphology']`` and one on
    ``topography_df['term']``. A query is vectorized with the matching model,
    compared against every reference term, and the best-scoring row is
    returned if it clears the caller's threshold.
    """

    def __init__(self, icdo_df: pd.DataFrame, topography_df: pd.DataFrame):
        """
        Fit the morphology and topography vectorizers.

        Args:
            icdo_df: Reference table with columns 'icdo' (code) and
                'morphology' (term text).
            topography_df: Reference table with columns 'ICDO3' (code) and
                'term' (term text).
        """
        self.icdo_df = icdo_df
        self.topography_df = topography_df
        self.morph_vec = TfidfVectorizer(ngram_range=(1, 2))
        self.topo_vec = TfidfVectorizer(ngram_range=(1, 2))
        # Missing terms are vectorized as empty strings so that row i of each
        # matrix still lines up with positional row i of its DataFrame.
        self.morph_vectors = self.morph_vec.fit_transform(icdo_df['morphology'].fillna(''))
        self.topo_vectors = self.topo_vec.fit_transform(topography_df['term'].fillna(''))

    @staticmethod
    def _best_match(text: Any, vectorizer: Any, ref_vectors: Any, ref_df: pd.DataFrame,
                    code_col: str, term_col: str, threshold: float) -> Optional[Dict[str, Any]]:
        """Return the best-scoring row of ``ref_df`` for ``text``, or None."""
        # Missing values coming out of pandas (None, float NaN, pd.NA) are not
        # text; NaN in particular is truthy, so a plain ``not text`` check
        # would let it through to the vectorizer, which rejects it.
        if not isinstance(text, (str, bytes)) or not text.strip():
            return None
        query = vectorizer.transform([text])
        sims = cosine_similarity(query, ref_vectors).flatten()
        best = sims.argmax()
        score = float(sims[best])
        # A score of 0 means the query shares no vocabulary with any term; in
        # that case argmax just points at row 0, which is not a real match
        # even when the caller passes threshold <= 0.
        if score <= 0.0 or score < threshold:
            return None
        # Positional lookup: the matrix rows follow DataFrame order, not labels.
        row = ref_df.iloc[best]
        return {
            'code': row[code_col],
            'term': row[term_col],
            'similarity': score
        }

    def find_closest_diagnosis(self, desc: str, threshold: float = 0.3) -> Optional[Dict[str, Any]]:
        """
        Find the closest matching diagnosis in ICD-O morphology codes

        Args:
            desc: Description to match; empty, whitespace-only or missing
                (None/NaN) input yields None
            threshold: Minimum similarity score (default: 0.3)

        Returns:
            Dictionary with matched code, term and similarity score (a plain
            float), or None if no match at or above threshold
        """
        return self._best_match(desc, self.morph_vec, self.morph_vectors,
                                self.icdo_df, 'icdo', 'morphology', threshold)

    def find_closest_topography(self, site_description: str, threshold: float = 0.3) -> Optional[Dict[str, Any]]:
        """
        Find the closest matching site in ICD-O topography codes

        Args:
            site_description: Site description to match; empty,
                whitespace-only or missing (None/NaN) input yields None
            threshold: Minimum similarity score (default: 0.3)

        Returns:
            Dictionary with matched code, term and similarity score (a plain
            float), or None if no match at or above threshold
        """
        return self._best_match(site_description, self.topo_vec, self.topo_vectors,
                                self.topography_df, 'ICDO3', 'term', threshold)

# Example usage:
if __name__ == "__main__":
    # Toy reference tables, one (code, term) pair per row -- swap in the
    # real ICD-O morphology and topography lists.
    icdo_df = pd.DataFrame(
        [
            ('8140/3', 'Adenocarcinoma'),
            ('8500/3', 'Invasive ductal carcinoma'),
            ('8010/3', 'Carcinoma NOS'),
        ],
        columns=['icdo', 'morphology'],
    )
    topography_df = pd.DataFrame(
        [
            ('C50.9', 'Breast NOS'),
            ('C34.9', 'Lung NOS'),
            ('C18.9', 'Colon NOS'),
        ],
        columns=['ICDO3', 'term'],
    )

    # Build the mapper; both TF-IDF models are fitted here.
    mapper = DiagnosisMapper(icdo_df=icdo_df, topography_df=topography_df)

    # Morphology lookup demo.
    test_desc = "Invasive lobular carcinoma"
    result = mapper.find_closest_diagnosis(test_desc)
    print(f"Input: {test_desc}")
    print("Match:", result)

    # Topography lookup demo.
    test_site = "Left breast"
    result = mapper.find_closest_topography(test_site)
    print(f"\nInput: {test_site}")
    print("Match:", result)

# Expected output:
#   Input: Invasive lobular carcinoma
#   Match: {'code': '8500/3', 'term': 'Invasive ductal carcinoma', 'similarity': 0.5871534547302593}
#
#   Input: Left breast
#   Match: {'code': 'C50.9', 'term': 'Breast NOS', 'similarity': 0.6524908845125339}

Input: Invasive lobular carcinoma
Match: {'code': '8500/3', 'term': 'Invasive ductal carcinoma', 'similarity': 0.5871534547302593}

Input: Left breast
Match: {'code': 'C50.9', 'term': 'Breast NOS', 'similarity': 0.6524908845125339}
