<a href="https://colab.research.google.com/github/HarishRock0/DSGP/blob/child-protection-component/script/child_protection_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Child Protection NLP: District Ranking Development

In [21]:
# Switch on the Colab data_table formatter for DataFrame output in this session.
from google.colab import data_table
data_table.enable_dataframe_formatter()

In [22]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

!pip install sentence-transformers openpyxl

import pandas as pd
from sentence_transformers import SentenceTransformer, util

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
# Path of the child-cases workbook on the mounted Drive; the final cell passes
# it to load_and_clean_child_cases.
file_path = '/content/drive/My Drive/childcases.xlsx'

In [24]:
def preprocess_district_data(df, district_column):
    """
    Return a copy of ``df`` with cleaned district names and unusable rows removed.

    Cleaning steps applied to ``district_column``:
      1. Rows whose district is missing (NaN/None) are dropped *before* the
         values are converted to text, so they cannot turn into a literal
         "NAN" district.
      2. Names are upper-cased, every character other than A-Z and whitespace
         is removed, and surrounding whitespace is stripped afterwards (so
         "Colombo 1" becomes "COLOMBO", not "COLOMBO ").
      3. Rows whose name is empty after cleaning are dropped as well.

    Args:
        df: DataFrame containing the district column. It is not modified.
        district_column: Name of the column holding district names.

    Returns:
        A new DataFrame with the cleaned district column; the original index
        labels of the surviving rows are kept.

    Raises:
        KeyError: if ``district_column`` is not a column of ``df``.
    """
    # 1. Handle missing values first; astype(str) would otherwise hide them.
    present = df.dropna(subset=[district_column])

    # 2. Vectorised cleaning: upper-case, remove non-letters, then trim.
    names = (
        present[district_column]
        .astype(str)
        .str.upper()
        .str.replace(r'[^A-Z\s]', '', regex=True)
        .str.strip()
    )

    # 3. A name that cleaned down to nothing is as unusable as a missing one.
    keep = names != ''

    # Work on a copy so the caller's frame is never mutated.
    cleaned = present.loc[keep].copy()
    cleaned[district_column] = names[keep]

    return cleaned

In [25]:
def load_data_cleaned(cases_path, demo_path):
    """
    Load the cases and demographics workbooks and clean them.

    Both frames go through ``preprocess_district_data`` (district column
    'District' for cases, 'DISTRICT_N' for demographics). Every other column
    of the cases frame is then converted to numeric, with unparseable or
    missing values replaced by 0.

    Args:
        cases_path: Path of the Excel workbook with case counts.
        demo_path: Path of the Excel workbook with demographic data.

    Returns:
        Tuple ``(cases_df, demo_df)`` of cleaned DataFrames.
    """
    cases_df = pd.read_excel(cases_path)
    demo_df = pd.read_excel(demo_path)

    # Apply preprocessing
    cases_df = preprocess_district_data(cases_df, 'District')
    demo_df = preprocess_district_data(demo_df, 'DISTRICT_N')

    # Standardize numerical data (values that cannot be parsed become 0).
    # Columns are selected by name rather than position so the district column
    # is never coerced, and they are replaced via column assignment (not
    # .iloc[...] = ..., which writes in place and keeps an object dtype).
    value_columns = [col for col in cases_df.columns if col != 'District']
    if value_columns:
        numeric_values = (
            cases_df[value_columns]
            .apply(pd.to_numeric, errors='coerce')
            .fillna(0)
        )
        cases_df = cases_df.copy()  # avoid writing into a view of another frame
        cases_df[value_columns] = numeric_values

    return cases_df, demo_df

In [26]:
def load_and_clean_child_cases(file_path):
    """
    Read the child-cases workbook and return only its real data rows.

    The sheet is read with ``pd.read_excel`` (not ``read_csv``, which failed
    with a 'utf-8' error), skipping the first two rows. 'Unnamed: 1' is renamed
    to 'District' and 'Avg_cases' to 'average_child_cases'. Rows whose
    'Unnamed: 0' value (the S/No column) cannot be parsed as a number are
    treated as non-data rows and discarded.

    Args:
        file_path: Path of the Excel workbook.

    Returns:
        DataFrame with the columns 'District' and 'average_child_cases'.
    """
    column_aliases = {'Unnamed: 1': 'District', 'Avg_cases': 'average_child_cases'}
    sheet = pd.read_excel(file_path, skiprows=2).rename(columns=column_aliases)

    # A numeric serial number marks a genuine data row.
    serial_numbers = pd.to_numeric(sheet['Unnamed: 0'], errors='coerce')
    data_rows = sheet.loc[serial_numbers.notna(), ['District', 'average_child_cases']]

    return data_rows.copy()

In [27]:
# Shared sentence-embedding model; get_district_ranking reads this module-level
# name to encode the user query and the template queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [28]:
def _rank_by_cases(ascending, limit):
    """Build an action returning the first `limit` rows sorted by average_child_cases."""
    def _action(df):
        return df.sort_values('average_child_cases', ascending=ascending).head(limit)
    return _action


def _closest_to_mean(df):
    """Return the 10 rows whose average_child_cases lies nearest the column mean."""
    distance = (df['average_child_cases'] - df['average_child_cases'].mean()).abs()
    return df.iloc[distance.argsort()[:10]]


# Intent templates: each entry pairs a reference sentence ('query') with the
# DataFrame operation ('action') to run when that entry is selected.
templates = {
    'high_cases': {
        'query': 'districts with the highest reported child protection cases and extreme risk',
        'action': _rank_by_cases(ascending=False, limit=10),
    },
    'low_cases': {
        'query': 'safest districts with the lowest number of child cases and highest safety',
        'action': _rank_by_cases(ascending=True, limit=10),
    },
    'moderate_risk': {
        'query': 'districts with medium or average levels of child protection cases',
        'action': _closest_to_mean,
    },
    'top_5_critical': {
        'query': 'top 5 most dangerous or critical districts needing urgent attention',
        'action': _rank_by_cases(ascending=False, limit=5),
    },
}

In [29]:
def get_district_ranking(user_query, df):
    """
    Pick the template closest to ``user_query`` and run its action on ``df``.

    The query and every template's 'query' sentence are encoded with the
    module-level ``model``; the template with the highest cosine similarity to
    the query is selected. All similarity scores and the selected template key
    are printed for debugging.

    Args:
        user_query: Free-text request typed by the user.
        df: DataFrame handed to the selected template's action.

    Returns:
        Whatever the selected template's 'action' returns for a copy of ``df``.
    """
    labels = list(templates)
    prompts = [templates[label]['query'] for label in labels]

    # Embed the query first, then all template sentences in one batch.
    query_vector = model.encode(user_query)
    prompt_vectors = model.encode(prompts)

    # Row 0 holds the similarity of the single query against each template.
    scores = util.cos_sim(query_vector, prompt_vectors)[0]

    # --- DEBUG SECTION ---
    print("\n--- NLP Debugger ---")
    for label, score in zip(labels, scores):
        print(f"Template: {label} | Score: {score:.4f}")
    # ---------------------

    winner = labels[scores.argmax()]
    print(f"MATCHED TO: {winner}")

    # The action receives a copy, so the caller's frame is never modified.
    chosen_action = templates[winner]['action']
    return chosen_action(df.copy())

In [None]:
# Interactive test session for the NLP district ranking.
#
# Loading and the query loop are guarded separately so that the file/column
# hint is only printed when loading the workbook actually failed.
try:
    # Ensuring the data is cleaned before the loop starts
    df_processed = load_and_clean_child_cases(file_path)

    # Standardizing district names to uppercase for consistent NLP identification
    df_processed['District'] = df_processed['District'].str.strip().str.upper()
except Exception as e:
    print(f"CRITICAL ERROR: {e}")
    print("Ensure 'childcases.xlsx' is in the correct path and columns match.")
else:
    print("Data loaded and preprocessed successfully.")
    print("NLP Ranking System is ready.")

    try:
        while True:
            print("\n" + "="*50)
            print("NLP DISTRICT RANKING SYSTEM")
            print("Example: 'Which districts have the highest cases?'")
            print("Type 'exit' to quit.")

            user_preference = input("\n[User Preference]: ").strip()

            if user_preference.lower() in ['exit', 'quit']:
                print("Stopping NLP testing session.")
                break

            # A blank line carries no intent; embedding it would still pick
            # some template arbitrarily, so ask again instead.
            if not user_preference:
                print("Please enter a query, or type 'exit' to quit.")
                continue

            # This calls the SentenceTransformer logic defined in your script
            top_ranked_districts = get_district_ranking(user_preference, df_processed)

            # Display: the rows returned by the matched template (10, or 5
            # for 'top_5_critical')
            print(f"\nModel identified ranking for: '{user_preference}'")
            print(top_ranked_districts.to_markdown(index=False))
    except Exception as e:
        # Ranking/rendering failure: report it and end the session, without
        # the misleading workbook hint.
        print(f"CRITICAL ERROR: {e}")

Data loaded and preprocessed successfully.
NLP Ranking System is ready.

NLP DISTRICT RANKING SYSTEM
Example: 'Which districts have the highest cases?'
Type 'exit' to quit.

[User Preference]: lowest

--- NLP Debugger ---
Template: high_cases | Score: 0.1489
Template: low_cases | Score: 0.2570
Template: moderate_risk | Score: 0.1326
Template: top_5_critical | Score: 0.1397
MATCHED TO: low_cases

Model identified ranking for: 'lowest'
| District     |   average_child_cases |
|:-------------|----------------------:|
| MANNAR       |               54.0667 |
| KILINOCHCHI  |               82.9333 |
| MULLAITIVU   |               89.4667 |
| VAVUNIYA     |               94.3333 |
| TRINCOMALEE  |              127.933  |
| BATTICALOA   |              132.6    |
| JAFFNA       |              132.8    |
| NUWARA ELIYA |              176.333  |
| MATALE       |              181.933  |
| AMPARA       |              196.4    |

NLP DISTRICT RANKING SYSTEM
Example: 'Which districts have the highes