<a href="https://colab.research.google.com/github/HarishRock0/DSGP/blob/child-protection-component/script/child_protection_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Child Protection NLP Development

In [1]:
# Colab-only setup: turn on the data_table formatter for pandas DataFrames
# shown in cell output.
from google.colab import data_table
data_table.enable_dataframe_formatter()

In [2]:
from google.colab import drive
drive.mount('/content/drive')

!pip install sentence-transformers openpyxl

import pandas as pd
from sentence_transformers import SentenceTransformer, util

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

Mounted at /content/drive




In [4]:
# Location of the child-cases Excel workbook on the mounted Google Drive.
file_path = '/content/drive/My Drive/DSGP/childprotection/childcases.xlsx'

In [3]:
def preprocess_district_data(df, district_column):
    """
    Clean the district-name column of a DataFrame.

    Rows whose district is missing are dropped first. The remaining names are
    cast to ``str``, upper-cased, stripped of every character other than
    ``A-Z`` and whitespace, and finally trimmed of leading/trailing whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``district_column``. It is not modified.
    district_column : str
        Name of the column holding the district names.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``df`` without the rows that had no district.
        The index labels of the surviving rows are kept as they were.

    Raises
    ------
    KeyError
        If ``district_column`` is not a column of ``df``.
    """
    # 1. Drop missing districts BEFORE casting to str: astype(str) would turn
    #    NaN into the literal text "nan", which dropna can no longer detect.
    #    .copy() keeps the caller's frame untouched by the assignment below.
    cleaned = df.dropna(subset=[district_column]).copy()

    # 2. Normalise case, then remove digits/punctuation with a vectorised
    #    regex replace (no per-row lambda and no dependency on the `re` module).
    names = cleaned[district_column].astype(str).str.upper()
    names = names.str.replace(r'[^A-Z\s]', '', regex=True)

    # 3. Trim last, so whitespace exposed by step 2 ("KANDY 1" -> "KANDY ")
    #    is removed together with the original padding.
    cleaned[district_column] = names.str.strip()

    return cleaned

In [5]:
def load_data_cleaned(cases_path, demo_path):
    """
    Load the cases and demographics workbooks and clean them.

    Both workbooks are read with ``pd.read_excel`` and their district columns
    (``'District'`` for cases, ``'DISTRICT_N'`` for demographics) are cleaned
    with ``preprocess_district_data``. Every other column of the cases table is
    then converted to numeric; values that cannot be parsed become 0.

    Parameters
    ----------
    cases_path : str or path-like
        Path of the cases Excel workbook.
    demo_path : str or path-like
        Path of the demographics Excel workbook.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(cases_df, demo_df)`` after cleaning.
    """
    cases_df = pd.read_excel(cases_path)
    demo_df = pd.read_excel(demo_path)

    # Apply preprocessing
    cases_df = preprocess_district_data(cases_df, 'District')
    demo_df = preprocess_district_data(demo_df, 'DISTRICT_N')

    # Work on an independent copy so the column assignment below never writes
    # into a view of another frame.
    cases_df = cases_df.copy()

    # Select the value columns by name rather than by position: a positional
    # slice (iloc[:, 1:]) would zero out 'District' whenever it is not the
    # first column of the sheet.
    value_columns = [col for col in cases_df.columns if col != 'District']
    if value_columns:
        # Unparseable or missing entries are coerced to NaN and then set to 0.
        cases_df[value_columns] = (
            cases_df[value_columns]
            .apply(pd.to_numeric, errors='coerce')
            .fillna(0)
        )

    return cases_df, demo_df

In [6]:
def load_and_clean_child_cases(file_path):
    """
    Read the child-cases workbook and keep only its data rows.

    The first two rows of the sheet are skipped. A row counts as data when its
    ``'Unnamed: 0'`` cell (the serial number) can be parsed as a number; all
    other rows are discarded.

    Parameters
    ----------
    file_path : str or path-like
        Path of the Excel workbook.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'District'`` and ``'average_child_cases'``, for the
        rows that carry a numeric serial number.
    """
    # The file is an Excel workbook, so it is read with read_excel
    # (read_csv fails on it with a 'utf-8' decoding error).
    sheet = pd.read_excel(file_path, skiprows=2)

    column_names = {'Unnamed: 1': 'District', 'Avg_cases': 'average_child_cases'}
    sheet = sheet.rename(columns=column_names)

    # Non-numeric serial numbers become NaN and mark the row as non-data.
    serial_numbers = pd.to_numeric(sheet['Unnamed: 0'], errors='coerce')
    is_data_row = serial_numbers.notna()

    return sheet.loc[is_data_row, ['District', 'average_child_cases']].copy()