In [2]:
import numpy as np
import pandas as pd
import io
import re
import os
import streamlit as st

In [3]:
# One raw CSV per calendar month, listed in calendar order.
file_list = [
    f"{month}.csv"
    for month in (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
]

# Folder that holds the raw monthly CSVs.
input_dir = "uncleaned_csvs"

# Target schema (names and order) that every cleaned monthly frame is reduced to.
standard_columns = (
    "sl_no heads_of_crime major_head minor_head "
    "current_year_upto prev_year_same_month prev_month current_month"
).split()

def clean_df(df, columns=None):
    """Normalise the headers of one raw monthly frame and project it onto a fixed schema.

    Steps:
      1. Drop columns whose header starts with ``Unnamed`` or is blank.
      2. Normalise each remaining header (lower-case, dots removed, every run
         of whitespace / hyphens / underscores collapsed to a single ``_``)
         and map it to a standard name via substring checks.
      3. Add any standard column that is still missing, filled with ``pd.NA``.
      4. Return only the standard columns, in schema order.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as read from a monthly CSV.
    columns : sequence of str, optional
        Target schema. Defaults to the module-level ``standard_columns``.

    Returns
    -------
    pandas.DataFrame
        A new frame containing exactly the target columns.
    """
    target_columns = list(standard_columns if columns is None else columns)

    # Boolean mask instead of the .str accessor so non-string labels
    # (e.g. integer headers) do not raise.
    keep = [re.search(r"^Unnamed|^\s*$", str(label)) is None for label in df.columns]
    df = df.loc[:, keep]

    col_map = {}
    for col in df.columns:
        c = str(col).strip().lower().replace('.', '')
        # Collapse *runs* of separators (spaces, tabs, newlines, hyphens,
        # underscores). Replacing only one double space left headers such as
        # "previous   month" as "previous__month", which then matched nothing.
        c = re.sub(r'[\s_\-]+', '_', c)
        # Order matters: the first matching rule wins.
        if 'sl' in c and 'no' in c:
            c = 'sl_no'
        elif 'heads' in c and 'crime' in c:
            c = 'heads_of_crime'
        elif 'major' in c and 'head' in c:
            c = 'major_head'
        elif 'minor' in c and 'head' in c:
            c = 'minor_head'
        elif 'current_year' in c or 'month_under_review' in c:
            c = 'current_year_upto'
        elif 'corresponding_month' in c:
            c = 'prev_year_same_month'
        elif 'previous_month' in c and 'corresponding' not in c:
            c = 'prev_month'
        elif 'current_month' in c:
            c = 'current_month'
        col_map[col] = c
    df = df.rename(columns=col_map)

    # Guarantee the full schema even when a source file lacks a column.
    for col in target_columns:
        if col not in df.columns:
            df[col] = pd.NA

    # Explicit copy so callers can add columns without chained-assignment warnings.
    return df[target_columns].copy()

# Destination folder for the per-month cleaned CSVs.
output_dir = "cleaned_csvs"
# exist_ok=True makes this cell safe to re-run when the folder already exists.
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Read every monthly CSV, clean it, tag it with its month, write the cleaned
# copy to output_dir, and collect the frames for one combined export.
dfs = []
failed_files = []  # files that could not be processed, reported at the end
for fname in file_list:
    file_path = os.path.join(input_dir, fname)
    try:
        try:
            df = pd.read_csv(file_path, encoding="utf-8")
        except UnicodeDecodeError:
            # Fall back to latin1 when the file is not valid UTF-8.
            df = pd.read_csv(file_path, encoding="latin1")
        month_name = os.path.splitext(os.path.basename(fname))[0]
        # assign() returns a new frame rather than mutating clean_df's result.
        df_clean = clean_df(df).assign(month=month_name)
        dfs.append(df_clean)
        out_path = os.path.join(output_dir, fname)
        df_clean.to_csv(out_path, index=False)
    except Exception as e:
        # Best effort: keep going with the remaining months, but remember the failure.
        failed_files.append(fname)
        print(f"Error processing {fname}: {e}")

if not dfs:
    # pd.concat([]) would raise an opaque "No objects to concatenate".
    raise ValueError(
        f"None of the {len(file_list)} input files in '{input_dir}/' could be processed"
    )

final_df = pd.concat(dfs, ignore_index=True)
final_df.to_csv("all_months_concatenated.csv", index=False)

print(f"Cleaned individual files saved in '{output_dir}/'")
print("Concatenated file saved as 'all_months_concatenated.csv'")
if failed_files:
    print(f"WARNING: {len(failed_files)} file(s) had errors: {', '.join(failed_files)}")

Cleaned individual files saved in 'cleaned_csvs/'
Concatenated file saved as 'all_months_concatenated.csv'


In [5]:
# Reload the combined export from disk so the analysis below starts from the saved CSV.
crime_df = pd.read_csv('all_months_concatenated.csv')

In [6]:
# Column dtypes and non-null counts of the freshly loaded frame.
crime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8467 entries, 0 to 8466
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   sl_no                 8466 non-null   float64
 1   heads_of_crime        7731 non-null   object 
 2   major_head            8464 non-null   object 
 3   minor_head            7651 non-null   object 
 4   current_year_upto     8433 non-null   float64
 5   prev_year_same_month  8433 non-null   float64
 6   prev_month            8433 non-null   float64
 7   current_month         8432 non-null   float64
 8   month                 8467 non-null   object 
dtypes: float64(5), object(4)
memory usage: 595.5+ KB


In [7]:
# Remove the sl_no column. Reassigning (instead of inplace=True) together
# with errors='ignore' keeps this cell idempotent: re-running it no longer
# raises KeyError once the column is gone.
crime_df = crime_df.drop(columns='sl_no', errors='ignore')

In [8]:
# drop_duplicates() returns a new frame; without the assignment the call has
# no effect on crime_df. Re-run the notebook to refresh the recorded row counts.
crime_df = crime_df.drop_duplicates()
crime_df.shape

(8467, 8)

In [9]:
# Discard rows where all four count columns are exactly 0.
# Rows with a missing count are kept, because NaN == 0 is False.
crime_df = crime_df.loc[
    ~crime_df[
        ['current_month', 'prev_month', 'current_year_upto', 'prev_year_same_month']
    ].eq(0).all(axis=1)
]

In [10]:
# Shape after the all-zero rows were filtered out.
crime_df.shape

(5260, 8)

In [11]:
cols = ['current_month', 'prev_month', 'current_year_upto','prev_year_same_month']
# crime_df may be the result of a boolean filter; work on an explicit copy so
# the column assignment below is not a chained assignment on a slice.
crime_df = crime_df.copy()
# Treat empty strings as missing, then drop rows where all four counts are missing.
crime_df[cols] = crime_df[cols].replace('', pd.NA)
# Reassign instead of inplace=True so the cell produces a fresh frame each run.
crime_df = crime_df.dropna(subset=cols, how='all')

In [12]:
# Shape after rows with no counts at all were dropped.
crime_df.shape

(5226, 8)

In [13]:
# NOTE(review): this binds a second name to the *same* DataFrame object; it is
# not a copy, so in-place changes to crime_df are visible through df_temp too.
# If a snapshot was intended, crime_df.copy() is needed - confirm intent.
df_temp = crime_df

In [14]:
def create_generalized_motive_mapping():
    """Return the regex -> motive-category lookup used to classify crime heads.

    Keys are regular-expression alternations written in lower case; values are
    category labels. The dict keeps insertion order, so a caller that scans it
    from the top and stops at the first hit (as ``map_motive`` does) gives
    earlier entries priority over later ones. For example ``husband|wife``
    appears under ``intimate_partner_sexual`` before ``relationship``, so the
    earlier entry wins for those words.

    Returns
    -------
    dict[str, str]
        Ordered mapping of pattern -> category.
    """
    return {
        # Sexual motives
        r'rape|sexual assault|molestation|gang rape|incest|lewd|unnatural sex': 'sexual_violence',
        r'sexual jealousy|libido|lust|perversion|pornography': 'sexual_deviance',
        r'minor|child|pedophil|underage|teen|pocso': 'child_sexual_abuse',
        r'voyeurism|stalking|eve-teasing|harassment|indecent|posh|modesty of women': 'sexual_harassment',
        r'marital rape|spousal|husband|wife|partner.*sexual': 'intimate_partner_sexual',
        r'stranger.*sexual|unknown.*sexual|public.*sexual': 'predatory_sexual',

        # Financial motives
        r'gain|profit|money|theft|robbery|extortion|blackmail|bribe|dacoity|burglary|counterfeiting|forgery|breach|post & telegraph': 'financial',
        r'dowry|marriage|wedding|bride': 'dowry',
        r'property|land|house|real estate|estate dispute': 'property_dispute',

        # Relationship motives
        r'adultery|affair|cheating|infidelity|swinging': 'relationship',
        r'personal vendetta|enem|grudge|rivalry|beef': 'personal_vendetta',
        r'love|affection|marital|husband|wife|partner': 'relationship',

        # Social/cultural motives
        r'communal|religion|sectarian|blasphemy|riot|scheduled|sc & st': 'communal',
        r'caste|tribe|ethnic|racial': 'caste_ethnic',
        r'witch|sorcery|black magic|superstition|occult': 'superstition',
        r'human sacrifice|ritual|offering': 'superstition',
        # This entry previously had no value, so Python concatenated it with
        # the next string literal into a single key
        # ('...traffickingpsychopath|...'), which sent "bonded labour" and
        # "slave" to 'mental_illness' and made "psychopath" unmatchable.
        # NOTE(review): the intended label was never written; confirm
        # 'human_trafficking' is the desired category name.
        r'bonded labour|slave|trafficking': 'human_trafficking',

        # Psychological motives
        r'psychopath|serial killer|lunacy|mental|insanity|suicide': 'mental_illness',
        r'anger|quarrel|argument|fight|road rage': 'impulsive',

        # Political motives
        r'political|election|electoral|party': 'political',
        r'terrorism|naxal|extremism|insurgency|separatist|arms|national security|defence|false evidence|official secrets': 'national_safety',

        # System-related
        r'custodial|police|jail|prison|remand': 'custodial',
        r'gang|mafia|organized crime|syndicate|narcotic': 'gang_related',
        r'wrongful|defamation|copyright|consumer|animal|immoral|passport|railways|electricity|trespass|impersonation|copy right|forest|adulteration|cinematography|public justice':'civil_crimes',
        r'civil|negligence|negligent|nuisance|mischief|public safety|explosive|affray|representative of people|cotpa|disobedience|failure to appear to court|escape|aadhaar|foreigner|public servant': 'civil_crimes',

        # State local acts
        r'karnataka':'karnataka_crimes',

        # Default categories
        r'revenge|vengeance|retaliation': 'revenge',
        r'professional|contract killing|hitman': 'professional',
        r'civil dispute|neighbor|village': 'community_dispute',
        r'homicide|intimidation|arson|attempt to murder|kidnapping|trafficking|assault|acid|hurt|criminal|murder|infanticide|juvenile justice act|exposure & abandonment|attempting to commit offences|unlawful activities|abetment|guardianship|probation of offenders':'criminal_acts',
        r'motor|road accident|highway':'vehicle_accidents',
        r'miscellaneous|miscarriage|giving false information|ancient monument': 'misc',
        r'cyber|malware|ransomware|online':'cyber_crime'
    }

In [15]:
def map_motive(text, motive_map):
    """Return the category of the first pattern in ``motive_map`` that matches ``text``.

    Parameters
    ----------
    text : object
        Value to classify; it is converted with ``str()`` and lower-cased
        before matching. Missing values (``None``, ``NaN``, ``pd.NA``) are
        never matched.
    motive_map : dict[str, str]
        Regex pattern -> category. Patterns are tried in the dict's iteration
        order and the first ``re.search`` hit wins.

    Returns
    -------
    str
        The matching category, or ``'other'`` when nothing matches or the
        value is missing.
    """
    # Without this guard a missing cell would be stringified to "nan" / "none" /
    # "<na>" and could be classified by any pattern that contains those letters.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return 'other'

    text = str(text).lower()
    for pattern, category in motive_map.items():
        if re.search(pattern, text):
            return category
    return 'other'

In [16]:
def generalize_motive_mapping(df):
    """Add a ``motive_category`` column derived from ``major_head`` and ``minor_head``.

    Both columns are classified with ``map_motive``. The ``minor_head`` result
    is used unless it is ``'other'``, in which case the ``major_head`` result
    is used instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``major_head`` and ``minor_head`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra ``motive_category`` column. An empty
        input yields an empty copy that still carries the column, so callers
        can index ``motive_category`` unconditionally.
    """
    df = df.copy()
    if df.empty:
        # Keep the output schema stable even when there is nothing to classify.
        df['motive_category'] = pd.Series(dtype='object')
        return df

    motive_map = create_generalized_motive_mapping()
    major_category = df['major_head'].apply(lambda value: map_motive(value, motive_map))
    minor_category = df['minor_head'].apply(lambda value: map_motive(value, motive_map))

    # Prefer the minor_head classification; fall back to major_head when the
    # minor head was not recognised.
    df['motive_category'] = minor_category.where(minor_category != 'other', major_category)

    return df

In [17]:
# Shape of the frame that is about to be classified.
crime_df.shape

(5226, 8)

In [18]:
# Classify every row; generalize_motive_mapping works on a copy, so crime_df is left untouched.
df_with_motives = generalize_motive_mapping(crime_df)
# One column more than crime_df is expected here (motive_category).
df_with_motives.shape

(5226, 9)

In [19]:
# Keep only the rows that were assigned a real category, then export them.
df_final = df_with_motives.loc[df_with_motives['motive_category'].ne('other')]
df_final.to_csv('enhanced_crime_data.csv', index=False)

In [20]:
# Shape of the exported frame (rows classified as 'other' removed).
df_final.shape

(5213, 9)

In [21]:
# Preview the first rows of the classified data.
df_final.head()

Unnamed: 0,heads_of_crime,major_head,minor_head,current_year_upto,prev_year_same_month,prev_month,current_month,month,motive_category
0,A - IPC Crime,Murder (Sec.302/303 IPC),For gain,3.0,2.0,4.0,3.0,January,financial
1,A - IPC Crime,Murder (Sec.302/303 IPC),Over Property Dispute,2.0,0.0,4.0,2.0,January,property_dispute
2,A - IPC Crime,Murder (Sec.302/303 IPC),Due to Personal Vendetta or enemity,1.0,1.0,2.0,1.0,January,personal_vendetta
3,A - IPC Crime,Murder (Sec.302/303 IPC),Due to Sexual jealousy,0.0,2.0,0.0,0.0,January,sexual_deviance
5,A - IPC Crime,Murder (Sec.302/303 IPC),For dowry by other means,1.0,0.0,3.0,1.0,January,dowry
