In [81]:
import numpy as np
import pandas as pd
import os
import re

In [82]:
# Monthly input CSVs, listed in calendar order (January through December).
file_list = [
    "January.csv",
    "February.csv",
    "March.csv",
    "April.csv",
    "May.csv",
    "June.csv",
    "July.csv",
    "August.csv",
    "September.csv",
    "October.csv",
    "November.csv",
    "December.csv",
]

# Canonical column names, in the order every cleaned monthly frame exposes them.
standard_columns = [
    "sl_no",
    "heads_of_crime",
    "major_head",
    "minor_head",
    "current_year_upto",
    "prev_year_same_month",
    "prev_month",
    "current_month"
]

def _standardize_column_name(col):
    """Return the canonical name for one raw header, or its normalized form if unrecognized."""
    # Lowercase, drop dots, and collapse every run of whitespace/hyphens
    # (including newlines and repeated spaces) into one underscore so the
    # substring checks below do not depend on the exact spacing of the header.
    c = re.sub(r'[\s\-]+', '_', str(col).strip().lower().replace('.', ''))
    if 'sl' in c and 'no' in c:
        return 'sl_no'
    if 'heads' in c and 'crime' in c:
        return 'heads_of_crime'
    if 'major' in c and 'head' in c:
        return 'major_head'
    if 'minor' in c and 'head' in c:
        return 'minor_head'
    if 'current_year' in c or 'month_under_review' in c:
        return 'current_year_upto'
    if 'corresponding_month' in c:
        return 'prev_year_same_month'
    if 'previous_month' in c and 'corresponding' not in c:
        return 'prev_month'
    if 'current_month' in c:
        return 'current_month'
    return c

def clean_df(df):
    """Normalize one monthly frame to exactly ``standard_columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as read from a monthly CSV. Column labels may be of any
        type; they are compared by their string form.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding exactly ``standard_columns`` in that
        order. Columns whose label starts with "Unnamed" or is blank are
        discarded, as are columns that match no standard name. Standard
        columns absent from the input are filled with ``pd.NA``. When several
        input columns map to the same standard name, the first one is kept.
    """
    # Compare on the string form so non-string labels do not break `.str`.
    labels = df.columns.astype(str)
    kept = df.loc[:, ~labels.str.contains(r"^Unnamed|^\s*$")].copy()
    kept.columns = [_standardize_column_name(col) for col in kept.columns]
    # Duplicate names would make `kept[standard_columns]` return repeated columns.
    kept = kept.loc[:, ~kept.columns.duplicated()].copy()
    for col in standard_columns:
        if col not in kept.columns:
            kept[col] = pd.NA
    # Return a copy so callers can add columns without SettingWithCopyWarning.
    return kept[standard_columns].copy()

# Directory that receives the cleaned per-file CSVs written by the loading loop.
output_dir = "cleaned_csvs"
# exist_ok=True: do not raise if the directory is already present.
os.makedirs(output_dir, exist_ok=True)

In [83]:
# Read every monthly CSV, normalize it with clean_df, tag it with its month,
# write the cleaned copy to output_dir, and finally stack all months together.
dfs = []
failed_files = []  # (file name, error text) for each input that could not be processed
for fname in file_list:
    try:
        try:
            df = pd.read_csv(fname, encoding="utf-8")
        except UnicodeDecodeError:
            # Retry with latin1 when the file is not valid UTF-8.
            df = pd.read_csv(fname, encoding="latin1")
        base_name = os.path.basename(fname)
        month_name = os.path.splitext(base_name)[0]
        # assign() returns a new frame, so the month column is never written into a slice.
        df_clean = clean_df(df).assign(month=month_name)
        dfs.append(df_clean)
        # Use the base name so an input path with directories still lands in output_dir.
        out_path = os.path.join(output_dir, base_name)
        df_clean.to_csv(out_path, index=False)
    except Exception as e:
        # Best effort: report the file and keep going with the remaining months.
        print(f"Error processing {fname}: {e}")
        failed_files.append((fname, str(e)))

if not dfs:
    # pd.concat([]) would raise an opaque ValueError; say what actually went wrong.
    raise ValueError(f"None of the {len(file_list)} input files could be processed: {failed_files}")

final_df = pd.concat(dfs, ignore_index=True)
final_df.to_csv("all_months_concatenated.csv", index=False)

print(f"Cleaned individual files saved in '{output_dir}/'")
print("Concatenated file saved as 'all_months_concatenated.csv'")
if failed_files:
    skipped = ", ".join(name for name, _ in failed_files)
    print(f"WARNING: {len(failed_files)} of {len(file_list)} files were skipped: {skipped}")

Cleaned individual files saved in 'cleaned_csvs/'
Concatenated file saved as 'all_months_concatenated.csv'


In [84]:
# (rows, columns) of the concatenated frame.
final_df.shape

(8467, 9)

In [85]:
# Preview the first rows of the concatenated frame.
final_df.head()

Unnamed: 0,sl_no,heads_of_crime,major_head,minor_head,current_year_upto,prev_year_same_month,prev_month,current_month,month
0,1.0,A - IPC Crime,Murder (Sec.302/303 IPC),For gain,3.0,2.0,4.0,3.0,January
1,2.0,A - IPC Crime,Murder (Sec.302/303 IPC),Over Property Dispute,2.0,0.0,4.0,2.0,January
2,3.0,A - IPC Crime,Murder (Sec.302/303 IPC),Due to Personal Vendetta or enemity,1.0,1.0,2.0,1.0,January
3,4.0,A - IPC Crime,Murder (Sec.302/303 IPC),Due to Sexual jealousy,0.0,2.0,0.0,0.0,January
4,5.0,A - IPC Crime,Murder (Sec.302/303 IPC),For dowry by burning,0.0,0.0,0.0,0.0,January


In [86]:
# Display the frame without the serial-number column; the result is not
# assigned, so final_df itself still contains 'sl_no'.
final_df.drop(columns='sl_no')

Unnamed: 0,heads_of_crime,major_head,minor_head,current_year_upto,prev_year_same_month,prev_month,current_month,month
0,A - IPC Crime,Murder (Sec.302/303 IPC),For gain,3.0,2.0,4.0,3.0,January
1,A - IPC Crime,Murder (Sec.302/303 IPC),Over Property Dispute,2.0,0.0,4.0,2.0,January
2,A - IPC Crime,Murder (Sec.302/303 IPC),Due to Personal Vendetta or enemity,1.0,1.0,2.0,1.0,January
3,A - IPC Crime,Murder (Sec.302/303 IPC),Due to Sexual jealousy,0.0,2.0,0.0,0.0,January
4,A - IPC Crime,Murder (Sec.302/303 IPC),For dowry by burning,0.0,0.0,0.0,0.0,January
...,...,...,...,...,...,...,...,...
8462,,Murder,,99.0,8.0,4.0,7.0,December
8463,,Rape,,186.0,13.0,14.0,14.0,December
8464,,Kidnapping,,124.0,7.0,10.0,7.0,December
8465,,Offences under the Protection of Civil Rights ...,,0.0,0.0,0.0,0.0,December


In [87]:
# drop_duplicates() returns a new frame rather than modifying final_df, so the
# result must be rebound for the de-duplication to take effect. reset_index
# keeps the row labels contiguous after rows are removed.
final_df = final_df.drop_duplicates().reset_index(drop=True)
final_df.shape

(8467, 9)

In [88]:
# Distinct non-null minor heads, printed one per line in sorted order.
unique_minor_heads = final_df['minor_head'].dropna().unique()
sorted_minor_heads = sorted(unique_minor_heads)
for minor_head_label in sorted_minor_heads:
    print(minor_head_label)

1.1  Custodial Rape
1.2  Gang Rape
1.3  other Cases of Rape
174 A
229 A
ADULTERATION (Sec. 272, 273 to 276 IPC)
AEPS (Aadhaar)
AFFRAY (Sec. 160 IPC )
ANIMAL
ANTIQUES (CULTURAL PROPERTY)
ARMS ACT 1959
ARSON (Sec. 435, 436, 438 IPC )
ATM Fraud
Abetment  
Abetment of Suicide
Accident by Air
Acid Attack (326(a))
Adulterated Food/Drug
Adultery
Advertisement
Advertising Frauds
Agrarian
Agricultural Produce
Air Force
Alcoholic Drink (Liquor)
Ancient Monument & Archaelogical Sites And Remains Act 1958
Animal  
Animal -Related
Antiques & Art Treasures Act 1972
Army
Assembly
At ATM
At Banks
At Commercial Places / Establishments
At Residential Premises
Attemp to Commit
Attempt
Attempt To Commit
Attempt to Acid Attack (326(b))
Attempt to Commit
Attempt to Commit suicide
Attempt to commit
Attempting to commit offences
Audio Tape
Bangalore Development Authority Act
Bank
Banning of Unregulated Deposit Schemes Act
Betting 
Boad Accident
Boat / Ship
Boat/Vessel - Related
Book
Boy
Boy 
Bridge/Dam Collap

In [89]:
# Keyword fragments for each general motive. map_general_motive walks this
# mapping in insertion order and returns the first motive with a matching
# pattern, so earlier entries take precedence: 'robbery' under 'theft' and
# 'negligence' under 'negligence' are shadowed by the same keyword listed
# earlier under 'gain' and 'professional' respectively.
motive_patterns = {
    'gain': [
        r'gain', r'monetary', r'ransom', r'robbery', r'extortion', r'cheating', r'for money', r'for valuables'
    ],
    'enmity': [
        r'enmity', r'vendetta', r'revenge', r'rivalry', r'animosity', r'feud', r'grudge', r'hostility'
    ],
    'sexual motive': [
        r'sexual', r'rape', r'molestation', r'eve teasing', r'jealousy', r'adultery', r'lust', r'unnatural sex'
    ],
    'property dispute': [
        r'property', r'land', r'house', r'dispute', r'boundary', r'possession', r'real estate'
    ],
    'dowry': [
        r'dowry'
    ],
    'communal': [
        r'communal', r'caste', r'religion', r'sectarian', r'casteism'
    ],
    'personal': [
        r'personal', r'family', r'domestic', r'relationship', r'marriage', r'love', r'affair'
    ],
    'political': [
        r'political', r'election', r'party', r'vote'
    ],
    'quarrel': [
        r'quarrel', r'argument', r'fight', r'brawl', r'altercation', r'sudden quarrel'
    ],
    'professional': [
        r'professional', r'misconduct', r'negligence', r'malpractice', r'workplace'
    ],
    'theft': [
        r'theft', r'robbery', r'burglary', r'dacoity', r'steal', r'larceny'
    ],
    'kidnapping': [
        r'kidnapping', r'abduction', r'missing'
    ],
    'negligence': [
        r'negligence', r'rash', r'accident'
    ],
    'substance': [
        r'liquor', r'drug', r'narcotic', r'psychotropic', r'hooch', r'alcohol'
    ],
    'public order': [
        r'riot', r'affray', r'unlawful assembly', r'public nuisance', r'public order'
    ],
    'cyber': [
        r'cyber', r'it act', r'online', r'internet'
    ],
    'state security': [
        r'sedition', r'terrorism', r'naxalism', r'espionage', r'waging war', r'insurgency'
    ],
    'other': []  # fallback: no patterns, so it is only reached as the default
}

# Compile regex for each motive for efficiency.
# Each keyword is anchored at a leading word boundary so it can only match at
# the start of a word: without it 'gain' matches "against", 'land' matches
# "Island", 'rash' matches "crash" and 'it act' matches "Credit Act". The
# trailing side is left open so inflected forms ("thefts", "riots") still match.
compiled_patterns = {
    motive: [re.compile(r'\b' + pat, re.IGNORECASE) for pat in patterns]
    for motive, patterns in motive_patterns.items()
}

def map_general_motive(minor_head):
    """Return the first motive whose patterns match ``minor_head``.

    Motives are tried in the iteration order of ``compiled_patterns``.
    Non-string values, blank strings, and strings matching no pattern all
    yield ``'other'``.
    """
    if isinstance(minor_head, str) and minor_head.strip():
        for motive, patterns in compiled_patterns.items():
            if any(pat.search(minor_head) for pat in patterns):
                return motive
    return 'other'

# Label every row with a general motive derived from its minor head.
motive_labels = final_df['minor_head'].apply(map_general_motive)
final_df['general_motive'] = motive_labels

# Check number of unique general motives (should be <= 18)
motive_column = final_df['general_motive']
print(motive_column.nunique())
print(sorted(motive_column.unique()))

# Save the updated DataFrame
final_df.to_csv('crime_data_with_general_motives.csv', index=False)

18
['communal', 'cyber', 'dowry', 'enmity', 'gain', 'kidnapping', 'negligence', 'other', 'personal', 'political', 'professional', 'property dispute', 'public order', 'quarrel', 'sexual motive', 'state security', 'substance', 'theft']


In [90]:
# Preview the first rows, now including the general_motive column.
final_df.head()

Unnamed: 0,sl_no,heads_of_crime,major_head,minor_head,current_year_upto,prev_year_same_month,prev_month,current_month,month,general_motive
0,1.0,A - IPC Crime,Murder (Sec.302/303 IPC),For gain,3.0,2.0,4.0,3.0,January,gain
1,2.0,A - IPC Crime,Murder (Sec.302/303 IPC),Over Property Dispute,2.0,0.0,4.0,2.0,January,property dispute
2,3.0,A - IPC Crime,Murder (Sec.302/303 IPC),Due to Personal Vendetta or enemity,1.0,1.0,2.0,1.0,January,enmity
3,4.0,A - IPC Crime,Murder (Sec.302/303 IPC),Due to Sexual jealousy,0.0,2.0,0.0,0.0,January,sexual motive
4,5.0,A - IPC Crime,Murder (Sec.302/303 IPC),For dowry by burning,0.0,0.0,0.0,0.0,January,dowry


In [91]:
# Column dtypes and non-null counts for the labelled frame.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8467 entries, 0 to 8466
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   sl_no                 8466 non-null   float64
 1   heads_of_crime        7731 non-null   object 
 2   major_head            8464 non-null   object 
 3   minor_head            7651 non-null   object 
 4   current_year_upto     8433 non-null   float64
 5   prev_year_same_month  8433 non-null   float64
 6   prev_month            8433 non-null   float64
 7   current_month         8432 non-null   float64
 8   month                 8467 non-null   object 
 9   general_motive        8467 non-null   object 
dtypes: float64(5), object(5)
memory usage: 661.6+ KB


In [96]:
# Per-column non-null counts for the rows labelled 'other': a compact summary
# instead of rendering every matching row.
final_df[final_df['general_motive'] == 'other'].count()

sl_no                   6660
heads_of_crime          6079
major_head              6659
minor_head              5845
current_year_upto       6635
prev_year_same_month    6635
prev_month              6635
current_month           6634
month                   6661
general_motive          6661
dtype: int64

In [93]:
# Count, number of distinct labels, most frequent label and its frequency.
final_df['general_motive'].describe()

count      8467
unique       18
top       other
freq       6661
Name: general_motive, dtype: object

In [97]:
# Rows whose minor head matched none of the motive patterns.
is_other = final_df['general_motive'] == 'other'
other_crimes = final_df[is_other]

In [98]:
# (rows, columns) of the 'other' subset.
other_crimes.shape

(6661, 10)

AttributeError: 'Series' object has no attribute 'lower'