In [1]:
# --- 1. SETUP: Import your tools and load the data ---
import pandas as pd
from geopy.geocoders import Nominatim  # <-- Add this line
from tqdm import tqdm

# The exact filename from your folder
filename = "42_District_wise_crimes_committed_against_women_2013.csv"

# Load the data
df = pd.read_csv(filename)

# Display the first few rows to confirm it worked
print("File loaded successfully!")
print(df.head())


File loaded successfully!
         STATE/UT   DISTRICT  Year  Rape  Kidnapping and Abduction  \
0  Andhra Pradesh   ADILABAD  2013    61                        47   
1  Andhra Pradesh  ANANTAPUR  2013    28                        84   
2  Andhra Pradesh   CHITTOOR  2013    31                        27   
3  Andhra Pradesh   CUDDAPAH  2013    19                        50   
4  Andhra Pradesh  CYBERABAD  2013   138                       129   

   Dowry Deaths  Assault on women with intent to outrage her modesty  \
0            12                                                197     
1            23                                                337     
2            13                                                119     
3             9                                                318     
4            43                                                350     

   Insult to modesty of Women  Cruelty by Husband or his Relatives  \
0                         138                     

In [2]:
# --- Lesson 1: Inspection ---
# Load the data, adding the column headers it's missing
filename = "42_District_wise_crimes_committed_against_women_2013.csv"
column_names = [
    'STATE/UT', 'DISTRICT', 'Year', 'Rape', 'Kidnapping and Abduction', 
    'Dowry Deaths', 'Assault on women with intent to outrage her modesty',
    'Insult to modesty of Women', 'Cruelty by Husband or his Relatives',
    'Importation of Girls from Foreign Country'
]

df = pd.read_csv(filename)
print("--- Inspection: First 5 Rows ---")
print(df.head())
df.info()

# --- Lesson 2: Cleaning ---
print("\n--- Cleaning the data ---")
df = df[df['DISTRICT'] != 'TOTAL'] # Remove summary rows
df.dropna(subset=['STATE/UT', 'DISTRICT'], inplace=True) # Remove rows with missing locations
print("Cleaning complete.")

--- Inspection: First 5 Rows ---
         STATE/UT   DISTRICT  Year  Rape  Kidnapping and Abduction  \
0  Andhra Pradesh   ADILABAD  2013    61                        47   
1  Andhra Pradesh  ANANTAPUR  2013    28                        84   
2  Andhra Pradesh   CHITTOOR  2013    31                        27   
3  Andhra Pradesh   CUDDAPAH  2013    19                        50   
4  Andhra Pradesh  CYBERABAD  2013   138                       129   

   Dowry Deaths  Assault on women with intent to outrage her modesty  \
0            12                                                197     
1            23                                                337     
2            13                                                119     
3             9                                                318     
4            43                                                350     

   Insult to modesty of Women  Cruelty by Husband or his Relatives  \
0                         138              

In [3]:
print("Step 1: Importing libraries...")
import pandas as pd
from geopy.geocoders import Nominatim
from tqdm import tqdm
print("Libraries imported successfully.")

# --- 2. LOAD DATA: Load the raw Kaggle file and add headers ---
print("\nStep 2: Loading raw data...")
# The exact filename from your folder
filename = "42_District_wise_crimes_committed_against_women_2013.csv"
# These are the correct column headers for this specific file
column_names = [
    'STATE/UT', 'DISTRICT', 'Year', 'Rape', 'Kidnapping and Abduction', 
    'Dowry Deaths', 'Assault on women with intent to outrage her modesty',
    'Insult to modesty of Women', 'Cruelty by Husband or his Relatives',
    'Importation of Girls from Foreign Country'
]
df = pd.read_csv(filename, header=None, names=column_names)
print("Raw data loaded.")

Step 1: Importing libraries...
Libraries imported successfully.

Step 2: Loading raw data...
Raw data loaded.


In [4]:
# ===================================================================
# FINAL SCRIPT FOR PHASE 1: DATA PREPARATION
# Input: 42_District_wise_crimes_committed_against_women_2013.csv
# Output: crimes_cleaned.csv
# ===================================================================

# --- 1. SETUP: Import all necessary tools ---
print("Step 1: Importing libraries...")
import pandas as pd
from geopy.geocoders import Nominatim
from tqdm import tqdm
print("Libraries imported successfully.")

# --- 2. LOAD DATA: Load the raw Kaggle file and add headers ---
print("\nStep 2: Loading raw data...")
# The exact filename from your folder
filename = "42_District_wise_crimes_committed_against_women_2013.csv"
# These are the correct column headers for this specific file
column_names = [
    'STATE/UT', 'DISTRICT', 'Year', 'Rape', 'Kidnapping and Abduction', 
    'Dowry Deaths', 'Assault on women with intent to outrage her modesty',
    'Insult to modesty of Women', 'Cruelty by Husband or his Relatives',
    'Importation of Girls from Foreign Country'
]
# REPLACE IT WITH THIS LINE
df = pd.read_csv(filename)
print("Raw data loaded.")

# --- 3. CLEAN DATA: Remove summary rows ---
print("\nStep 3: Cleaning data...")
df = df[df['DISTRICT'] != 'TOTAL']
df.dropna(subset=['STATE/UT', 'DISTRICT'], inplace=True)
print("Data cleaned.")

# --- 4. FEATURE ENGINEERING PART A: Geocoding ---
print("\nStep 4: Geocoding district locations. This is the slow part, please be patient...")
geolocator = Nominatim(user_agent="aegis_sih_app_final_v2")
df['full_location'] = df['DISTRICT'] + ", " + df['STATE/UT']
unique_locations = df['full_location'].unique()

location_objects = {}
# The tqdm wrapper will show a progress bar
for location in tqdm(unique_locations):
    try:
        location_data = geolocator.geocode(location, timeout=10)
        location_objects[location] = location_data
    except Exception as e:
        location_objects[location] = None
print("Geocoding complete.")

# --- 5. FEATURE ENGINEERING PART B: Unpacking Coordinates ---
print("\nStep 5: Extracting latitude and longitude...")
df['location_obj'] = df['full_location'].map(location_objects)
df.dropna(subset=['location_obj'], inplace=True)

df['latitude'] = df['location_obj'].apply(lambda loc: loc.latitude)
df['longitude'] = df['location_obj'].apply(lambda loc: loc.longitude)
print("Coordinates extracted.")

# --- 6. FEATURE ENGINEERING PART C: Creating risk_score ---
print("\nStep 6: Creating the final risk_score...")
risk_columns = [
    'Rape', 'Kidnapping and Abduction',
    'Assault on women with intent to outrage her modesty'
]
df['risk_score'] = df[risk_columns].sum(axis=1)
print("risk_score created.")

# --- 7. SAVE THE FINAL FILE ---
print("\nStep 7: Saving the clean data...")
final_df = df[['latitude', 'longitude', 'Year', 'risk_score']]
final_df.to_csv('crimes_cleaned.csv', index=False)

print("\n==========================================================")
print("✅ SUCCESS! Phase 1 is complete.")
print("The file 'crimes_cleaned.csv' has been created in your folder.")
print("==========================================================")

Step 1: Importing libraries...
Libraries imported successfully.

Step 2: Loading raw data...
Raw data loaded.

Step 3: Cleaning data...
Data cleaned.

Step 4: Geocoding district locations. This is the slow part, please be patient...


100%|████████████████████████████████████████████████████████████████████████████████| 823/823 [18:12<00:00,  1.33s/it]

Geocoding complete.

Step 5: Extracting latitude and longitude...
Coordinates extracted.

Step 6: Creating the final risk_score...
risk_score created.

Step 7: Saving the clean data...

✅ SUCCESS! Phase 1 is complete.
The file 'crimes_cleaned.csv' has been created in your folder.





In [5]:
# --- 1. SETUP: Import your tools ---
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib

# --- 2. LOAD DATA: Use the clean foundation you just built ---
df = pd.read_csv('crimes_cleaned.csv')

# --- 3. DEFINE FEATURES AND TARGET ---
features = ['latitude', 'longitude', 'Year']
target = 'risk_score'

X = df[features]
y = df[target]

# --- 4. SPLIT DATA: Create the "study material" and the "final exam" ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Data split into {len(X_train)} training records and {len(X_test)} testing records.")

# --- 5. TRAIN THE MODEL ---
print("\nTraining the model...")
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)
print("Model training complete!")

# --- 6. EVALUATE THE MODEL ---
score = model.score(X_test, y_test)
print(f"Model score (R-squared) on test data: {score:.2f}")

# --- 7. SAVE THE MODEL ---
joblib.dump(model, 'risk_model.joblib')
print("Model saved successfully as 'risk_model.joblib'")

Data split into 511 training records and 128 testing records.

Training the model...
Model training complete!
Model score (R-squared) on test data: -1.50
Model saved successfully as 'risk_model.joblib'
