In [125]:
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [131]:
data=pd.read_csv(r"C:\Users\ishan\Desktop\machine learning\government_complaint_management\for_model.csv")

In [134]:
for i in data["Complaint"]:
    print(i)

There is an ongoing robbery at my house, please send help immediately!
I just witnessed a kidnapping near the central park!
A serious accident just happened, and people are injured.
A gang fight is happening near my street, they have weapons.
My house is on fire, I need emergency services now!
There is an active shooter situation in the mall, please respond urgently!
My child is missing, last seen near the school playground.
An elderly person has collapsed on the road and is unresponsive.
There is a man harassing and threatening people with a knife at the subway.
A woman is screaming for help in my neighborhood, I fear domestic violence.
There is a suspicious vehicle parked outside my house for hours.
My neighbor keeps playing loud music late at night, disturbing everyone.
I lost my phone at the metro station, can it be tracked?
I suspect someone is dealing drugs near the community park.
There is an abandoned bag at the train station, it looks suspicious.
I have been receiving scam cal

In [132]:
data["Department"].unique()

array(['Police Department', 'Cyber Crime Department', 'Women Helpline',
       'Fire Department', 'Consumer Protection Department',
       'Child Welfare Department', 'Traffic Police Department',
       'Social Welfare Department', 'Disaster Management',
       'Anti-Corruption Bureau', 'Human Right Department',
       'Muncipal Coporation Department'], dtype=object)

In [103]:
data["Complaint"]=data["Complaint"].apply(lambda x: x.lower())

In [104]:
stoplist=stopwords.words('english')

In [105]:
l=[]
for i in data["Complaint"]:
    temp=i.split()
    j=0
    while j<len(temp):
        if temp[j] in stoplist:
            temp.pop(j)
        else:
            j+=1
    l.append(" ".join(temp))

In [106]:
for i in range(len(l)):
    l[i]=l[i].replace(".","").replace("!","").replace("(","").replace(")","").replace(",","").replace("?","")
l

['ongoing robbery house please send help immediately',
 'witnessed kidnapping near central park',
 'serious accident happened people injured',
 'gang fight happening near street weapons',
 'house fire need emergency services now',
 'active shooter situation mall please respond urgently',
 'child missing last seen near school playground',
 'elderly person collapsed road unresponsive',
 'man harassing threatening people knife subway',
 'woman screaming help neighborhood fear domestic violence',
 'suspicious vehicle parked outside house hours',
 'neighbor keeps playing loud music late night disturbing everyone',
 'lost phone metro station tracked',
 'suspect someone dealing drugs near community park',
 'abandoned bag train station looks suspicious',
 'receiving scam calls claiming police department',
 'street light broken weeks making unsafe night',
 'found wallet street want return owner',
 'someone scratched car parking lot left without informing',
 'stray dog acting aggressively neighb

In [107]:
data["Complaint"]=l

In [130]:
for i in data["Complaint"]:
    print(i)

ongoing robbery house please send help immediately
witnessed kidnapping near central park
serious accident happened people injured
gang fight happening near street weapons
house fire need emergency services now
active shooter situation mall please respond urgently
child missing last seen near school playground
elderly person collapsed road unresponsive
man harassing threatening people knife subway
woman screaming help neighborhood fear domestic violence
suspicious vehicle parked outside house hours
neighbor keeps playing loud music late night disturbing everyone
lost phone metro station tracked
suspect someone dealing drugs near community park
abandoned bag train station looks suspicious
receiving scam calls claiming police department
street light broken weeks making unsafe night
found wallet street want return owner
someone scratched car parking lot left without informing
stray dog acting aggressively neighborhood might bite someone
someone blocking driveway vehicle
shopkeeper overcha

In [109]:
lemmatizer= WordNetLemmatizer()
l=[]
for i in data["Complaint"]:
    words = word_tokenize(i)
    words = [lemmatizer.lemmatize(word) for word in words]
    l.append(" ".join(words))

In [110]:
vectorizer = TfidfVectorizer(max_features=5000)
x = vectorizer.fit_transform(l)

In [111]:
kmeans = KMeans(n_clusters=12, random_state=42, n_init=10)
data['cluster'] = kmeans.fit_predict(x)

In [112]:
terms = vectorizer.get_feature_names_out()
for i in range(12):
    words = [terms[ind] for ind in kmeans.cluster_centers_[i].argsort()[-10:]]
    print(f"Cluster {i}: {', '.join(words)}")

Cluster 0: minor, keep, school, near, caught, someone, public, without, fire, child
Cluster 1: food, shelter, home, destroyed, still, shortage, reserve, cyclone, drinking, water
Cluster 2: near, smell, storm, road, minor, waterlogging, locality, issue, open, causing
Cluster 3: huge, fire, area, need, residential, broken, assistance, large, immediate, evacuation
Cluster 4: clerk, school, extra, free, benefit, teacher, education, money, government, demanding
Cluster 5: fee, demand, collect, shopkeeper, unauthorized, stall, extorting, money, officer, municipal
Cluster 6: paid, board, meter, installation, storm, day, power, line, three, electricity
Cluster 7: clogged, traffic, main, blocking, tree, fallen, due, road, rain, heavy
Cluster 8: meant, misusing, road, local, construction, project, involved, politician, embezzling, fund
Cluster 9: received, provide, department, officer, 10000, paid, fir, file, police, refused
Cluster 10: camp, storm, run, starving, shelter, temporary, damaged, ho

In [113]:
department_mapping = {
    0: "Child Welfare Department",  # Words: child, school, minor, caught, fire  
    1: "Disaster Management",  # Words: cyclone, shelter, destroyed, food, drinking, water  
    2: "Municipal Corporation Department",  # Words: waterlogging, road, open, locality, smell  
    3: "Fire Department",  # Words: fire, evacuation, immediate, huge, residential  
    4: "Education Department",  # Words: school, teacher, education, money, government  
    5: "Consumer Protection Department",  # Words: shopkeeper, unauthorized, fee, extorting, municipal  
    6: "Electricity Department",  # Words: power, electricity, meter, installation, line  
    7: "Traffic Police Department",  # Words: traffic, road, blocking, tree, rain  
    8: "Anti-Corruption Bureau",  # Words: misusing, politician, project, embezzling, fund  
    9: "Police Department",  # Words: police, officer, department, file, FIR  
    10: "Social Welfare Department",  # Words: shelter, temporary, people, storm, camp  
    11: "Pension & Revenue Department"  # Words: pension, fine, official, pay, clearance  
}

In [129]:
list(data["Department"].unique())

['Police Department',
 'Cyber Crime Department',
 'Women Helpline',
 'Fire Department',
 'Consumer Protection Department',
 'Child Welfare Department',
 'Traffic Police Department',
 'Social Welfare Department',
 'Disaster Management',
 'Anti-Corruption Bureau',
 'Human Right Department',
 'Muncipal Coporation Department']

In [115]:
data['clustered_department'] = data['cluster'].map(department_mapping)

In [116]:
data["cluster"].value_counts()

cluster
0     699
2      49
10     28
9      26
4      26
7      23
5      22
1      20
11     19
8      17
6      13
3      11
Name: count, dtype: int64

In [117]:
accuracy_score(data["Department"],data['clustered_department'])

0.12172088142707241

In [119]:
dbscan = DBSCAN(eps=0.5, min_samples=5, metric="cosine")  # Adjust 'eps' based on dataset
data["dbcluster"] = dbscan.fit_predict(x)

In [122]:
department_mapping = {
    0: "Child Welfare Department",
    1: "Disaster Management",
    2: "Municipal Corporation Department",
    3: "Fire Department",
    4: "Education Department",
    5: "Consumer Protection Department",
    6: "Electricity Department",
    7: "Traffic Police Department",
    8: "Anti-Corruption Bureau",
    9: "Police Department",
    10: "Social Welfare Department",
    11: "Pension & Revenue Department"
}

In [124]:
data["dbdepartment"] = data["dbcluster"].map(department_mapping)

In [128]:
valid_clusters = data[data["cluster"] != -1]["cluster"]
valid_X = x[data["cluster"] != -1].toarray()

silhouette_avg = silhouette_score(valid_X, valid_clusters)
db_score = davies_bouldin_score(valid_X, valid_clusters)

print(f"Silhouette Score: {silhouette_avg:.2f}")
print(f"Davies-Bouldin Score: {db_score:.2f}")

Silhouette Score: 0.06
Davies-Bouldin Score: 3.40
