In [5]:
import spacy
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [6]:
# Load the spaCy pipeline named "en_core_web_sm" once; the extract_entities
# helpers defined in later cells call it and read the resulting doc's `.ents`.
nlp = spacy.load("en_core_web_sm")

In [7]:
# Read the IPC sections table from the notebook's working directory.
# A plain relative filename is used instead of r'.\ipc_sections.csv': the
# backslash form only resolves on Windows (on POSIX systems the backslash is
# treated as part of the file name), while this form works on every platform.
existing_data = pd.read_csv('ipc_sections.csv')

In [8]:
def extract_entities(text):
    """Return the lower-cased text of each named entity found in `text`.

    `text` is run through the module-level `nlp` pipeline and every entity in
    the resulting document's `.ents` contributes one list item, in order.
    An input with no entities yields an empty list.
    """
    entities = []
    for entity in nlp(text).ents:
        entities.append(entity.text.lower())
    return entities

In [9]:
# One list of entity strings per offense description.
# The former trailing .dropna() was removed: extract_entities always returns a
# list (never NaN), and column assignment re-aligns on the index, so dropping
# rows there could not have removed anything from the frame.
existing_data['Entities'] = existing_data['Offense'].apply(extract_entities)


In [10]:
# Expand the per-row entity lists so each entity gets its own row, with the
# row's Section and Punishment repeated alongside it. Rows whose list is empty
# are kept as a single row with a missing entity (filled in a later cell).
flattened_data = (
    existing_data
    .loc[:, ['Section', 'Punishment', 'Entities']]
    .explode('Entities')
)


In [11]:
# Encode each section name as an integer class label for the classifier; the
# fitted encoder is kept so predictions can be mapped back to section names.
label_encoder = LabelEncoder()
flattened_data = flattened_data.assign(
    Section_Label=label_encoder.fit_transform(flattened_data['Section'])
)


In [12]:
def extract_entities(text):
    """Return the named entities of `text` as one space-separated string.

    Each entity found by the module-level `nlp` pipeline is lower-cased and the
    results are joined with single spaces; an input with no entities yields ''.

    NOTE(review): this rebinds the name of the list-returning function defined
    in an earlier cell, so every call made after this cell runs gets a string.
    """
    doc = nlp(text)
    lowered = (entity.text.lower() for entity in doc.ents)
    return ' '.join(lowered)

In [13]:
# Overwrite the Entities column with the joined-string form of each offense's
# entities (the list form written earlier has already been exploded into
# flattened_data, which this assignment does not touch).
existing_data['Entities'] = existing_data['Offense'].map(extract_entities)

In [14]:
# Sanity check: number of non-null values in each column of the table.
existing_data.count()


Description    386
Offense        386
Punishment     386
Section        386
Entities       386
dtype: int64

In [15]:
# Spot-check ten rows of the joined entity strings. The fixed random_state
# makes the displayed sample identical on every Restart & Run All instead of
# changing with each execution.
existing_data['Entities'].sample(10, random_state=42)

176                               
191                               
288                        dacoity
213                               
348      466 the indian penal code
118                               
151                               
8                                 
377    state the public prosecutor
297                               
Name: Entities, dtype: object

In [16]:
# Rows that had no entities carry a missing value after the explode step; the
# vectorizer needs strings, so replace those with '' before fitting.
# NOTE(review): such rows become all-zero feature vectors that still carry a
# section label — confirm that training on them is intended.
flattened_data = flattened_data.assign(
    Entities=flattened_data['Entities'].fillna('')
)
tfidf_vectorizer = TfidfVectorizer(lowercase=True)
tfidf_matrix = tfidf_vectorizer.fit_transform(
    raw_documents=flattened_data['Entities']
)


In [17]:
# Linear-kernel SVM mapping TF-IDF entity vectors to encoded section labels.
# Rows of tfidf_matrix and flattened_data['Section_Label'] both come from
# flattened_data, so they line up one-to-one.
classifier = SVC(kernel='linear')
classifier.fit(
    X=tfidf_matrix,
    y=flattened_data['Section_Label'],
)

In [18]:
def predict_section_and_punishment(user_input):
    """Predict the section and its punishment for a free-text complaint.

    The entities extracted from `user_input` are vectorised with the fitted
    `tfidf_vectorizer`, classified by `classifier`, and the predicted label is
    mapped back to a section name through `label_encoder`. The punishment is
    taken from the first row of `existing_data` whose Section matches.

    Parameters
    ----------
    user_input : str
        Raw complaint text.

    Returns
    -------
    tuple
        `(section, punishment)`. `(None, None)` is returned when no entities
        are extracted, or when none of their tokens are in the TF-IDF
        vocabulary: the feature vector is then all zeros, so a prediction
        would not depend on the input at all.
    """
    user_entities = extract_entities(user_input)

    # Nothing recognised (empty string or empty list): no basis for a prediction.
    if not user_entities:
        return None, None

    # transform() expects an iterable of documents, so wrap a lone string.
    if isinstance(user_entities, str):
        user_entities = [user_entities]

    user_tfidf = tfidf_vectorizer.transform(user_entities)

    # Every token was out of vocabulary -> no stored non-zero features.
    if user_tfidf.nnz == 0:
        return None, None

    predicted_label = classifier.predict(user_tfidf)
    # Only the first document's prediction is reported, as before.
    predicted_section = label_encoder.inverse_transform(predicted_label)[0]

    # Single .loc lookup (row mask + column) instead of chained indexing.
    section_punishments = existing_data.loc[
        existing_data['Section'] == predicted_section, 'Punishment'
    ]
    punishment = section_punishments.iloc[0]

    return predicted_section, punishment

In [19]:
# Example complaint text used to exercise the predictor end to end.
user_input = '''I am writing to report an attempted murder incident that occurred on December 10, 2023, at 11:00 PM, in [Location]. The victim, Mr. Ranveer Kumar, narrowly escaped harm during this event. I urgently seek your attention to this matter for a swift investigation.'''
# Unpack the (section, punishment) pair returned for the example text.
section, punishment = predict_section_and_punishment(user_input)


In [20]:
# Print the prediction, or a fallback message when either value is missing/falsy.
if not (section and punishment):
    print("No match found for the input.")
else:
    print(f"User Input: {user_input}")
    print(f"Predicted Section: {section}")
    print(f"Predicted Punishment: {punishment}")

User Input: I am writing to report an attempted murder incident that occurred on December 10, 2023, at 11:00 PM, in [Location]. The victim, Mr. Ranveer Kumar, narrowly escaped harm during this event. I urgently seek your attention to this matter for a swift investigation.
Predicted Section: IPC_435
Predicted Punishment: 7 Years + Fine
