In [None]:
pip install pandas numpy nltk scikit-learn tensorflow



In [None]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk

In [None]:
# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load your dataset (assuming CSV format and that your dataset has 'processed_text' and 'label' columns)
data = pd.read_csv('/content/train.csv')  # Replace 'your_dataset.csv' with your file path

# Step 1: Data Cleaning
data.dropna(subset=['crimeaditionalinfo'], inplace=True)
data.drop_duplicates(inplace=True)

# Check unique values in each column
print(data['category'].value_counts())
print(data['sub_category'].value_counts())

category
Online Financial Fraud                                  52496
Online and Social Media Related Crime                   12076
Any Other Cyber Crime                                   10811
Cyber Attack/ Dependent Crimes                           3608
Sexually Obscene material                                1764
Hacking  Damage to computercomputer system etc           1709
Sexually Explicit Act                                    1489
Cryptocurrency Crime                                      473
Online Gambling  Betting                                  444
Child Pornography CPChild Sexual Abuse Material CSAM      357
RapeGang Rape RGRSexually Abusive Content                 248
Online Cyber Trafficking                                  183
Cyber Terrorism                                           161
Ransomware                                                 56
Report Unlawful Content                                     1
Name: count, dtype: int64
sub_category
UPI Related Frauds    

In [None]:
# Step 2: Text Preprocessing
# Define a function to clean text
def clean_text(text):
    text = text.lower()  # Lowercase text
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return text

# Apply the function to the 'crimeaditionalinfo' column
data['cleaned_text'] = data['crimeaditionalinfo'].apply(clean_text)

In [None]:
def preprocess_text(text):
    tokens = text.split()
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(tokens)

In [None]:
data['processed_text'] = data['cleaned_text'].apply(preprocess_text)

In [None]:
# Prepare data for neural network
X = data['processed_text']
y = data['category']

In [None]:
# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# Tokenization
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)

In [None]:
# Padding sequences
max_length = max(len(seq) for seq in X_seq)
X_pad = pad_sequences(X_seq, maxlen=max_length)

In [None]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_pad, y_encoded, test_size=0.2, random_state=42)

In [None]:
# Build the Neural Network model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_length))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))



In [None]:
# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test), verbose=2)

Epoch 1/20
2147/2147 - 730s - 340ms/step - accuracy: 0.7339 - loss: 0.7923 - val_accuracy: 0.7386 - val_loss: 0.7632
Epoch 2/20
2147/2147 - 740s - 344ms/step - accuracy: 0.7529 - loss: 0.7181 - val_accuracy: 0.7459 - val_loss: 0.7376
Epoch 3/20
2147/2147 - 768s - 358ms/step - accuracy: 0.7662 - loss: 0.6735 - val_accuracy: 0.7443 - val_loss: 0.7468
Epoch 4/20
2147/2147 - 769s - 358ms/step - accuracy: 0.7792 - loss: 0.6372 - val_accuracy: 0.7417 - val_loss: 0.7598
Epoch 5/20
2147/2147 - 755s - 351ms/step - accuracy: 0.7902 - loss: 0.6071 - val_accuracy: 0.7399 - val_loss: 0.7657
Epoch 6/20
2147/2147 - 737s - 343ms/step - accuracy: 0.8008 - loss: 0.5762 - val_accuracy: 0.7330 - val_loss: 0.7900
Epoch 7/20
2147/2147 - 738s - 344ms/step - accuracy: 0.8096 - loss: 0.5451 - val_accuracy: 0.7331 - val_loss: 0.8205
Epoch 8/20
2147/2147 - 743s - 346ms/step - accuracy: 0.8194 - loss: 0.5205 - val_accuracy: 0.7272 - val_loss: 0.8379
Epoch 9/20
2147/2147 - 744s - 347ms/step - accuracy: 0.8283 - lo

In [None]:
# Predictions
y_pred = np.argmax(model.predict(X_test), axis=-1)

[1m537/537[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 114ms/step


In [None]:
# Determine the labels actually present in y_test
labels = np.unique(y_test)

# Evaluate the model
print("Neural Network Model Evaluation:")
print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')
print('Classification Report:')
print(classification_report(y_test, y_pred, labels=labels, target_names=label_encoder.classes_[labels]))

Neural Network Model Evaluation:
Accuracy: 0.7027
Classification Report:
                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.35      0.34      0.35      2116
Child Pornography CPChild Sexual Abuse Material CSAM       0.42      0.26      0.32        68
                                Cryptocurrency Crime       0.56      0.43      0.49        95
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00       718
                                     Cyber Terrorism       0.00      0.00      0.00        44
      Hacking  Damage to computercomputer system etc       0.34      0.31      0.32       375
                            Online Cyber Trafficking       0.00      0.00      0.00        36
                              Online Financial Fraud       0.83      0.86      0.85     10520
                            Online Gambling  Betting       0.15      0.09      0

In [None]:
def predict_category(user_input):
    # Step 1: Preprocess the input
    cleaned_input = clean_text(user_input)
    processed_input = preprocess_text(cleaned_input)

    # Step 2: Tokenize and pad the input to match the LSTM input format
    input_seq = tokenizer.texts_to_sequences([processed_input])
    input_pad = pad_sequences(input_seq, maxlen=max_length)  # Ensure the same max_length as during training

    # Step 3: Predict the category
    prediction = model.predict(input_pad)
    predicted_category_index = np.argmax(prediction, axis=1)[0]
    predicted_category = label_encoder.inverse_transform([predicted_category_index])[0]

    return predicted_category

In [None]:
# User input Testing
user_input = input("Enter your complaint: ")
predicted_category = predict_category(user_input)
print(f"Predicted Category: {predicted_category}")

Enter your complaint: My starmaker id was hacked and used another person  My account name  Sonai Das Id sonai Id mobile nu    Full name Sonamoni Das  address Ukhra krishna chura pallyPaschim Bardhaman West bengal 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
Predicted Category: Online and Social Media Related Crime
