In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Download NLTK resources
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Load datasets
train_df = pd.read_csv('train.csv')  # Adjust path as necessary
test_df = pd.read_csv('test.csv')    # Adjust path as necessary

In [4]:
print(train_df.head())

                                category                       sub_category  \
0  Online and Social Media Related Crime  Cyber Bullying  Stalking  Sexting   
1                 Online Financial Fraud                  Fraud CallVishing   
2               Online Gambling  Betting           Online Gambling  Betting   
3  Online and Social Media Related Crime                   Online Job Fraud   
4                 Online Financial Fraud                  Fraud CallVishing   

                                  crimeaditionalinfo  
0  I had continue received random calls and abusi...  
1  The above fraudster is continuously messaging ...  
2  He is acting like a police and demanding for m...  
3  In apna Job I have applied for job interview f...  
4  I received a call from lady stating that she w...  


In [6]:
# Text cleaning and preprocessing function
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'\W', ' ', text)
    # Tokenization
    tokens = text.split()
    # Stop word removal and stemming
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [10]:
print(train_df.isnull().sum())
print(test_df.isnull().sum())

category                 0
sub_category          6591
crimeaditionalinfo      21
dtype: int64
category                 0
sub_category          2236
crimeaditionalinfo       7
dtype: int64


In [12]:
# Handle missing values: drop or fill
train_df['complaint'] = train_df['crimeaditionalinfo'].fillna('')  # Filling NaNs with empty strings
test_df['complaint'] = test_df['crimeaditionalinfo'].fillna('')    # Filling NaNs with empty strings


In [20]:
# Text cleaning and preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):  # Check if the text is a string
        return ''  # Return empty string for non-string types
    # Lowercase
    # Lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'\W', ' ', text)
    # Tokenization
    tokens = text.split()
    # Stop word removal and stemming
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

    # Apply preprocessing to the train dataset
train_df['cleaned_complaint'] = train_df['crimeaditionalinfo'].apply(preprocess_text)

# Apply preprocessing to the test dataset
test_df['cleaned_complaint'] = test_df['crimeaditionalinfo'].apply(preprocess_text)



**Step 2: Model Development**


In [23]:
print("Train DataFrame Columns:", train_df.columns)
print("Test DataFrame Columns:", test_df.columns)

Train DataFrame Columns: Index(['category', 'sub_category', 'crimeaditionalinfo', 'complaint',
       'cleaned_complaint'],
      dtype='object')
Test DataFrame Columns: Index(['category', 'sub_category', 'crimeaditionalinfo', 'complaint',
       'cleaned_complaint'],
      dtype='object')


In [26]:
X_train = train_df['cleaned_complaint']
y_train = train_df['category']

# Prepare the test dataset
X_test = test_df['cleaned_complaint']
y_test = test_df['category']

In [25]:
# Vectorization
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [27]:
# Train the model
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**# Step 3: Model Evaluation**

In [28]:
# Predictions
y_pred = model.predict(X_test_vectorized)


In [30]:
# Accuracy measurement
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.76


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.43      0.24      0.31      3670
Child Pornography CPChild Sexual Abuse Material CSAM       0.68      0.26      0.38       123
                      Crime Against Women & Children       0.00      0.00      0.00         4
                                Cryptocurrency Crime       0.65      0.51      0.57       166
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00      1261
                                     Cyber Terrorism       0.00      0.00      0.00        52
      Hacking  Damage to computercomputer system etc       0.41      0.23      0.30       592
                            Online Cyber Trafficking       0.00      0.00      0.00        61
                              Online Financial Fraud       0.81      0.94      0.87     18896
                            Online Gambling  Betting       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# New Section