<a href="https://colab.research.google.com/github/MarohBJoshua22/21st-Century-Movie-Recommender-for-kids/blob/main/SA_Grooming_Detection_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Loading the Perverted-Justice (PJ) dataset

In [2]:
# Load the Perverted-Justice (PJ) CSV; only the message body column is used.
dfpj = pd.read_csv('/content/drive/MyDrive/merged-csv-files.csv')
# Alternative input file kept from the original notebook:
# dfpj = pd.read_csv('/content/drive/MyDrive/Pj_filtered.csv')
dfpj = (
    dfpj[['BODY']]
    .rename(columns={'BODY': 'Text'})
    # Drop missing messages BEFORE casting to str: astype(str) turns NaN into
    # the literal text 'nan', after which dropna() no longer removes anything.
    .dropna(subset=['Text'])
    .astype({'Text': str})
    .drop_duplicates(subset='Text')  # keep the first occurrence of each message
    .assign(label=1)  # 1 = grooming (positive class); the Twitter frame uses 0
)
# Strip symbol runs that sit right before whitespace or the end of the string.
# Symbol runs followed by anything else (e.g. the apostrophe in "don't") are kept.
def remove_symbols(text):
    """Return ``text`` without symbol runs that precede whitespace or end-of-string.

    A "symbol" here is any character other than an ASCII letter, a digit or
    whitespace, so non-ASCII letters are treated as symbols too.
    """
    trailing_symbols = re.compile(r'[^A-Za-z0-9\s]+(?=\s|$)')
    cleaned = trailing_symbols.sub('', text)
    return cleaned
dfpj['Text'] = dfpj['Text'].apply(remove_symbols)

# Build the English stop-word set once. The original rebuilt it from
# stopwords.words('english') inside the function, i.e. once per row.
STOP_WORDS = frozenset(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split on whitespace, each token is lower-cased for the
    comparison against NLTK's English stop-word list, and the surviving tokens
    (in their original casing) are re-joined with single spaces.
    """
    return ' '.join(word for word in text.split() if word.lower() not in STOP_WORDS)

dfpj['Text'] = dfpj['Text'].apply(remove_stopwords)

In [3]:
# Sanity check: (rows, columns) of the PJ frame after the cleaning steps above.
dfpj.shape

(39808, 2)

# Loading the Twitter dataset

In [4]:
dftw = pd.read_csv('/content/drive/MyDrive/twitter_training.csv')

# Discard every tweet whose 'Lexicon' value is Negative, Neutral or Irrelevant.
# 'Positive' rows are kept (the filter for them is intentionally left disabled).
dftw = dftw.loc[~dftw['Lexicon'].isin(['Negative', 'Neutral', 'Irrelevant'])]



In [5]:
# Preview the first rows that survived the 'Lexicon' filter.
dftw.head()

Unnamed: 0,number,Borderlands,Lexicon,Text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [6]:
dftw = (
    dftw[['Text']]
    # Drop missing tweets BEFORE casting to str: astype(str) turns NaN into
    # the literal text 'nan', after which dropna() no longer removes anything.
    .dropna(subset=['Text'])
    .astype({'Text': str})
    .drop_duplicates(subset='Text')  # keep the first occurrence of each tweet
    .assign(label=0)  # 0 = non-grooming (negative class)
)

def remove_symbols(text):
    """Delete symbol runs that are immediately followed by whitespace or the end of ``text``.

    A symbol is any character other than an ASCII letter, a digit or whitespace.

    NOTE(review): this repeats the ``remove_symbols`` defined in the PJ cell
    with the same pattern; when the notebook runs top to bottom this later
    definition replaces the earlier one. Consider defining it once.
    """
    pattern = r'[^A-Za-z0-9\s]+(?=\s|$)'
    return re.sub(pattern, '', text)
dftw['Text'] = dftw['Text'].apply(remove_symbols)

# Build the English stop-word set once. The original rebuilt it from
# stopwords.words('english') inside the function, i.e. once per row.
STOP_WORDS = frozenset(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    Tokens are obtained by whitespace splitting, compared in lower case
    against NLTK's English stop-word list, and the remaining tokens (original
    casing preserved) are joined back with single spaces.
    """
    return ' '.join(word for word in text.split() if word.lower() not in STOP_WORDS)

dftw['Text'] = dftw['Text'].apply(remove_stopwords)

In [7]:
# Sanity check: (rows, columns) of the Twitter frame after the cleaning steps above.
dftw.shape

(19138, 2)

In [8]:
# Inspect the first 100 cleaned tweets (the rendered table is truncated in the output).
dftw.head(100)

Unnamed: 0,Text,label
0,coming borders kill,0
1,im getting borderlands kill,0
2,im coming borderlands murder,0
3,im getting borderlands 2 murder,0
4,im getting borderlands murder,0
...,...,...
245,Finally got around starting Borderlands 3 week...,0
246,Finally managed start Borderlands 3 weekend lo...,0
247,Finally got launch Borderlands 3 weekend I'm a...,0
248,Finally got around starting Borderlands 3 week...,0


# Combine Both Datasets

In [9]:
# Stack the PJ (label 1) and Twitter (label 0) frames into one corpus and
# renumber the rows 0..n-1.
combined_df = pd.concat([dfpj, dftw], ignore_index=True)

# Make sure 'label' is an integer column. Both inputs assign constant integer
# labels, so this is only a safeguard; note that any value that failed to
# parse would be silently relabelled 0.
combined_df['label'] = pd.to_numeric(combined_df['label'], errors='coerce').fillna(0).astype(int)

# The original also evaluated combined_df.head(1000) here; a non-final
# expression is never displayed by a notebook cell, so it has been dropped.
combined_df.shape

(58946, 2)

# TF-IDF for feature extraction

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
# max_features caps the vocabulary at the 5000 most frequent terms; adjust to dataset size.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Keep the features as the SciPy sparse matrix that fit_transform returns.
# Calling .toarray() would materialise roughly 59k x 5k float64 values (~2.4 GB),
# and train_test_split, MultinomialNB, LogisticRegression and SVC all accept
# sparse input directly.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so the IDF weights also see the test documents. For a leakage-free
# evaluation, split first and fit on the training texts only.
X = tfidf_vectorizer.fit_transform(combined_df['Text'])

Y = combined_df['label']

# Data Splitting

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the partition reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, test_size=0.2)

# Classification

Naive Bayes

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Fit multinomial Naive Bayes (default settings) on the training features,
# then print per-class precision / recall / F1 for the held-out split.
nb_model = MultinomialNB().fit(X_train, Y_train)
y_pred_nb = nb_model.predict(X_test)
print(classification_report(y_true=Y_test, y_pred=y_pred_nb))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91      3786
           1       0.95      0.97      0.96      8004

    accuracy                           0.94     11790
   macro avg       0.94      0.93      0.93     11790
weighted avg       0.94      0.94      0.94     11790



Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression (default settings) on the training features, then
# print per-class precision / recall / F1 for the held-out split.
# classification_report comes from the import in the Naive Bayes cell.
lr_model = LogisticRegression().fit(X_train, Y_train)
y_pred_lr = lr_model.predict(X_test)
print(classification_report(y_true=Y_test, y_pred=y_pred_lr))

              precision    recall  f1-score   support

           0       0.95      0.88      0.91      3786
           1       0.94      0.98      0.96      8004

    accuracy                           0.95     11790
   macro avg       0.95      0.93      0.94     11790
weighted avg       0.95      0.95      0.95     11790



SVM

In [None]:
from sklearn.svm import SVC

# Fit a linear-kernel SVM on the training features and print the same
# classification report as for the other models.
# classification_report comes from the import in the Naive Bayes cell.
# NOTE(review): this cell has no recorded execution count or output. The
# scikit-learn docs state that SVC fit time scales at least quadratically with
# the number of samples and suggest LinearSVC or SGDClassifier for large
# datasets; consider switching if this cell does not finish.
svm_model = SVC(kernel='linear').fit(X_train, Y_train)
y_pred_svm = svm_model.predict(X_test)
print(classification_report(y_true=Y_test, y_pred=y_pred_svm))