<a href="https://colab.research.google.com/github/Hemanth3304/shieldnett/blob/main/shieldnet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

def load_dataset(path):
    """Read a CSV file into a DataFrame and print a short summary of it.

    Args:
        path: Anything ``pd.read_csv`` accepts (file path, URL, or file-like object).

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    frame = pd.read_csv(path)

    # Printed in order: banner, shape, preview header, then the first rows.
    summary = (
        "✅ Dataset Loaded Successfully!",
        f"🔢 Shape: {frame.shape}",
        "🧾 First 5 entries:",
        frame.head(),
    )
    for item in summary:
        print(item)

    return frame



In [None]:
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by clean_text below: the "punkt"/"punkt_tab"
# tokenizer data for nltk.word_tokenize, the "stopwords" corpus, and "wordnet"
# for the WordNet lemmatizer.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt_tab")

# Module-level helpers shared by every clean_text call: a set of English stop
# words for membership tests, and a single lemmatizer instance.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize a raw comment into a space-joined string of lemmatized tokens.

    Steps: lowercase, strip URLs, drop every character that is not an ASCII
    letter or whitespace, tokenize, remove English stop words, lemmatize.

    Args:
        text: The raw comment as a ``str``.

    Returns:
        The cleaned tokens joined by single spaces (may be an empty string).
    """
    text = text.lower()
    # The patterns must use a single backslash inside the raw strings. With a
    # doubled backslash, r"\\S" means "a literal backslash followed by S", so
    # URLs were never removed, and r"[^a-zA-Z\\s]" excluded only letters and the
    # backslash, which deleted all whitespace and fused every comment into one
    # token. These are the same patterns the app.py copy of clean_text uses, so
    # training-time and serving-time preprocessing now agree.
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

def preprocess_dataframe(df, text_column="text"):
    """Clean one text column of ``df`` in place with ``clean_text``.

    Args:
        df: DataFrame holding the raw text; it is modified, not copied.
        text_column: Name of the column to clean.

    Returns:
        The same DataFrame object, with ``text_column`` replaced by cleaned text.
    """
    # Cast first so non-string cells (e.g. NaN) reach clean_text as strings.
    raw_as_str = df[text_column].astype(str)
    df[text_column] = raw_as_str.apply(clean_text)
    return df

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

def extract_features(df, text_column="text", label_column="label", method="tfidf",
                     max_features=5000, test_size=0.2, random_state=42):
    """Vectorize a text column and split the result into train and test sets.

    Note that the vectorizer is fitted on the whole column *before* the split,
    so the vocabulary (and, for TF-IDF, the IDF weights) are learned from test
    rows as well as train rows.

    Args:
        df: DataFrame containing the text and label columns.
        text_column: Name of the column with the (already cleaned) text.
        label_column: Name of the column with the target labels.
        method: ``"tfidf"`` for ``TfidfVectorizer`` or ``"bow"`` for
            ``CountVectorizer``.
        max_features: Vocabulary size cap passed to the vectorizer.
        test_size: Held-out fraction passed to ``train_test_split``.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test, vectorizer)`` where ``vectorizer``
        is the fitted vectorizer instance.

    Raises:
        ValueError: If ``method`` is neither ``"tfidf"`` nor ``"bow"``.
    """
    X = df[text_column]
    y = df[label_column]

    vectorizer_classes = {"tfidf": TfidfVectorizer, "bow": CountVectorizer}
    if method not in vectorizer_classes:
        raise ValueError("Invalid method: choose 'tfidf' or 'bow'")
    vectorizer = vectorizer_classes[method](max_features=max_features)

    X_vectorized = vectorizer.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X_vectorized, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test, vectorizer


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

def train_models(X_train, y_train, random_state=42):
    """Fit four baseline classifiers on the same training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        random_state: Seed given to the estimators that use randomness (the
            random forest, and the SVC's probability calibration), so repeated
            runs produce the same models and metrics.

    Returns:
        Dict mapping a model name to its fitted estimator, with keys
        ``"LogisticRegression"``, ``"SVM"``, ``"RandomForest"`` and
        ``"NaiveBayes"``.
    """
    models = {
        "LogisticRegression": LogisticRegression(max_iter=1000),
        "SVM": SVC(kernel='linear', probability=True, random_state=random_state),
        "RandomForest": RandomForestClassifier(random_state=random_state),
        "NaiveBayes": MultinomialNB()
    }

    for name, model in models.items():
        model.fit(X_train, y_train)
        print(f"✅ Trained {name}")

    return models


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_model(model, X_test, y_test):
    """Print accuracy, precision, recall, F1 and a classification report.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.
    """
    predictions = model.predict(X_test)
    print("🧪 Evaluation Results:")

    # Each metric is computed and printed in turn, with its sklearn defaults.
    headline_metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for label, metric in headline_metrics:
        print(label, metric(y_test, predictions))

    print("\nDetailed Classification Report:")
    print(classification_report(y_test, predictions))


In [None]:
# Load and preprocess data
df = load_dataset("/content/Suspicious Communication on Social Platforms.csv")
df = preprocess_dataframe(df, text_column="comments")

# Extract features. Keep the fitted vectorizer (instead of discarding it as `_`):
# a later cell saves `vectorizer` next to the RandomForest model, and that cell
# raises NameError on a fresh kernel if the name was never bound here.
X_train, X_test, y_train, y_test, vectorizer = extract_features(df, text_column="comments", label_column="tagging", method="tfidf")

# Train models
models = train_models(X_train, y_train)

# Evaluate models
for name, model in models.items():
    print(f"\n📊 Evaluating {name}")
    evaluate_model(model, X_test, y_test)

✅ Dataset Loaded Successfully!
🔢 Shape: (20001, 2)
🧾 First 5 entries:
                                            comments  tagging
0                             Get fucking real dude.        1
1   She is as dirty as they come  and that crook ...        1
2   why did you fuck it up. I could do it all day...        1
3   Dude they dont finish enclosing the fucking s...        1
4   WTF are you talking about Men? No men thats n...        1
✅ Trained LogisticRegression
✅ Trained SVM
✅ Trained RandomForest
✅ Trained NaiveBayes

📊 Evaluating LogisticRegression
🧪 Evaluation Results:
Accuracy: 0.8517870532366908
Precision: 0.9939759036144579
Recall: 0.6277742549143944
F1 Score: 0.769529731830548

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89      2424
           1       0.99      0.63      0.77      1577

    accuracy                           0.85      4001
   macro avg       0.90      0.81      0.83      40

In [None]:
import joblib
# Persist the trained RandomForest and the vectorizer under the two filenames
# that app.py loads with joblib.load.
# NOTE(review): `vectorizer` must already be bound by an earlier cell when this
# runs — confirm with Restart Kernel -> Run All.
joblib.dump(models["RandomForest"], "random_forest_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']

In [None]:
from sklearn.svm import LinearSVC
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer and a LinearSVC on the full (already cleaned) dataset.
# No rows are held out here, so this model is not evaluated in this cell.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['comments'])
y = df['tagging']

model = LinearSVC().fit(X, y)

# Pickle the classifier first, then the vectorizer, each to its own file.
for artifact, filename in (
    (model, "cyberbully_model.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
):
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)

print("✅ Model and vectorizer saved!")

✅ Model and vectorizer saved!


In [None]:
def predict_text(model, vectorizer, input_text):
    """Classify a single raw sentence.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        input_text: Raw, uncleaned text.

    Returns:
        ``"Cyberbullying"`` when the predicted label equals 1, otherwise
        ``"Non-Cyberbullying"``.
    """
    # Apply the same cleaning as training before vectorizing the single sample.
    features = vectorizer.transform([clean_text(input_text)])
    label = model.predict(features)[0]
    if label == 1:
        return "Cyberbullying"
    return "Non-Cyberbullying"

To mount your Google Drive, run `from google.colab import drive; drive.mount('/content/drive')` in a code cell and follow the instructions:

Once your Drive is mounted, you can access your files using the path `/content/drive/My Drive/your_folder/your_file.csv`. Please replace `your_folder/your_file.csv` with the actual path to your dataset file.

In [None]:
# Interactive smoke test: read one sentence from the user and print its label.
# Uses whichever `model` and `vectorizer` are currently bound in the kernel.
text = input("Enter a sentence to test: ")
print(predict_text(model, vectorizer, text))


Enter a sentence to test: good
Non-Cyberbullying


In [None]:
# Run feature extraction with the default method ("tfidf"). This rebinds the
# train/test splits and the global `vectorizer` to the objects returned here.
X_train, X_test, y_train, y_test, vectorizer = extract_features(df, text_column='comments', label_column='tagging')

In [None]:
import joblib

# Save the currently bound vectorizer with joblib under its own filename,
# "vectorizer.pkl" (separate from "tfidf_vectorizer.pkl" written elsewhere).
joblib.dump(vectorizer, "vectorizer.pkl")
print("✅ Vectorizer saved as 'vectorizer.pkl'")


✅ Vectorizer saved as 'vectorizer.pkl'


In [None]:

import joblib

def evaluate_all_models(models, X_test, y_test, output_path="best_model.pkl"):
    """Score every model by F1 on the test set, then save and return the best one.

    Args:
        models: Dict mapping a model name to a fitted estimator.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.
        output_path: Where the best model is written with ``joblib.dump``.

    Returns:
        The estimator with the highest F1 score (the first one on ties).

    Raises:
        ValueError: If ``models`` is empty, since there is nothing to select.
    """
    if not models:
        raise ValueError("models must contain at least one fitted estimator")

    best_model = None
    # Start below any achievable F1 so the first model is always selected, even
    # when every score is 0.0. Starting at 0 with a strict ">" left best_model
    # as None in that case, and None was then saved as the "best model".
    best_f1 = float("-inf")
    best_name = ""

    for name, model in models.items():
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        print(f"\n📊 {name} F1 Score: {f1:.4f}")
        if f1 > best_f1:
            best_f1 = f1
            best_model = model
            best_name = name

    print(f"\n🏆 Best Model: {best_name} (F1 Score: {best_f1:.4f})")
    joblib.dump(best_model, output_path)
    print(f"✅ Best model saved as '{output_path}'")

    return best_model



In [None]:
 %%writefile app.py
import streamlit as st
st.title("✅ Hello from Streamlit!")
import joblib
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources clean_text depends on. "punkt_tab" is included to
# match the notebook's own setup cell: app.py runs as a separate process and may
# be deployed where the data has not been downloaded yet, and recent NLTK
# releases load "punkt_tab" for word_tokenize.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Return ``text`` lowercased, stripped of URLs and non-letters, with stop
    words removed and the remaining tokens lemmatized and space-joined."""
    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "", lowered)
    letters_only = re.sub(r"[^a-zA-Z\s]", "", without_urls)

    kept = []
    for token in nltk.word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Load the classifier and the vectorizer saved by the notebook.
model = joblib.load("random_forest_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

st.title("🛡️ Cyberbullying Comment Checker")

comment = st.text_area("💬 Enter your comment:")

if st.button("Check"):
    if not comment.strip():
        # Nothing but whitespace was entered.
        st.warning("Please enter a comment.")
    else:
        # Clean and vectorize the single comment, then classify it.
        features = vectorizer.transform([clean_text(comment)])
        is_bullying = model.predict(features)[0] == 1

        if is_bullying:
            st.error("🚫 This comment appears to be bullying. It cannot be posted.")
        else:
            st.success("✅ This comment is safe to post.")

Overwriting app.py


In [None]:
# Install the packages used to serve app.py (streamlit) and to tunnel to it (pyngrok).
# NOTE(review): versions are unpinned, so a re-run may install different releases.
!pip install streamlit pyngrok




In [None]:
!ngrok config add-authtoken  2zVg8VeaXkDIFhdrMD7VmcDoMyF_84cEV62jevg8r78YsK2GX


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
# Start the Streamlit app as a background process; `&>` redirects both stdout
# and stderr to /content/logs.txt so the cell returns immediately.
!streamlit run app.py &>/content/logs.txt &


In [None]:
# Start the Streamlit app in the background with its output shown in this cell.
# NOTE(review): the previous cell also launches app.py in the background —
# confirm that running both launches is intended.
!streamlit run app.py &



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.233.224.198:8501[0m
[0m
[34m  Stopping...[0m


In [None]:
 from pyngrok import ngrok

# Open an ngrok tunnel to local port 8501 and print the resulting tunnel object.
# The port must match the one the Streamlit server is actually listening on.
public_url = ngrok.connect(8501)
print("🌐 Streamlit is live at:", public_url)



🌐 Streamlit is live at: NgrokTunnel: "https://2ffe-35-233-224-198.ngrok-free.app" -> "http://localhost:8501"


In [None]:
# Clean-up: kill every running ngrok process (pkill matches by process name).
!pkill ngrok

# To verify that no ngrok processes remain, run:
# !pgrep ngrok