In [None]:
import torch
import streamlit as st
import joblib
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import string
from nltk.corpus import stopwords
import spacy
import emoji
from ntscraper import Nitter
import pandas as pd
import googleapiclient.discovery
from langdetect import detect
import warnings
import subprocess
import sys
import tensorflow as tf

# Blanket filter: every Python warning raised after this point is ignored.
# NOTE(review): presumably meant to hide library deprecation noise in the app
# output, but it also hides warnings that could matter - consider narrowing it.
warnings.filterwarnings('ignore')


In [None]:
# Setup Streamlit background
# Raw CSS injected into the page: a remote image is used as the background of
# the `.stApp` container and scaled to cover it.
_BACKGROUND_CSS = """
    <style>
    .stApp {
        background-image: url('https://images.rawpixel.com/image_800/czNmcy1wcml2YXRlL3Jhd3BpeGVsX2ltYWdlcy93ZWJzaXRlX2NvbnRlbnQvbHIvdjU0NmJhdGNoMy1teW50LTM0LWJhZGdld2F0ZXJjb2xvcl8xLmpwZw.jpg');
        background-size: cover;
    }
    </style>
    """

# unsafe_allow_html is required so Streamlit renders the <style> tag instead
# of escaping it.
st.markdown(_BACKGROUND_CSS, unsafe_allow_html=True)


In [None]:
# Module-level Nitter scraper; main() calls scraper.get_tweets() on it for the
# "Tweets Analysis" page.
# NOTE(review): constructed at import time, so it is rebuilt whenever this
# module is re-executed - confirm whether that is acceptable for the app.
scraper = Nitter()

# Define chat abbreviations and text processing
chat_words = {
    "lol": "laugh out loud",
    "brb": "be right back",
    "ttyl": "talk to you later",
    "gtg": "got to go",
    "btw": "by the way",
    "omg": "oh my god",
    "idk": "i don't know",
}


def chat_convo(text):
    """Expand known chat abbreviations found in ``text``.

    The text is split on whitespace; leading and trailing punctuation is
    stripped from every word (whether or not it is an abbreviation), and any
    word whose lowercase form is a key of ``chat_words`` is replaced by its
    expansion.

    Args:
        text: Input string.

    Returns:
        str: The processed words joined by single spaces.
    """
    def _expand(raw_word):
        bare = raw_word.strip(string.punctuation)
        # Lookup is case-insensitive; unmatched words keep their original case.
        return chat_words.get(bare.lower(), bare)

    return " ".join(_expand(piece) for piece in text.split())

def demojize_text(text):
    """Return ``text`` after passing it through ``emoji.demojize``.

    Args:
        text: Input string, possibly containing emoji characters.

    Returns:
        Whatever ``emoji.demojize`` returns for ``text`` (called with its
        default settings).
    """
    demojized = emoji.demojize(text)
    return demojized

def preprocess_text(text):
    """Normalize raw social-media text and split it into spaCy tokens.

    Steps, in order: strip HTML-like tags, replace emoji with their textual
    names, lowercase, drop every character that is not an ASCII letter, digit
    or whitespace, remove English stopwords, expand chat abbreviations via
    ``chat_convo``, and tokenize with the spaCy English pipeline.

    Args:
        text: Raw input string.

    Returns:
        list[str]: The ``.text`` of every token produced by the spaCy pipeline.
    """
    # The pipeline is shared through a module-level name; declare it global so
    # it can be created here on first use if nothing has defined it yet.
    global nlp

    text = re.sub(r'<.*?>', '', text)
    # Emoji have to be converted BEFORE the alphanumeric filter below: that
    # filter deletes every emoji character, which previously left the demojize
    # step with nothing to do. Space delimiters keep each emoji name a
    # separate word instead of gluing it to its neighbours.
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = text.lower()
    # Also removes all ASCII punctuation, so no separate punctuation pass is
    # needed afterwards.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    stop_words = set(stopwords.words('english'))
    kept_words = [word for word in text.split() if word not in stop_words]

    # NOTE(review): abbreviations are expanded after stopword removal, so the
    # expansions can reintroduce stopwords and an apostrophe ("i don't know").
    # Order kept as in the original - confirm whether that is intended.
    text = chat_convo(' '.join(kept_words))

    try:
        pipeline = nlp
    except NameError:
        # No module-level pipeline exists yet: load it once and cache it.
        pipeline = nlp = spacy.load("en_core_web_sm")

    return [token.text for token in pipeline(text)]


In [None]:
# Function to load the model and tokenizer
def load_model_and_tokenizer(model_dir=None):
    """Load the sequence-classification model and its tokenizer.

    Args:
        model_dir: Location passed to ``from_pretrained`` for both the model
            and the tokenizer. When omitted, the ``HATE_SPEECH_MODEL_DIR``
            environment variable is used if set; otherwise the original
            developer-machine path is used, so existing callers behave as
            before.

    Returns:
        tuple: ``(model, tokenizer)`` on success. On any failure the error is
        shown with ``st.error`` and ``(None, None)`` is returned.
    """
    # Local import: the notebook's top import cell does not import os.
    import os

    # Kept only as a last-resort fallback for backward compatibility; an
    # absolute path like this only exists on the original author's machine.
    default_dir = r"C:\Users\acer\Desktop\MegaProj\bertweet_sentiment_model"
    model_dir = model_dir or os.environ.get("HATE_SPEECH_MODEL_DIR") or default_dir

    try:
        model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
        return model, tokenizer
    except Exception as e:
        st.error(f"Error loading model/tokenizer: {str(e)}")
        return None, None

In [None]:
# Function to make predictions using the model
def predict(text, model, tokenizer):
    """Classify ``text`` and return the index of the highest-scoring class.

    Args:
        text: Text to classify.
        model: Callable model whose output exposes ``.logits``; may be None.
        tokenizer: Callable tokenizer returning keyword inputs for ``model``;
            may be None.

    Returns:
        int: The argmax class index, or -1 when ``model`` or ``tokenizer`` is
        None.
    """
    missing_component = model is None or tokenizer is None
    if missing_component:
        return -1

    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded).logits
        class_probs = torch.nn.functional.softmax(logits, dim=-1)

    return torch.argmax(class_probs, dim=1).item()

# Function to extract video ID from YouTube URL
def extract_video_id(url):
    """Extract the 11-character video ID from a YouTube URL or bare ID.

    Recognised forms include ``youtube.com/watch?v=ID`` (with or without
    further query parameters), ``youtu.be/ID``, ``/embed/ID``, ``/v/ID``,
    ``/shorts/ID``, ``/live/ID`` and a bare 11-character ID.

    Args:
        url: The URL or ID as typed by the user; surrounding whitespace is
            ignored.

    Returns:
        str | None: The video ID, or None when nothing that looks like an ID
        is found (including empty or non-string input).
    """
    if not url or not isinstance(url, str):
        return None
    url = url.strip()

    patterns = [
        r'(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?|shorts|live)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})',
        r'^([^"&?\/\s]{11})$'
    ]

    # Match against the full URL first: the ID of a standard watch link lives
    # in the query string (?v=...), so discarding the query up front made
    # those links unparseable. The query-stripped form is still tried
    # afterwards so inputs such as "<id>?feature=share" keep working.
    candidates = (url, url.split('?')[0])
    for candidate in candidates:
        for pattern in patterns:
            match = re.search(pattern, candidate)
            if match:
                return match.group(1)
    return None  # No valid ID found

# Function to scrape YouTube comments
def scrape_youtube_comments(video_id, language='en', max_comments=100, api_key=None):
    """Collect top-level comments of a YouTube video in a given language.

    Pages through the video's comment threads and keeps comments whose
    detected language equals ``language`` until ``max_comments`` have been
    collected or there are no more pages.

    Args:
        video_id: ID of the video whose comments are fetched.
        language: Language code compared against the result of ``detect``.
        max_comments: Maximum number of comments to return (default 100, the
            previous fixed limit).
        api_key: YouTube Data API key. When omitted, the ``YOUTUBE_API_KEY``
            environment variable is used.

    Returns:
        list[str]: Matching comment texts. An empty list is returned (after
        reporting via ``st.error``) when no API key is configured or any
        error occurs while fetching.
    """
    # Local import: the notebook's top import cell does not import os.
    import os

    try:
        # Credentials must not live in source code. NOTE(review): the key that
        # used to be embedded here should be treated as exposed and rotated.
        developer_key = (api_key or os.environ.get("YOUTUBE_API_KEY", "")).strip()
        if not developer_key:
            st.error(
                "Error scraping YouTube comments: no API key configured. "
                "Set the YOUTUBE_API_KEY environment variable."
            )
            return []

        youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=developer_key)
        comments = []
        next_page_token = None

        while len(comments) < max_comments:
            response = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=100,
                pageToken=next_page_token
            ).execute()

            for item in response.get('items', []):
                comment = item['snippet']['topLevelComment']['snippet']['textOriginal']
                try:
                    matches_language = detect(comment) == language
                except Exception:
                    # Language detection can fail on text with nothing to
                    # analyse (e.g. emoji-only comments); skip those comments
                    # rather than aborting the whole scrape.
                    continue
                if matches_language:
                    comments.append(comment)
                    if len(comments) >= max_comments:
                        break

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

        return comments
    except Exception as e:
        st.error(f"Error scraping YouTube comments: {str(e)}")
        return []

# Install spaCy model if necessary
def install_spacy_model():
    """Make sure the spaCy ``en_core_web_sm`` model can be loaded.

    If loading raises ``OSError``, the model is downloaded by running
    ``python -m spacy download`` with the current interpreter. When that
    command fails, manual instructions are shown and the Streamlit script is
    stopped.
    """
    import spacy

    try:
        spacy.load("en_core_web_sm")
        return  # Model already available, nothing to install.
    except OSError:
        st.warning("Installing spaCy English model...")

    download_cmd = [sys.executable, "-m", "spacy", "download", "en_core_web_sm"]
    try:
        # check=True turns a non-zero exit status into CalledProcessError.
        subprocess.run(download_cmd, check=True)
    except subprocess.CalledProcessError:
        st.error("Failed to install. Please run manually:")
        st.code("python -m spacy download en_core_web_sm")
        st.stop()
    else:
        st.success("Model installed successfully!")

# Main function to handle Streamlit interface
def _render_home(model, tokenizer):
    """Home page: classify a single piece of user-entered text."""
    st.title("Detection of Hate Speech on Social Media")
    user_input = st.text_area("Enter text here to check if it is Hate Speech")
    if not st.button("Check for Hate Text"):
        return
    if user_input.strip() == "":
        st.warning("Please enter some text to analyze.")
        return

    prediction = predict(user_input, model, tokenizer)
    if prediction == -1:
        st.error("Model not loaded properly.")
    elif prediction == 0:
        st.success("**Unlikely to be Hate Speech**")
    else:
        st.error("**Potential Hate Speech**")


def _render_tweets(model, tokenizer):
    """Tweets page: scrape tweets by term or username and classify each one."""
    st.title("Tweets Prediction for Textual Hate Speech")
    st.write("Enter term or a username to scrape tweets and predict for any potential hate speech")
    # Fixing duplicate widget ID error by adding a unique key
    scrape_option = st.radio("Choose an option", ["Term", "Username"], key="scrape_option")
    input_text = st.text_input("Enter term" if scrape_option == "Term" else "Enter Twitter username")
    if not st.button("Scrape Tweets"):
        return
    if input_text.strip() == "":
        st.warning("Please enter a valid input.")
        return

    mode = 'term' if scrape_option == "Term" else 'user'
    try:
        tweets = scraper.get_tweets(input_text, mode=mode, number=10)
    except Exception:
        # Scraper/network failures are reported through the same message as
        # an empty result instead of crashing the page.
        tweets = None

    tweet_items = (tweets or {}).get('tweets')
    if not tweet_items:
        st.error("Failed to retrieve tweets. Please check the input and try again.")
        return

    tweet_texts = [tweet['text'] for tweet in tweet_items]
    predictions = [predict(text, model, tokenizer) for text in tweet_texts]
    df = pd.DataFrame({'Tweet': tweet_texts, 'Prediction': predictions})
    st.write("Predictions for Tweets:")
    st.write(df)


def _render_youtube(model, tokenizer):
    """YouTube page: scrape a video's comments and classify each one."""
    st.title("YouTube Comments Analysis")
    st.write("Enter a YouTube video link to scrape comments and make predictions")
    vid_input = st.text_area("Enter video link here")
    if not st.button("Scrape comments"):
        return
    if vid_input.strip() == "":
        st.warning("Please enter a valid link")
        return

    video_id = extract_video_id(vid_input)
    if not video_id:
        st.error("Invalid video ID. Please check the YouTube link.")
        return

    comments = scrape_youtube_comments(video_id)
    if not comments:
        st.error("No comments found or error fetching comments.")
        return

    predictions = [predict(comment, model, tokenizer) for comment in comments]
    df = pd.DataFrame({'Comment': comments, 'Prediction': predictions})
    st.write(df)


def _render_file_upload(model, tokenizer):
    """File page: classify every row of the 'text' column of an uploaded CSV."""
    st.title("Upload files for Prediction")
    st.write("Predict hate text from data present in a .csv file")
    uploaded_file = st.file_uploader("Choose a file", type=["csv"])
    if uploaded_file is None:
        return

    try:
        df = pd.read_csv(uploaded_file, usecols=['text'])
    except ValueError as e:
        # Raised when the file is empty/unparseable or has no 'text' column.
        st.error(f"Could not read a 'text' column from the uploaded file: {e}")
        return

    # Empty cells are read as NaN (a float), which the tokenizer cannot
    # handle; drop them and make sure every remaining value is a string.
    df = (
        df.dropna(subset=['text'])
        .assign(text=lambda frame: frame['text'].astype(str))
        .reset_index(drop=True)
    )

    st.write("Preview of the 'text' column from the uploaded file:")
    st.write(df.head())
    df['Prediction'] = [predict(text, model, tokenizer) for text in df['text']]
    st.write("Predictions for uploaded data:")
    st.write(df)


def _render_about():
    """About page: static project description."""
    st.title("Detection of Hate Speech Against LGBT+ on Social Media")
    st.write("""
        [Project description here remains unchanged.]
        """)


def main():
    """Entry point of the Streamlit app.

    Ensures the spaCy model is installed and loadable, loads the classifier
    and tokenizer, then renders the page selected in the sidebar. Every page
    passes the loaded model and tokenizer to ``predict``, which requires
    them as arguments.
    """
    # Publish the pipeline under the module-level name that preprocess_text()
    # reads; as a plain local it was loaded and then never reachable.
    global nlp

    # Ensure necessary models are loaded
    install_spacy_model()

    try:
        nlp = spacy.load("en_core_web_sm")
    except Exception as e:
        st.error(f"Failed to load spaCy model: {str(e)}")
        st.stop()

    model, tokenizer = load_model_and_tokenizer()

    st.sidebar.title("Options")
    option = st.sidebar.radio(
        "Choose an option",
        ["Home", "Tweets Analysis", "YouTube Comments Analysis", "File Upload", "About"],
    )

    if option == "Home":
        _render_home(model, tokenizer)
    elif option == "Tweets Analysis":
        _render_tweets(model, tokenizer)
    elif option == "YouTube Comments Analysis":
        _render_youtube(model, tokenizer)
    elif option == "File Upload":
        _render_file_upload(model, tokenizer)
    elif option == "About":
        _render_about()


if __name__ == "__main__":
    main()

In [None]:

def install_spacy_model():
    """Make sure the spaCy ``en_core_web_sm`` model can be loaded.

    If loading raises ``OSError``, the model is downloaded by running
    ``python -m spacy download`` with the current interpreter. When that
    command fails, manual instructions are shown and the Streamlit script is
    stopped.

    NOTE(review): a function with this name is already defined earlier in the
    notebook; this cell redefines it - confirm whether both are needed.
    """
    import spacy

    model_name = "en_core_web_sm"
    try:
        spacy.load(model_name)
        return  # Model already available, nothing to install.
    except OSError:
        st.warning("Installing spaCy English model...")

    try:
        # check=True turns a non-zero exit status into CalledProcessError.
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", model_name],
            check=True,
        )
    except subprocess.CalledProcessError:
        st.error("Failed to install. Please run manually:")
        st.code("python -m spacy download en_core_web_sm")
        st.stop()
    else:
        st.success("Model installed successfully!")


In [None]:

def main():
    """Home-page-only variant of the app entry point.

    Renders the sidebar and, for the "Home" option, classifies the text the
    user entered. The model and tokenizer are loaded here and passed to
    ``predict``, which requires them as arguments.

    NOTE(review): this redefines ``main`` and so replaces the earlier
    multi-page definition once this cell has run - confirm which one is
    intended to be kept.
    """
    model, tokenizer = load_model_and_tokenizer()

    st.sidebar.title("Options")
    option = st.sidebar.radio(
        "Choose an option",
        ["Home", "Tweets Analysis", "YouTube Comments Analysis", "File Upload", "About"],
    )

    if option != "Home":
        return

    st.title("Detection of Hate Speech on Social Media")
    user_input = st.text_area("Enter text here to check if it is Hate Speech")
    if not st.button("Check for Hate Text"):
        return
    if user_input.strip() == "":
        st.warning("Please enter some text to analyze.")
        return

    prediction = predict(user_input, model, tokenizer)
    if prediction == -1:
        st.error("Model not loaded properly.")
    elif prediction == 0:
        st.success("**Unlikely to be Hate Speech**")
    else:
        st.error("**Potential Hate Speech**")


In [None]:

# install_spacy_model()

# try:
#     nlp = spacy.load("en_core_web_sm")
# except Exception as e:
#     st.error(f"Failed to load spaCy model: {str(e)}")
#     st.stop()

# model, tokenizer = load_model_and_tokenizer()

# if __name__ == "__main__":
#     main()


In [None]:
import nbformat

# Source notebook and the plain-Python script generated from it.
notebook_path = "Hate_Speech_App (Running).ipynb"
output_script_path = "hate_speech_app.py"

with open(notebook_path, "r", encoding="utf-8") as notebook_file:
    notebook = nbformat.read(notebook_file, as_version=4)

# Only code cells are written to the script; other cell types are skipped.
code_cells = [cell for cell in notebook.cells if cell.cell_type == "code"]

with open(output_script_path, "w", encoding="utf-8") as script_file:
    # Each cell's source is followed by a blank line to separate the cells.
    script_file.writelines(cell.source + "\n\n" for cell in code_cells)

print("✅ Streamlit app code saved as 'hate_speech_app.py'")


In [None]:
import os

# Print the current working directory (where the generated script is written)
# followed by the names of the entries it contains.
working_dir = os.getcwd()
print(working_dir)
print(os.listdir())


In [None]:
# The app is not started from inside Jupyter; this cell only prints the shell
# command to run in a terminal.
launch_hint = (
    "Now open a terminal and run this command:",
    "streamlit run hate_speech_app.py",
)
print(*launch_hint, sep="\n")
