**Social Media Trolls Identification using ML**

In [1]:
import os
import textstat
import string
import chardet
import requests
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from bs4 import BeautifulSoup


**After importing the libraries, I picked a random blog website to extract text from.**

In [2]:
# Single-element list holding the address of the blog page picked for extraction.
url = ["https://newbreak.church/holy-spirit-guidance/"]

**Now create a function that extracts the text of that website.**


In [3]:
def extract_text_from_url(url, timeout=10):
    """Download a web page and return the text BeautifulSoup extracts from it.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on an unresponsive
            host.

    Returns:
        The result of ``BeautifulSoup.get_text()`` for the page, or ``None``
        when the request fails (connection error, timeout or HTTP error
        status). Failures are reported on stdout.
    """
    # Send a browser-like User-Agent header with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    with requests.Session() as session:
        try:
            response = session.get(url, headers=headers, timeout=timeout)
            # Turn 4xx/5xx responses into exceptions so they are handled below.
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching content from {url}: {e}")
            return None
        else:
            soup = BeautifulSoup(response.content, 'html.parser')
            return soup.get_text()

**Next, save the extracted text to a text file.**

In [4]:
def save_text_to_file(text, filename):
    """Write ``text`` to ``filename`` as UTF-8, replacing any existing content.

    Args:
        text: String to store.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, mode='w', encoding='utf-8') as output:
        output.write(text)

**Preprocess the data from the text file.**

In [5]:
def process_file(file_path):
    """Read a text file, tokenize it and compute its scores and features.

    The file's encoding is guessed with ``chardet`` before the text is
    decoded, split into word and sentence tokens, and passed to
    ``calculate_scores_and_features``.

    Args:
        file_path: Path of the text file to analyse.

    Returns:
        Whatever ``calculate_scores_and_features`` returns for the file's
        tokens and sentences, or ``None`` (after printing a message) when the
        file does not exist.
    """
    try:
        with open(file_path, 'rb') as file:
            detected = chardet.detect(file.read())['encoding']

        # chardet reports None when it cannot make a guess (e.g. an empty
        # file); fall back to UTF-8 instead of the platform default encoding.
        encoding = detected or 'utf-8'

        # Detection is only a guess, so undecodable bytes are replaced rather
        # than aborting the whole analysis with a UnicodeDecodeError.
        with open(file_path, 'r', encoding=encoding, errors='replace') as file:
            text = file.read()
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None

    tokens = word_tokenize(text)
    sentences = sent_tokenize(text)

    return calculate_scores_and_features(tokens, sentences)


**Compute scores and features to improve the analysis results.**

In [6]:
def calculate_scores_and_features(tokens, sentences):
    """Compute sentiment scores and readability features for tokenized text.

    Sentiment counts are taken against the module-level ``positive_stopwords``
    and ``negative_stopwords`` word lists; readability figures come from
    ``textstat`` applied to the tokens joined by single spaces.

    Args:
        tokens: Sequence of word tokens of the text.
        sentences: Sequence of sentence strings of the text.

    Returns:
        A 13-tuple ``(positive_score, negative_score, polarity_score,
        subjectivity_score, avg_sentence_length, difficult_words, fog_index,
        avg_words_per_sentence, complex_word_count, word_count,
        syllables_per_word, personal_pronouns, avg_word_length)``.
        ``difficult_words`` and ``complex_word_count`` are the same
        ``textstat.difficult_words`` count. Every element is zero when
        ``tokens`` is empty, and the per-sentence averages are zero when
        ``sentences`` is empty.
    """
    word_count = len(tokens)
    if word_count == 0:
        # Nothing to analyse: return an all-zero result of the usual shape
        # instead of dividing by zero further down.
        return 0, 0, 0.0, 0.0, 0.0, 0, 0.0, 0.0, 0, 0, 0.0, 0, 0.0

    # Build sets once so each token lookup is O(1) instead of a list scan.
    positive_words = set(positive_stopwords)
    negative_words = set(negative_stopwords)
    positive_score = sum(1 for word in tokens if word in positive_words)
    negative_score = sum(1 for word in tokens if word in negative_words)

    # The small epsilon keeps the ratios defined when no sentiment word occurs.
    sentiment_total = positive_score + negative_score
    polarity_score = (positive_score - negative_score) / (sentiment_total + 0.000001)
    subjectivity_score = sentiment_total / (word_count + 0.000001)

    sentence_count = len(sentences)
    if sentence_count:
        # Sentence length is measured in words per sentence.
        avg_sentence_length = (
            sum(len(word_tokenize(sentence)) for sentence in sentences) / sentence_count
        )
        avg_words_per_sentence = word_count / sentence_count
    else:
        avg_sentence_length = 0.0
        avg_words_per_sentence = 0.0

    # textstat works on plain text, so rebuild it once from the tokens.
    joined_text = " ".join(tokens)
    difficult_words = textstat.difficult_words(joined_text)
    fog_index = textstat.gunning_fog(joined_text)
    syllables_per_word = textstat.syllable_count(joined_text) / word_count

    first_person = {'i', 'me', 'my', 'mine', 'myself'}
    personal_pronouns = sum(1 for word in tokens if word.lower() in first_person)
    avg_word_length = sum(len(word) for word in tokens) / word_count

    return positive_score, negative_score, polarity_score, subjectivity_score, avg_sentence_length, \
        difficult_words, fog_index, avg_words_per_sentence, difficult_words, \
        word_count, syllables_per_word, personal_pronouns, avg_word_length


**We downloaded a word dictionary for reference and use it as the stop-word lists.**

In [7]:
def read_stopwords(file_path):
    """Load a whitespace-separated word list from a text file.

    The file is opened in text mode with the platform's default encoding.

    Args:
        file_path: Path of the word-list file.

    Returns:
        A list with every whitespace-delimited entry of the file, in file
        order (duplicates are kept).
    """
    with open(file_path, 'r') as handle:
        contents = handle.read()
    return contents.split()

# Locations of the reference word lists (absolute paths; file names are kept
# exactly as spelled on disk).
POSITIVE_WORDS_PATH = 'C:/Users/ADMIN/Desktop/internship/words/positivew.txt'
NEGATIVE_WORDS_PATH = 'C:/Users/ADMIN/Desktop/internship/words/negetivew.txt'

# Loaded once at module level; calculate_scores_and_features reads these names.
positive_stopwords = read_stopwords(POSITIVE_WORDS_PATH)
negative_stopwords = read_stopwords(NEGATIVE_WORDS_PATH)

**Create a main function that produces the analysis results for a particular website.**

In [8]:
def _url_to_filename(url_item):
    """Derive a file-system-safe ``.txt`` file name from a URL."""
    stem = url_item.strip('/').replace('://', '_').replace('/', '_')
    # URLs may contain characters such as '?', ':', '&' or '*' that are not
    # valid in file names on every platform; map anything unusual to '_'.
    safe_stem = ''.join(ch if ch.isalnum() or ch in '._-' else '_' for ch in stem)
    return f"{safe_stem}.txt"


def main(urls=None, output_path='sentiment_analysis_results_with_features.xlsx'):
    """Analyse web pages and write their scores and features to an Excel file.

    For each URL the page text is downloaded, saved as a ``.txt`` file in the
    current working directory, and processed by ``process_file``. URLs that
    cannot be fetched or processed are skipped.

    Args:
        urls: Iterable of page addresses to analyse. Defaults to the single
            blog page used throughout this notebook.
        output_path: Destination of the Excel workbook with one row per
            successfully analysed URL.
    """
    if urls is None:
        urls = ["https://newbreak.church/holy-spirit-guidance/"]

    # Column order: 'URL' first, then the 13 values in the order returned by
    # process_file.
    columns = ['URL', 'Positive', 'Negative', 'Polarity', 'Subjectivity',
               'Avg Sentence Length', 'Difficult Words', 'Fog Index',
               'Avg Number of Words per Sentence', 'Complex Word Count', 'Word Count',
               'Syllables per Word', 'Personal Pronouns', 'Avg Word Length']
    data = {column: [] for column in columns}

    for url_item in urls:
        text = extract_text_from_url(url_item)
        if text is None:
            continue

        filename = os.path.join(os.getcwd(), _url_to_filename(url_item))
        save_text_to_file(text, filename)
        scores = process_file(filename)
        if scores is None:
            continue

        for column, value in zip(columns, (url_item, *scores)):
            data[column].append(value)

    df = pd.DataFrame(data)
    df.to_excel(output_path, index=False)


if __name__ == "__main__":
    main()

