# Parsing Social Media APIs into CSVs

### General: 

In [2]:
# Standard library: HTML entity decoding and regular expressions.
import html
import re

# NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays
import numpy as np
# Pandas is a software library written for the Python programming language for data manipulation and analysis.
import pandas as pd


In [3]:
def remove_special_characters(text):
    """
    Remove all special characters from the given text.

    HTML character references (e.g. ``&#39;``, ``&amp;``, ``&quot;``) are
    decoded first so that they are removed as the punctuation they stand for
    instead of leaving residue such as "amp" or "quot" in the result.

    :param text: The input text string.
    :return: Cleaned text string with only ASCII letters, digits and
             whitespace. Non-string input (e.g. NaN or None) yields ''.
    """
    if not isinstance(text, str):  # e.g. NaN from pandas, or None
        return ''
    # Decode every HTML entity, not just the apostrophe reference.
    decoded = html.unescape(text)
    # Anything that is not an ASCII letter, digit or whitespace is dropped.
    # This also removes stray U+FFFD replacement characters, so they need no
    # separate handling.
    return re.sub(r'[^A-Za-z0-9\s]+', '', decoded)


In [4]:

# Keywords whose presence marks a comment as cyberbullying (extend as needed).
cyberbullying_keywords = [
    "loser",
    "idiot",
    "stupid",
    "ugly",
    "kill",
    "dumb",
    "hate",
    "trash",
    "fat",
    "moron",
    "freak",
    "retard",
    "bitch",
]

# Maps common text-speak abbreviations to their expanded form.
# Insertion order is the order in which the abbreviations are applied.
text_speak_dict = {
    # pronouns
    "u": "you", "ur": "your",
    # reactions
    "lol": "laugh out loud", "omg": "oh my god",
    "lmao": "laughing my ass off", "rofl": "rolling on the floor laughing",
    # discourse markers
    "btw": "by the way", "tbh": "to be honest",
    "idk": "i don't know", "smh": "shaking my head",
    # misc
    "gr8": "great", "wtf": "what the f***",
    "imo": "in my opinion", "fml": "f*** my life",
    # Add more as needed
}


In [5]:

# Function to expand text speak using regex
def expand_text_speak(text, abbreviations=None):
    """
    Replace text speak abbreviations with their full forms using regex.

    Abbreviations are matched case-insensitively as whole words and are
    applied one after another in the mapping's iteration order.

    :param text: The input text string.
    :param abbreviations: Optional mapping of abbreviation -> full form.
                          Defaults to the module-level ``text_speak_dict``.
    :return: Text with expanded abbreviations. Non-string input yields ''.
    """
    if not isinstance(text, str):  # e.g. NaN from pandas, or None
        return ''
    if abbreviations is None:
        abbreviations = text_speak_dict
    for abbrev, full_form in abbreviations.items():
        # Escape the key so regex metacharacters in an abbreviation (".", "+",
        # "?") are matched literally rather than interpreted as a pattern.
        pattern = rf'\b{re.escape(abbrev)}\b'
        # A callable replacement inserts the expansion verbatim; a plain string
        # would be parsed as a template, so a backslash in it would either
        # raise or be read as a group reference.
        text = re.sub(pattern, lambda _match: full_form, text, flags=re.IGNORECASE)
    return text


In [6]:

# Function to label comments as "Cyberbullying" or "Not Cyberbullying"
def label_comment(text):
    """
    Classify a comment as "Cyberbullying" or "Not Cyberbullying".

    The text is stripped of special characters, text-speak abbreviations are
    expanded, and the lower-cased result is scanned for every entry of
    ``cyberbullying_keywords``.

    NOTE(review): matching is by plain substring, so a keyword embedded in a
    longer word also counts (e.g. "skill" contains "kill").

    :param text: The raw comment text.
    :return: "Cyberbullying" if any keyword occurs, else "Not Cyberbullying".
    """
    normalized = expand_text_speak(remove_special_characters(text)).lower()
    for keyword in cyberbullying_keywords:
        if keyword in normalized:
            return "Cyberbullying"
    return "Not Cyberbullying"


In [7]:

# Function to read comments from CSV, label them, and save to a new CSV
def label_comments_from_csv(input_csv: str, output_csv: str) -> None:
    """
    Read comments from a CSV, label them, and save the labeled data to a new CSV.

    The input must contain exactly one column (its header name is ignored).
    The output has two columns: ``Comment`` (the comment with special
    characters removed) and ``Label`` (the result of ``label_comment`` on the
    original comment).

    :param input_csv: Path to the input CSV file containing comments.
    :param output_csv: Path to the output CSV file for labeled comments.
    :raises ValueError: If the CSV has no data rows, or does not have exactly
                        one column.
    """
    # Read every cell as text. With pandas' default parsing, comments such as
    # "NA" or "null" become NaN and an all-numeric column becomes numbers;
    # both are non-strings and would be silently blanked by the cleaners.
    df = pd.read_csv(input_csv, dtype=str, keep_default_na=False)

    if df.empty:
        raise ValueError("csv is empty")

    # Fail with an actionable message instead of pandas' generic
    # "Length mismatch" error when the file has more than one column.
    if df.shape[1] != 1:
        raise ValueError(
            f"expected exactly one comment column in {input_csv}, "
            f"found {df.shape[1]}: {list(df.columns)}"
        )

    raw_comments = df.iloc[:, 0]
    # Labels are computed from the raw text; the stored comment is the cleaned one.
    labeled = pd.DataFrame({
        'Comment': raw_comments.apply(remove_special_characters),
        'Label': raw_comments.apply(label_comment),
    })

    # Save the labeled data to a new CSV file
    labeled.to_csv(output_csv, index=False)
    print(f"Labeled comments saved to {output_csv}")


In [8]:
# CSV exports to combine. NOTE(review): these look like placeholder paths, and
# the same file is listed twice -- replace with the real inputs.
csv_files = ["data/CSVName.csv", "data/CSVName.csv"]

# Load every export, then stack them into a single frame with a fresh index.
df_list = [pd.read_csv(file_path) for file_path in csv_files]
merged_df = pd.concat(df_list, ignore_index=True)

# Collapse each row into one string: drop missing cells, cast the rest to str,
# and join them with single spaces.
df_t = merged_df.apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)
df_t.to_csv('merged_output.csv', index=False)
df_t.describe()

FileNotFoundError: [Errno 2] No such file or directory: 'data/CSVName.csv'

In [83]:
# Label the merged comments, then reload the labeled file for inspection.
input_csv_file = 'merged_output.csv'      # existing comments CSV (input)
output_csv_file = 'labeled_comments.csv'  # destination for the labeled comments
label_comments_from_csv(input_csv=input_csv_file, output_csv=output_csv_file)
df = pd.read_csv(output_csv_file)
df.head()

Labeled comments saved to labeled_comments.csv


Unnamed: 0,Comment,Label
0,This might be the most iconic video of 2023,Not Cyberbullying
1,Hey gurl i know you made mistakes gurl but its ok gurl i forgive you because i know that if i made a mistake i would want to be forgiven so gurl i want you to know that Jesus really loves you and that you can come to him as you already are and he can help us all oki love ya gurl for always being funny you have an incredible talent I just want you to know that Jesus cares for you and He wants to get to know u ok,Not Cyberbullying
2,Listen Ive been a fan since miranda sings and I didnt catch that till lateat least you tried to keep contact with your fansIll take a pair of your underwear anyday lmao I dont believe youre one bit racist or a part of the pedophile machine that runs America Stay strong lady seriously I dont understand what it is to be famous and lose so many fans over some stupid shit Im sorry you had to suffer this ridiculous mess And theyll find someone new to tear apart and Ill applaud your come back too,Cyberbullying
3,Guys YN she is a comedian its her job to make videos like this and try r thing with her sending stuff to fans shes a comedian and like tf is she supposed to do idc what the internet is were all humans and if she did do bad stuff its in the future everyone makes mistakes and although people make bigger ones Its sad to think of people who see her that way bc I still love Colleen,Not Cyberbullying
4,Like this if your watching in 2024,Not Cyberbullying


In [1]:
def keyword_presence(text, keywords=None):
    """
    Report which keywords occur in a comment.

    Matching is case-insensitive and by plain substring.

    :param text: The comment text. Non-string values (e.g. the NaN that pandas
                 produces for an empty CSV cell) are treated as empty text.
    :param keywords: Optional iterable of keywords to look for. Defaults to
                     the module-level ``cyberbullying_keywords``.
    :return: Dict mapping each keyword to 1 if it occurs in the text, else 0.
    """
    if keywords is None:
        keywords = cyberbullying_keywords
    # Guard against NaN/None, which have no .lower() and would raise.
    lowered = text.lower() if isinstance(text, str) else ''
    return {keyword: int(keyword in lowered) for keyword in keywords}

# Build one 0/1 indicator column per keyword, aligned to df's index.
# Constructing the frame once from the list of dicts avoids creating a
# pd.Series per row, which is what .apply(pd.Series) does.
keyword_flags = df['Comment'].apply(keyword_presence)
keyword_columns = pd.DataFrame(keyword_flags.tolist(), index=df.index)

# Drop indicator columns left over from a previous run so that re-executing
# this cell does not append duplicate columns, then attach the fresh ones.
df = pd.concat(
    [df.drop(columns=keyword_columns.columns, errors='ignore'), keyword_columns],
    axis=1,
)

NameError: name 'df' is not defined