Spam Detect Stage 1

In [None]:
import pandas as pd
import re
import emoji
from tqdm import tqdm

# Register tqdm's pandas integration so DataFrame.progress_apply (used in the
# main block below) shows a progress bar with this description.
tqdm.pandas(desc="Generating spam features and initial scores")

# --- File Paths (HARDCODED) ---
# Absolute, machine-specific Windows paths; edit before running elsewhere.
INPUT_CSV = r"C:\Users\User\Desktop\Datathon\comments_videos_cleaned(2).csv"
OUTPUT_CSV = r"C:\Users\User\Desktop\Datathon\spam_output(A).csv"

# --- Define Spam Rules and Constants ---
# Promotional terms looked up by calculate_features_and_spam_prob_A when it
# scores a comment. All entries are lowercase; some are multi-word phrases.
SPAM_KEYWORDS = {
    'subscribe', 'channel', 'follower', 'crypto', 'giveaway', 'win', 'promo',
    'bitcoin', 'sale', 'discount', 'free money', 'check out my', 'link in bio',
    'earn money', 'affiliate', 'deal', 'offer', 'limited time', 'click here',
    'dm me', 'follow me', 'telegram', 'whatsapp'
}
# Terms whose presence in a comment sets the has_contact_words feature.
# All entries are lowercase.
CONTACT_WORDS = {
    'dm', 'contact', 'whatsapp', 'telegram', 'email', 'phone', 'call', 'reach out',
    'social media', 'instagram', 'facebook', 'twitter', 'tiktok', 'snapchat'
}
# Regexes for URL-shortener links (bit.ly, tinyurl.com, goo.gl, t.co); each
# requires at least one non-space character after the host's slash.
SHORTLINK_PATTERNS = [
    r'bit\.ly/\S+', r'tinyurl\.com/\S+', r'goo\.gl/\S+', r't\.co/\S+'
]

def get_emoji_count(text: str) -> int:
    """Count colon-delimited emoji shortcodes (e.g. ``:red_heart:``) in *text*.

    The text is lowercased first, so ``:Smile:`` counts too. A shortcode is
    a run of lowercase letters, digits, underscores, hyphens or spaces
    enclosed in colons. Raw Unicode emoji characters are not counted.

    Args:
        text: The comment text; any non-string value is treated as empty.

    Returns:
        Number of non-overlapping shortcode matches, or 0 for non-strings.
    """
    if isinstance(text, str):
        shortcode_hits = re.finditer(r':[a-z0-9_\- ]+?:', text.lower())
        return sum(1 for _ in shortcode_hits)
    return 0

def calculate_features_and_spam_prob_A(row: pd.Series) -> pd.Series:
    """Derive rule-based spam features and the additive ``spam_prob_A`` score.

    The comment text is read from ``row['textOriginal']``; a missing or
    non-string value is treated as an empty comment. Term matching is
    case-insensitive because the keyword lists are stored in lowercase.

    Score contributions (summed):
        * URL present ............................ +3.0
        * known short-link present ............... +1.5
        * any SPAM_KEYWORDS term (whole words) ... +1.5
        * any CONTACT_WORDS term (substring) ..... +1.0
        * a character repeated 5+ times in a row . +1.0
        * more than 7 emoji shortcodes ........... +1.0
        * non-blank, longer than 5 chars, and
          containing no alphabetic character ..... +1.0

    Args:
        row: One DataFrame row (anything supporting ``.get``) holding a
            ``textOriginal`` entry.

    Returns:
        A Series with keys ``has_url``, ``has_shortlink``,
        ``has_contact_words``, ``emoji_count``, ``char_repeats``,
        ``dup_text`` (always 0 here; the caller fills it in afterwards)
        and ``spam_prob_A``.
    """
    comment_text = row.get('textOriginal', '')
    if not isinstance(comment_text, str):
        comment_text = ''
    # The term lists and URL patterns are lowercase, so match on a
    # lowercased copy; the original text is kept for the repeat check.
    lowered = comment_text.lower()

    has_url = int(bool(re.search(r'http\S+|www\S+|\S+\.\S{2,}/', lowered)))
    has_shortlink = int(
        any(re.search(pattern, lowered) for pattern in SHORTLINK_PATTERNS)
    )
    # Plain substring test, so short terms (e.g. 'dm') also match inside
    # longer words.
    has_contact_words = int(any(term in lowered for term in CONTACT_WORDS))
    emoji_count_feat = get_emoji_count(comment_text)
    # 5+ consecutive occurrences of the same character.
    char_repeats = int(bool(re.search(r'(.)\1{4,}', comment_text)))
    dup_text = 0  # Placeholder; duplicate detection needs the whole dataset.

    # Whole-word / whole-phrase keyword matching: join the tokens with single
    # spaces and pad both ends so multi-word entries such as 'free money'
    # can match while 'win' does not fire inside 'window'.
    tokens = re.findall(r'\b\w+\b', lowered)
    padded = f" {' '.join(tokens)} "
    has_spam_keyword = any(f" {keyword} " in padded for keyword in SPAM_KEYWORDS)

    # Non-blank comments longer than 5 characters with no letters at all
    # (digits/symbols only) are treated as suspicious.
    symbols_only = (
        bool(comment_text.strip())
        and len(comment_text) > 5
        and not any(char.isalpha() for char in comment_text)
    )

    # Spam probability score A
    spam_score_A = 0.0
    if has_url:
        spam_score_A += 3.0
    if has_shortlink:
        spam_score_A += 1.5
    if has_spam_keyword:
        spam_score_A += 1.5
    if has_contact_words:
        spam_score_A += 1.0
    if char_repeats:
        spam_score_A += 1.0
    if emoji_count_feat > 7:
        spam_score_A += 1.0
    if symbols_only:
        spam_score_A += 1.0

    return pd.Series({
        'has_url': has_url,
        'has_shortlink': has_shortlink,
        'has_contact_words': has_contact_words,
        'emoji_count': emoji_count_feat,
        'char_repeats': char_repeats,
        'dup_text': dup_text,  # Placeholder
        'spam_prob_A': spam_score_A
    })

# ==============================================================================
# Main Execution
# ==============================================================================
if __name__ == "__main__":
    # Script entry point: load the comments CSV, stamp the per-row spam
    # features and spam_prob_A, flag duplicated comment texts, print summary
    # counts, and write the enriched table to OUTPUT_CSV. No rows are removed.
    print(f"Loading data from: {INPUT_CSV}")
    try:
        df = pd.read_csv(INPUT_CSV, encoding='latin1')
    except FileNotFoundError:
        print(f"Error: Input file not found at '{INPUT_CSV}'")
        # Non-zero exit status so a calling shell/pipeline sees the failure.
        raise SystemExit(1)
    except Exception as e:
        print(f"Error loading CSV: {e}")
        raise SystemExit(1)

    print("Generating spam features and initial spam probability (spam_prob_A)...")
    feature_df = df.progress_apply(calculate_features_and_spam_prob_A, axis=1)
    df = pd.concat([df, feature_df], axis=1)

    # Handle duplicate detection
    # A comment is a duplicate when its exact text occurs in more than one
    # row. Missing texts are never flagged, and they are not counted as
    # copies of a genuinely empty-string comment.
    comment_texts = df['textOriginal']
    is_duplicate = comment_texts.notna() & comment_texts.duplicated(keep=False)
    df['dup_text'] = is_duplicate.astype('int64')

    # --- Summary Logs ---
    feature_columns = [
        'has_url', 'has_shortlink', 'has_contact_words', 'char_repeats', 'dup_text'
    ]
    print("\n--- Spam Feature Trigger Counts ---")
    for col in feature_columns:
        print(f"{col}: {(df[col] == 1).sum()}")
    print(f"emoji_count (>7): {(df['emoji_count'] > 7).sum()}")

    scored_mask = df['spam_prob_A'] > 0
    print("\n--- Spam Probability Summary ---")
    print(f"Total comments: {len(df)}")
    print(f"Comments with spam_prob_A > 0: {scored_mask.sum()}")
    print(f"Percentage: {100 * scored_mask.mean():.2f}%")
    print("------------------------------------------------------\n")

    print("--- Feature Stamping & Spam Detection Pass A Summary ---")
    print(f"Original number of comments: {len(df)}")
    print("DataFrame enriched with new features and 'spam_prob_A' column.")
    print("No rows removed. First few rows preview:")
    print(df.head())
    print(f"\nColumns added: {list(feature_df.columns)}")
    print("------------------------------------------------------\n")

    # Save output
    df.to_csv(OUTPUT_CSV, index=False, encoding='utf-8')
    print(f"✅ Successfully saved enriched data to: {OUTPUT_CSV}")

Loading data from: C:\Users\User\Desktop\Datathon\comments_videos_cleaned(2).csv
Generating spam features and initial spam probability (spam_prob_A)...


Generating spam features and initial scores: 100%|███████████████████████████| 889782/889782 [04:44<00:00, 3122.22it/s]



--- Spam Feature Trigger Counts ---
has_url: 4263
has_shortlink: 0
has_contact_words: 14538
char_repeats: 16413
dup_text: 157855
emoji_count (>7): 0

--- Spam Probability Summary ---
Total comments: 889782
Comments with spam_prob_A > 0: 38361
Percentage: 4.31%
------------------------------------------------------

--- Feature Stamping & Spam Detection Pass A Summary ---
Original number of comments: 889782
DataFrame enriched with new features and 'spam_prob_A' column.
No rows removed. First few rows preview:
   commentId  channelId  videoId  authorId  \
0    1781382      14492    74288   2032536   
1     289571      14727    79618   3043229   
2     569077       3314    51826    917006   
3    2957962       5008    58298   1853470   
4     673093      21411     1265   2584166   

                           textOriginal  parentCommentId  comment_likeCount  \
0    please lesbian flag beg would rock              NaN                  0   
1  apply mashed potato juice mixed curd        319

Spam Detect Stage 2

In [None]:
import pandas as pd
import re
from tqdm import tqdm

# === CONFIG ===
# Absolute, machine-specific Windows paths; adjust before running elsewhere.
# The input file must already contain the 'textOriginal' and 'spam_prob_A'
# columns, both of which are read further down in this script.
INPUT_CSV  = r"C:\Users\User\Desktop\Datathon\lang_remove.csv"
OUTPUT_CSV = r"C:\Users\User\Desktop\Datathon\spam_final_scaled.csv"

# === 1) Load Data ===
# Executed at module level: the whole CSV is read into memory as `df`.
df = pd.read_csv(INPUT_CSV)

# === 2) Define spam keywords for Spam B ===
# Lowercase promotional terms and phrases counted by compute_spam_raw_b.
SPAM_KEYWORDS_B = {
    'subscribe', 'channel', 'follower', 'crypto', 'giveaway', 'win', 'promo',
    'bitcoin', 'sale', 'discount', 'free money', 'check out my', 'link in bio',
    'earn money', 'affiliate', 'deal', 'offer', 'limited time', 'click here',
    'dm me', 'follow me', 'telegram', 'whatsapp'
}

# === 3) Compute raw spam keyword count for Spam B ===
def compute_spam_raw_b(text):
    """Return how many distinct SPAM_KEYWORDS_B terms occur in *text*.

    Matching is case-insensitive and substring-based, so a term also counts
    when it appears inside a longer word (e.g. 'win' in 'window'). Each term
    contributes at most 1 regardless of how often it occurs.

    Args:
        text: The comment text. Missing values (None/NaN) and any other
            non-string value (e.g. a numeric cell) yield 0 instead of raising.

    Returns:
        The number of matching keywords as an int.
    """
    if not isinstance(text, str):
        return 0
    lowered = text.lower()
    return sum(1 for kw in SPAM_KEYWORDS_B if kw in lowered)

# Count keyword hits per comment, with a progress bar over the column.
tqdm.pandas(desc="Computing Spam Prob B (raw count)")
df["spam_raw_B"] = df["textOriginal"].progress_apply(compute_spam_raw_b)

# === 4) Scale spam_raw_B into spam_prob_B (0–6 range) ===
# The scale is relative to the largest raw count in THIS dataset, so the same
# comment can receive a different spam_prob_B in a different input file.
# NOTE(review): the 0–6 range was chosen to resemble spam_prob_A, but the
# Stage 1 weights in this notebook can sum above 6 — confirm the intended range.
max_raw_b = df["spam_raw_B"].max()
if max_raw_b > 0:
    df["spam_prob_B"] = df["spam_raw_B"] / max_raw_b * 6
else:
    # No keyword hit anywhere (or empty input): keep the column float-typed,
    # matching the scaled branch above.
    df["spam_prob_B"] = 0.0

# === 5) Compute final spam score (average of A & B) ===
# Row-wise mean; pandas skips NaN, so a row missing one score uses the other.
df["spam_score"] = df[["spam_prob_A", "spam_prob_B"]].mean(axis=1)

# === 6) Classify as spam or not (threshold = 1.0) ===
# Vectorized comparison; NaN scores compare False and are labelled 0.
df["final_is_spam"] = (df["spam_score"] >= 1.0).astype("int64")

# === 7) Save result ===
# utf-8-sig writes a BOM at the start of the file.
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")

print(f"✅ Done! Saved scaled spam scores + classification → {OUTPUT_CSV}")
print(df[["textOriginal", "spam_prob_A", "spam_prob_B", "spam_score", "final_is_spam"]].head(10))



Computing Spam Prob B (raw count): 100%|███████████████████████████████████| 623223/623223 [00:04<00:00, 143813.96it/s]


✅ Done! Saved scaled spam scores + classification → C:\Users\User\Desktop\Datathon\spam_final_scaled.csv
                                        textOriginal  spam_prob_A  \
0                 please lesbian flag beg would rock          0.0   
1                            missed calls mars alien          1.0   
2                                               baaa          0.0   
3                look like raven phenomena raven cap          0.0   
4                                           american          0.0   
5  red heart red heart red heart red heart red he...          0.0   
6    love videos thank red heart red heart red heart          0.0   
7                                          red heart          0.0   
8  true skinny jeans always nipped loudly crying ...          0.0   
9                  plz upload vedio teeth transition          0.0   

   spam_prob_B  spam_score  final_is_spam  
0          0.0         0.0              0  
1          0.0         0.5              0  
2  