In [6]:
import pandas as pd
import numpy as np

df = pd.read_csv('df_train.csv')

df.head()

Unnamed: 0,Id,Tweet,following,followers,actions,is_retweet,location,Type
0,10091,It's the everything else that's complicated. #...,0.0,11500.0,,0.0,Chicago,Quality
1,10172,Eren sent a glare towards Mikasa then nodded a...,0.0,0.0,,0.0,,Quality
2,7012,I posted a new photo to Facebook http://fb.me/...,0.0,0.0,,0.0,"Scotland, U.K",Quality
3,3697,#jan Idiot Chelsea Handler Diagnoses Trump Wit...,3319.0,611.0,294.0,0.0,FBBIGBANG&2NE1TH,Spam
4,10740,Pedophile Anthony Weiner is TERRIFIED of Getti...,4840.0,1724.0,1522.0,0.0,www.instagram.com/fender,Spam


In [21]:
# df Preprocessing

# Fill missing values
df['following'] = df['following'].fillna(0)
df['followers'] = df['followers'].fillna(0)
df['actions'] = df['actions'].fillna(0)
df['is_retweet'] = df['is_retweet'].fillna(0)

df['tweet_length'] = df['Tweet'].apply(len)  # Length of the tweet
df['hashtag_count'] = df['Tweet'].apply(lambda x: x.count('#'))  # Number of hashtags
df['mention_count'] = df['Tweet'].apply(lambda x: x.count('@'))  # Number of mentions
df['url_count'] = df['Tweet'].apply(lambda x: x.count('http'))  # Number of URLs
df['capitalized_count'] = df['Tweet'].apply(lambda x: sum(1 for c in x if c.isupper()))  # Capitalized words
df['exclamation_count'] = df['Tweet'].apply(lambda x: x.count('!'))  # Exclamation symbols
df['question_mark_count'] = df['Tweet'].apply(lambda x: x.count('?'))  # Question marks


# Encode the target variable (Type)
df['Type'] = df['Type'].map({'Spam': 1, 'Quality': 0})
print(df.head())

      Id                                              Tweet  following  \
0  10091  It's the everything else that's complicated. #...        0.0   
1  10172  Eren sent a glare towards Mikasa then nodded a...        0.0   
2   7012  I posted a new photo to Facebook http://fb.me/...        0.0   
3   3697  #jan Idiot Chelsea Handler Diagnoses Trump Wit...     3319.0   
4  10740  Pedophile Anthony Weiner is TERRIFIED of Getti...     4840.0   

   followers  actions  is_retweet                  location  Type  \
0    11500.0      0.0         0.0                   Chicago   NaN   
1        0.0      0.0         0.0                       NaN   NaN   
2        0.0      0.0         0.0             Scotland, U.K   NaN   
3      611.0    294.0         0.0          FBBIGBANG&2NE1TH   NaN   
4     1724.0   1522.0         0.0  www.instagram.com/fender   NaN   

   tweet_length  hashtag_count  mention_count  url_count  capitalized_count  \
0            84              2              0          0     

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords

# nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))
# Convert the 'Tweet' col to string and handle non-string entries
df['Tweet'] = df['Tweet'].astype(str)

def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)     # Remove mentions
    text = re.sub(r"#\w+", "", text)     # Remove hashtags
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = " ".join([word for word in text.split() if word not in STOPWORDS])  # Remove stopwords
    return text

df['cleaned_tweet'] = df['Tweet'].astype(str).apply(clean_text)
vectorizer = TfidfVectorizer(max_features=500)  
X_text_feature = vectorizer.fit_transform(df['cleaned_tweet']).toarray()  # Convert to array

# Re-extract features now that all entries are strings
df['tweet_length'] = df['Tweet'].apply(len)  # Length of the tweet
df['hashtag_count'] = df['Tweet'].apply(lambda x: x.count('#'))  
df['mention_count'] = df['Tweet'].apply(lambda x: x.count('@'))  
df['url_count'] = df['Tweet'].apply(lambda x: x.count('http'))  
df['capitalized_count'] = df['Tweet'].apply(lambda x: sum(1 for c in x if c.isupper())) 
df['exclamation_count'] = df['Tweet'].apply(lambda x: x.count('!'))
df['question_mark_count'] = df['Tweet'].apply(lambda x: x.count('?'))  

print(df.head(9))
X_meta_features = df[['following', 'followers', 'actions', 'is_retweet', 'tweet_length', 
                      'hashtag_count', 'mention_count', 'url_count', 
                      'capitalized_count', 'exclamation_count', 'question_mark_count']]

# Step 3: Combine both sets of features (NLP + Metaheuristic)
import numpy as np
X_combined = np.hstack((X_text_feature, X_meta_features.values))
y = df['Type']

# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)


# Display the shape of the training and testing sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

      Id                                              Tweet  following  \
0  10091  It's the everything else that's complicated. #...        0.0   
1  10172  Eren sent a glare towards Mikasa then nodded a...        0.0   
2   7012  I posted a new photo to Facebook http://fb.me/...        0.0   
3   3697  #jan Idiot Chelsea Handler Diagnoses Trump Wit...     3319.0   
4  10740  Pedophile Anthony Weiner is TERRIFIED of Getti...     4840.0   
5   9572  EBMUD ending penalties for excessive water use...     4435.0   
6  10792  Big day.  #WeTheNorth #yyz #thesix #sunset #sk...        0.0   
7  11594  #UPA #scams to the tune of Rs 12 lakh Crore #S...        0.0   
8  12594  **MISSING**\nA male tabby cat has gone missing...    39000.0   

   followers  actions  is_retweet                  location  Type  \
0    11500.0      0.0         0.0                   Chicago   NaN   
1        0.0      0.0         0.0                       NaN   NaN   
2        0.0      0.0         0.0             Scotla

((9574, 511), (2394, 511), (9574,), (2394,))

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Metadata features
X_meta_features = df[['following', 'followers', 'actions', 'is_retweet', 'tweet_length',
                      'hashtag_count', 'mention_count', 'url_count',
                      'capitalized_count', 'exclamation_count', 'question_mark_count']]

# Combine features
X_combined = np.hstack((X_text_feature, X_meta_features.values))
y = df['Type']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9862155388471178

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1198
           1       0.99      0.99      0.99      1196

    accuracy                           0.99      2394
   macro avg       0.99      0.99      0.99      2394
weighted avg       0.99      0.99      0.99      2394


Confusion Matrix:
[[1181   17]
 [  16 1180]]
