In [5]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
df = pd.read_csv('data_train.csv')

df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,Id,Tweet,following,followers,actions,is_retweet,location,Type
0,10091,It's the everything else that's complicated. #...,0.0,11500.0,,0.0,Chicago,Quality
1,10172,Eren sent a glare towards Mikasa then nodded a...,0.0,0.0,,0.0,,Quality
2,7012,I posted a new photo to Facebook http://fb.me/...,0.0,0.0,,0.0,"Scotland, U.K",Quality
3,3697,#jan Idiot Chelsea Handler Diagnoses Trump Wit...,3319.0,611.0,294.0,0.0,FBBIGBANG&2NE1TH,Spam
4,10740,Pedophile Anthony Weiner is TERRIFIED of Getti...,4840.0,1724.0,1522.0,0.0,www.instagram.com/fender,Spam


In [6]:
# df Preprocessing

# Fill missing values
df['following'] = df['following'].fillna(0)
df['followers'] = df['followers'].fillna(0)
df['actions'] = df['actions'].fillna(0)
df['is_retweet'] = df['is_retweet'].fillna(0)


# Encode the target variable (Type)
df['Type'] = df['Type'].map({'Spam': 1, 'Quality': 0})
print(df.head())

      Id                                              Tweet  following  \
0  10091  It's the everything else that's complicated. #...        0.0   
1  10172  Eren sent a glare towards Mikasa then nodded a...        0.0   
2   7012  I posted a new photo to Facebook http://fb.me/...        0.0   
3   3697  #jan Idiot Chelsea Handler Diagnoses Trump Wit...     3319.0   
4  10740  Pedophile Anthony Weiner is TERRIFIED of Getti...     4840.0   

   followers  actions  is_retweet                  location  Type  
0    11500.0      0.0         0.0                   Chicago     0  
1        0.0      0.0         0.0                       NaN     0  
2        0.0      0.0         0.0             Scotland, U.K     0  
3      611.0    294.0         0.0          FBBIGBANG&2NE1TH     1  
4     1724.0   1522.0         0.0  www.instagram.com/fender     1  


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords

#nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))
# Convert the 'Tweet' col to string and handle non-string entries
df['Tweet'] = df['Tweet'].astype(str)

# extract features now that all entries are strings
df['tweet_length'] = df['Tweet'].apply(len)  # Length of the tweet
df['hashtag_count'] = df['Tweet'].apply(lambda x: x.count('#'))
df['mention_count'] = df['Tweet'].apply(lambda x: x.count('@'))
df['url_count'] = df['Tweet'].apply(lambda x: x.count('http'))
df['capitalized_count'] = df['Tweet'].apply(lambda x: sum(1 for c in x if c.isupper()))
df['exclamation_count'] = df['Tweet'].apply(lambda x: x.count('!'))
df['question_mark_count'] = df['Tweet'].apply(lambda x: x.count('?'))


def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)     # Remove mentions
    text = re.sub(r"#\w+", "", text)     # Remove hashtags
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = " ".join([word for word in text.split() if word not in STOPWORDS])  # Remove stopwords
    return text

df['cleaned_tweet'] = df['Tweet'].astype(str).apply(clean_text)
vectorizer = TfidfVectorizer(max_features=500)
X_text_feature = vectorizer.fit_transform(df['cleaned_tweet']).toarray()  # Convert to array

print(df.head(9))
df.to_csv('preprocessed_data.csv', index=False)
X_meta_features = df[['following', 'followers', 'actions', 'is_retweet', 'tweet_length',
                      'hashtag_count', 'mention_count', 'url_count',
                      'capitalized_count', 'exclamation_count', 'question_mark_count']]

# Step 3: Combine both sets of features (NLP + Metaheuristic)
X_combined = np.hstack((X_text_feature, X_meta_features.values))
y = df['Type']

# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)


# Display the shape of the training and testing sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

      Id                                              Tweet  following  \
0  10091  It's the everything else that's complicated. #...        0.0   
1  10172  Eren sent a glare towards Mikasa then nodded a...        0.0   
2   7012  I posted a new photo to Facebook http://fb.me/...        0.0   
3   3697  #jan Idiot Chelsea Handler Diagnoses Trump Wit...     3319.0   
4  10740  Pedophile Anthony Weiner is TERRIFIED of Getti...     4840.0   
5   9572  EBMUD ending penalties for excessive water use...     4435.0   
6  10792  Big day.  #WeTheNorth #yyz #thesix #sunset #sk...        0.0   
7  11594  #UPA #scams to the tune of Rs 12 lakh Crore #S...        0.0   
8  12594  **MISSING**\nA male tabby cat has gone missing...    39000.0   

   followers  actions  is_retweet                  location  Type  \
0    11500.0      0.0         0.0                   Chicago     0   
1        0.0      0.0         0.0                       NaN     0   
2        0.0      0.0         0.0             Scotla

((9574, 511), (2394, 511), (9574,), (2394,))

In [9]:
# Whale Optimization Algorithm for feature selection
class WhaleOptimizationAlgorithm:
    def __init__(self, X, y, num_whales=30, max_iter=2):
        self.X = X
        self.y = y
        self.num_whales = num_whales
        self.max_iter = "max_iter"
        self.best_features = None
        self.best_score = 0

    def fitness(self, features):
        # Train a classifier and return the accuracy
        if np.sum(features) == 0:
            return 0  # Avoid empty feature sets
        X_subset = self.X[:, features.astype(bool)]
        X_train, X_test, y_train, y_test = train_test_split(X_subset, self.y, test_size=0.2, random_state=42)
        model = AdaBoostClassifier()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return accuracy_score(y_test, y_pred)

    def optimize(self):
    # Initialize positions of whales
      positions = np.random.rand(self.num_whales, self.X.shape[1])
      for iteration in range(self.max_iter):
        print(f"Iteration {iteration + 1}/{self.max_iter}")
        for i in range(self.num_whales):
            # Calculate fitness
            score = self.fitness(positions[i])
            if score > self.best_score:
                self.best_score = score
                self.best_features = positions[i]
        # Update positions (simplified version of WOA)
        positions += np.random.rand(self.num_whales, self.X.shape[1]) * 0.1
      return self.best_features


In [15]:
class WhaleOptimizationAlgorithm:
    def __init__(self, X, y, num_whales=30, max_iter=2):
        self.X = X
        self.y = y
        self.num_whales = num_whales
        self.max_iter = max_iter # Changed: Assign the integer value of max_iter
        self.best_features = None
        self.best_score = 0

    def fitness(self, features):
        # Train a classifier and return the accuracy
        if np.sum(features) == 0:
            return 0  # Avoid empty feature sets
        X_subset = self.X[:, features.astype(bool)]
        X_train, X_test, y_train, y_test = train_test_split(X_subset, self.y, test_size=0.2, random_state=42)
        model = AdaBoostClassifier()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return accuracy_score(y_test, y_pred)

    def optimize(self):
    # Initialize positions of whales
      positions = np.random.rand(self.num_whales, self.X.shape[1])
      for iteration in range(self.max_iter):
        print(f"Iteration {iteration + 1}/{self.max_iter}")
        for i in range(self.num_whales):
            # Calculate fitness
            score = self.fitness(positions[i])
            if score > self.best_score:
                self.best_score = score
                self.best_features = positions[i]
        # Update positions (simplified version of WOA)
        positions += np.random.rand(self.num_whales, self.X.shape[1]) * 0.1
      return self.best_features

In [16]:
# Adding new tweets to the existing DataFrame `df` within the Jupyter notebook file and re-running preprocessing.
# This assumes that `df` already exists in the notebook environment.
additional_tweets = [
    {"Id": 20011, "Tweet": "Discover amazing deals on our site today! Limited time offer.", "following": 120, "followers": 3400, "actions": 3, "is_retweet": 0, "location": "New York", "Type": 1},
    {"Id": 20012, "Tweet": "Had a great time at the tech conference! #Innovation #TechTrends", "following": 50, "followers": 1800, "actions": 10, "is_retweet": 0, "location": "San Francisco", "Type": 0},
    {"Id": 20013, "Tweet": "Our biggest sale of the year is here! Shop now before it's gone.", "following": 400, "followers": 22000, "actions": 15, "is_retweet": 0, "location": "Online", "Type": 1},
    {"Id": 20014, "Tweet": "New blog: 'Top 10 Travel Destinations for 2023' - Check it out!", "following": 80, "followers": 2500, "actions": 1, "is_retweet": 0, "location": "Los Angeles", "Type": 0},
    {"Id": 20015, "Tweet": "Proud of our team’s recent achievements #TeamGoals #Success", "following": 60, "followers": 900, "actions": 7, "is_retweet": 0, "location": "Chicago", "Type": 0},
    {"Id": 20016, "Tweet": "Flash sale on our winter collection! Get up to 70% off.", "following": 320, "followers": 16000, "actions": 12, "is_retweet": 0, "location": "New York", "Type": 1},
    {"Id": 20017, "Tweet": "Check out the new feature on our app for faster transactions.", "following": 90, "followers": 1100, "actions": 3, "is_retweet": 0, "location": "Boston", "Type": 0},
    {"Id": 20018, "Tweet": "Our experts share top tips for healthy living. Learn more!", "following": 200, "followers": 3000, "actions": 4, "is_retweet": 0, "location": "San Diego", "Type": 0},
    {"Id": 20019, "Tweet": "Last chance! Register now for early bird discounts on tickets.", "following": 500, "followers": 4500, "actions": 10, "is_retweet": 0, "location": "Seattle", "Type": 1},
    {"Id": 20020, "Tweet": "Big things are coming. Stay tuned for exciting announcements!", "following": 70, "followers": 4000, "actions": 5, "is_retweet": 0, "location": "Miami", "Type": 0},
    {"Id": 20021, "Tweet": "Get inspired with our new DIY project ideas on our blog.", "following": 150, "followers": 600, "actions": 0, "is_retweet": 0, "location": "Austin", "Type": 0},
    {"Id": 20022, "Tweet": "Hurry! Limited stock available on selected products.", "following": 380, "followers": 14000, "actions": 8, "is_retweet": 0, "location": "Online", "Type": 1},
    {"Id": 20023, "Tweet": "Here's how to make the most of your workout routines #Fitness", "following": 210, "followers": 2500, "actions": 3, "is_retweet": 0, "location": "Denver", "Type": 0},
    {"Id": 20024, "Tweet": "Sneak peek of our new collection launching next month!", "following": 170, "followers": 3300, "actions": 6, "is_retweet": 0, "location": "Chicago", "Type": 1},
    {"Id": 20025, "Tweet": "Excited to partner with industry leaders on innovative solutions!", "following": 60, "followers": 1800, "actions": 4, "is_retweet": 0, "location": "San Jose", "Type": 0},
    {"Id": 20026, "Tweet": "Save the date! Join us for an exclusive webinar next week.", "following": 500, "followers": 5000, "actions": 8, "is_retweet": 0, "location": "Online", "Type": 1},
    {"Id": 20027, "Tweet": "Our holiday collection is out now! Perfect gifts for loved ones.", "following": 140, "followers": 9000, "actions": 9, "is_retweet": 0, "location": "Houston", "Type": 1},
    {"Id": 20028, "Tweet": "Free workshop: Learn how to manage stress effectively", "following": 110, "followers": 700, "actions": 0, "is_retweet": 0, "location": "Portland", "Type": 0},
    {"Id": 20029, "Tweet": "New product alert! Discover the latest in tech gadgets.", "following": 310, "followers": 14000, "actions": 7, "is_retweet": 0, "location": "San Francisco", "Type": 1},
    {"Id": 20030, "Tweet": "Can't wait to share some exciting news with you all soon!", "following": 90, "followers": 4000, "actions": 2, "is_retweet": 0, "location": "Seattle", "Type": 0},
    {"Id": 20031, "Tweet": "Congratulations to our employee of the month! #TeamAppreciation", "following": 80, "followers": 900, "actions": 6, "is_retweet": 0, "location": "Los Angeles", "Type": 0},
]
# Convert additional tweets to a DataFrame
new_tweets_df = pd.DataFrame(additional_tweets)
# Assuming `df` is the existing DataFrame in the notebook environment
# Concatenate with the original DataFrame
df = pd.concat([df, new_tweets_df], ignore_index=True)
# Re-run preprocessing if required
df['cleaned_tweet'] = df['Tweet'].astype(str).apply(clean_text)


In [17]:
import warnings
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
# Suppress FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Run the Whale Optimization Algorithm
woa = WhaleOptimizationAlgorithm(X_train, y_train)
best_features = woa.optimize()

# Train the AdaBoost classifier on the selected features
X_train_selected = X_train[:, best_features.astype(bool)]
X_test_selected = X_test[:, best_features.astype(bool)]
model = AdaBoostClassifier(algorithm='SAMME')  # Explicitly using SAMME
model.fit(X_train_selected, y_train)
y_pred = model.predict(X_test_selected)
print("Accuracy:", accuracy_score(y_test, y_pred))

Iteration 1/2
Iteration 2/2
Accuracy: 0.9970760233918129
