In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import collections
import nltk
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load dataset
df = pd.read_csv(r"C:\Users\ADMIN\Documents\DisasterAlert\src\datasets\tweets.csv")

In [4]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
print(df['target'].value_counts())
# Data is imbalance with 9256 non-disaster tweets and 2114 disaster tweets

target
0    9256
1    2114
Name: count, dtype: int64


In [12]:
# Function to preprocess text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'\@\w+|\#\w+', '', text)  # Remove mentions and hashtags
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove special characters
    tokens = word_tokenize(text)  # Tokenization
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]  # Stemming
    return " ".join(tokens)

In [14]:
# Apply preprocessing
df["clean_text"] = df["text"].apply(lambda x: preprocess_text(str(x)) if pd.notnull(x) else "")


In [15]:
# Check if 'clean_text' exists now
print(df.columns)

Index(['id', 'keyword', 'location', 'text', 'target', 'clean_text'], dtype='object')


In [16]:
# Convert text into numerical features using TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=10000)
X = vectorizer.fit_transform(df["clean_text"])  # Convert tweets into vectors
y = df["target"]  # Labels (0 or 1)

In [17]:
# Print shape to confirm success
print("Shape of X:", X.shape)

Shape of X: (11370, 10000)


In [18]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2']
}

In [20]:
# Initialize Random Forest
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

In [21]:
# Perform Randomized Search
rf_search = RandomizedSearchCV(rf, param_grid, n_iter=20, cv=3, scoring='f1_macro', n_jobs=-1, verbose=2, random_state=42)
rf_search.fit(X_train, y_train)


Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [22]:
# Best parameters
print("Best Parameters:", rf_search.best_params_)


Best Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': None}


In [23]:
# Train best model
best_rf = rf_search.best_estimator_
y_pred = best_rf.predict(X_test)

In [24]:
# Evaluate model
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8970976253298153
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.95      0.94      1878
           1       0.73      0.65      0.69       396

    accuracy                           0.90      2274
   macro avg       0.83      0.80      0.81      2274
weighted avg       0.89      0.90      0.89      2274



In [30]:
import json
import time
import os

def classify_reddit_posts():
    try:
        # Load detected Reddit posts
        with open("detected_posts.json", "r", encoding="utf-8") as f:
            reddit_posts = json.load(f)
        
        if not reddit_posts:
            print("No new posts detected. Waiting...")
            return

        # Extract and preprocess combined text (title + selftext)
        reddit_texts = [
            f"{post['title']} {post['selftext']}" if post.get("selftext") else post["title"]
            for post in reddit_posts
        ]
        preprocessed_texts = [preprocess_text(text) for text in reddit_texts]

        # Convert to TF-IDF vectors
        X_reddit = vectorizer.transform(preprocessed_texts)

        # Predict disaster classification
        predictions = best_rf.predict(X_reddit)

        # Add predictions to posts
        classified_posts = []
        for i, post in enumerate(reddit_posts):
            post["disaster_prediction"] = int(predictions[i])
            if post["disaster_prediction"] == 1:  # If classified as disaster
                classified_posts.append(post)

        # Save only disaster-related posts
        with open("classified_reddit_posts.json", "w") as f:
            json.dump(classified_posts, f, indent=4)

        print(f"✅ Classified {len(classified_posts)} disaster-related posts.")
    
    except FileNotFoundError:
        print("⚠️ detected_posts.json not found. Waiting for new data...")

# Monitor and classify new posts continuously
last_mod_time = 0

while True:
    current_mod_time = os.path.getmtime("detected_posts.json")

    if current_mod_time != last_mod_time:
        print("🔄 Detected change in detected_posts.json! Re-running classification...")
        classify_reddit_posts()
        last_mod_time = current_mod_time
    else:
        print("⏳ No changes detected. Waiting...")

    time.sleep(10)

🔄 Detected change in detected_posts.json! Re-running classification...
No new posts detected. Waiting...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
🔄 Detected change in detected_posts.json! Re-running classification...
✅ Classified 1 disaster-related posts.
🔄 Detected change in detected_posts.json! Re-running classification...
✅ Classified 4 disaster-related posts.
🔄 Detected change in detected_posts.json! Re-running classification...
✅ Classified 7 disaster-related posts.
🔄 Detected change in detected_posts.json! Re-running classification...
✅ Classified 7 disaster-related posts.
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
⏳ No changes 

KeyboardInterrupt: 