In [13]:
import spacy
print(spacy.util.get_installed_models())

['en_core_web_md']


In [3]:
import json
import pandas as pd
import re
import spacy
import os
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# File path for events.json
events_file = "events.json"

# Load spaCy model
nlp = spacy.load("en_core_web_md")

# Define disaster keywords
disaster_keywords = {
    "Earthquake": ["earthquake", "seismic", "magnitude", "tremor", "fault line", "aftershock"],
    "Flood": ["flood", "flash flood", "heavy rain", "overflow", "dam break", "inundation", "flooding"],
    "Hurricane": ["hurricane", "typhoon", "cyclone", "storm surge", "tropical storm", "storm", "thunderstorm"],
    "Tornado": ["tornado", "twister", "funnel cloud", "whirlwind"],
    "Industrial Accident": ["explosion", "chemical spill", "factory fire", "toxic leak", "gas leak"],
    "Road Accident": ["road accident", "car crash", "traffic", "collision", "road mishap", "vehicle overturned", "Highway"]
}

# Function to classify disasters
def classify_disaster(title, selftext):
    text = f"{title} {selftext}".lower()
    for disaster, keywords in disaster_keywords.items():
        if any(re.search(rf"\b{word}\b", text) for word in keywords):
            return disaster  
    return "Other"

# Function to extract location
def extract_location(text):
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
    return locations[0] if locations else "Unknown"

# Function to process new posts
def process_new_posts():
    try:
        # Load all processed posts
        with open("all_processed_posts.json", "r", encoding="utf-8") as file:
            all_processed_posts = json.load(file)
        
        # Load clustered disasters
        with open("clustered_disasters.json", "r", encoding="utf-8") as file:
            clustered_disasters = json.load(file)

        # Convert to DataFrame
        clustered_df = pd.DataFrame(clustered_disasters)
        
        # Extract URLs from clustered disasters
        clustered_urls = {post["url"] for post in clustered_disasters}
        
        # Filter new posts
        new_posts = [post for post in all_processed_posts if post["url"] not in clustered_urls]

        # Save new unclustered posts
        with open("new_unclustered_posts.json", "w", encoding="utf-8") as file:
            json.dump(new_posts, file, indent=4)

        print(f"🔹 Found {len(new_posts)} new unclustered posts.")

        # ✅ If no new posts, return instead of stopping
        if not new_posts:
            print("🔹 No new posts to process. Waiting for updates...")
            return  

        # Convert to DataFrame
        df = pd.DataFrame(new_posts)

        # Apply disaster classification
        df["disaster_type"] = df.apply(lambda row: classify_disaster(row["title"], row["selftext"]), axis=1)

        # Apply location extraction
        df["location"] = df.apply(lambda row: extract_location(row["title"] + " " + row["selftext"]), axis=1)

        # Create event_name field
        df["event_name"] = df.apply(lambda row: f"{row['disaster_type']} - {row['location']}" 
                                    if row["location"] != "Unknown" else f"{row['disaster_type']} - Unknown", axis=1)

        # print("created event_name field in df")
        
        
        
        # Initialize sub-clusters
        df["is_first_post"] = False
        
        # Process sub-clustering
        for index, row in df.iterrows():
            # Load existing events
            with open("events.json", "r", encoding="utf-8") as file:
                existing_events = json.load(file)
            # print("loading events.json")
            
            # Extract existing event names
            existing_event_names = {re.sub(r" - Cluster \d+$", "", event["event_name"]) for event in existing_events}
            # print("extracting existing event names...")
            # print(existing_event_names)

            # Load clustered disasters
            with open("clustered_disasters.json", "r", encoding="utf-8") as file:
                new_clustered_disasters = json.load(file)
            # print("Loaded the cluster disaster file inside the loop")
        
            # Convert to DataFrame
            new_clustered_df = pd.DataFrame(new_clustered_disasters)
            # print("Converting loaded file to dataframe")
            
            if row["event_name"] in existing_event_names:
                # print("if event_name exists in events.json")
                base_event_name = row["event_name"]
                # print("set base_event_name")

                
                
                # Check if new_clustered_df is empty
                if new_clustered_df.empty:
                    # print("Cluster is empty")
                    df.at[index, "event_name"] = f"{row['event_name']} - Cluster 0"
                    df.at[index, "is_first_post"] = True
                    df.at[index, "sub_cluster"] = 0
                    continue  # Move to the next iteration of the loop
                
                # Find matching clustered posts
                matching_posts = new_clustered_df[
                    new_clustered_df["event_name"].str.startswith(base_event_name)
                ].copy()
                # print("finding matching posts with same base_event_name")
                
                if not matching_posts.empty:
                    print(f"🔹 Found matching clustered posts for '{base_event_name}'")

                    # Extract text features
                    matching_posts["combined_text"] = matching_posts["title"] + " " + matching_posts["selftext"]
                    current_post_text = row["title"] + " " + row["selftext"]

                    # Compute TF-IDF vectors
                    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
                    X = vectorizer.fit_transform(matching_posts["combined_text"].tolist() + [current_post_text])

                    # Compute cosine similarity
                    similarities = cosine_similarity(X[-1:], X[:-1]).flatten()

                    # Find best match
                    max_similarity_index = similarities.argmax()
                    max_similarity_score = similarities[max_similarity_index]

                    # Find the max similarity score
                    max_similarity_score = similarities.max()
                    
                    # Get all indices where similarity is equal to max
                    max_similarity_indices = [i for i, score in enumerate(similarities) if score == max_similarity_score]
                    
                    if max_similarity_score > 0.5:

                        # Get all sub-clusters with the max similarity score
                        possible_sub_clusters = [int(sub_cluster) for sub_cluster in matching_posts.iloc[max_similarity_indices]["sub_cluster"].unique()]
                        
                        # Construct all possible event names
                        possible_event_names = [f"{base_event_name} - Cluster {sub_cluster}" for sub_cluster in possible_sub_clusters]
                    
                        print(f"✅ Possible sub-clusters: {possible_sub_clusters}")
                        print(f"✅ Possible event names: {possible_event_names}")
                    
                        # Check for first posts in new_clustered_df with any possible event name
                        first_posts = new_clustered_df[
                            (new_clustered_df["event_name"].isin(possible_event_names)) & 
                            (new_clustered_df["is_first_post"] == True)
                        ]
                    
                        if not first_posts.empty:
                            print(f"🟢 Found first posts in: {first_posts['event_name'].tolist()}")
                    
                            # Extract `created_utc` of all first posts
                            first_posts_utc = first_posts["created_utc"].tolist()
                            current_post_utc = row["created_utc"]
                    
                            # Check if current post lies within 30 days of any first post
                            within_30_days = [
                                sub_cluster for sub_cluster, utc in zip(first_posts["sub_cluster"], first_posts_utc)
                                if (current_post_utc - utc) < (30 * 24 * 60 * 60)  # 30 days in seconds
                            ]
                    
                            if within_30_days:
                                assigned_sub_cluster = int(within_30_days[0])  # Assign first matching cluster
                                print(f"🔄 Assigned existing sub-cluster {assigned_sub_cluster} (within 30 days).")
                            else:
                                assigned_sub_cluster = int(max(possible_sub_clusters) + 1)
                                df.at[index, "is_first_post"] = True
                                print(f"🕒 More than 30 days since all first posts. Assigning new sub-cluster {assigned_sub_cluster}")
                    
                        else:
                            assigned_sub_cluster = int(max(possible_sub_clusters) + 1)
                            df.at[index, "is_first_post"] = True
                            print(f"⚠️ No first posts found. Assigning new sub-cluster {assigned_sub_cluster}")
                    
                        


                        
                    else:
                        assigned_sub_cluster = int(matching_posts["sub_cluster"].max() + 1) if not matching_posts["sub_cluster"].isna().all() else 0
                        print(f"➕ New sub-cluster {assigned_sub_cluster} assigned")
                        df.at[index, "is_first_post"] = True
                        

                    # Assign sub-cluster
                    df.at[index, "sub_cluster"] = int(assigned_sub_cluster)
                    df.at[index, "event_name"] = f"{row['event_name']} - Cluster {assigned_sub_cluster}"
                    # Load existing events
                    if os.path.exists(events_file):
                        with open(events_file, "r", encoding="utf-8") as f:
                            try:
                                events_list = json.load(f)
                            except json.JSONDecodeError:
                                events_list = []  # Handle empty or corrupted JSON
                    else:
                        events_list = []
                        
                    # New event object
                    new_event_name = {"event_name": df.at[index, "event_name"]}
                        
                    # Append only if it doesn't already exist
                    if new_event_name not in events_list:
                        events_list.append(new_event_name)
                        
                    # Save updated event list
                    with open(events_file, "w", encoding="utf-8") as f:
                        json.dump(events_list, f, indent=4, ensure_ascii=False)
                    
                else:
                    df.at[index, "sub_cluster"] = 0  
                    df.at[index, "event_name"] = f"{row['event_name']} - Cluster 0"
                    df.at[index, "is_first_post"] = True
                    # Load existing events
                    if os.path.exists(events_file):
                        with open(events_file, "r", encoding="utf-8") as f:
                            try:
                                events_list = json.load(f)
                            except json.JSONDecodeError:
                                events_list = []  # Handle empty or corrupted JSON
                    else:
                        events_list = []
                        
                    # New event object
                    new_event_name = {"event_name": df.at[index, "event_name"]}
                        
                    # Append only if it doesn't already exist
                    if new_event_name not in events_list:
                        events_list.append(new_event_name)
                        
                    # Save updated event list
                    with open(events_file, "w", encoding="utf-8") as f:
                        json.dump(events_list, f, indent=4, ensure_ascii=False)

            else:
                # print("matching event name not found")
                # print("event name of  current post -")
                # print(df.at[index, "event_name"])
                df.at[index, "event_name"] = f"{row['event_name']} - Cluster 0"
                df.at[index, "is_first_post"] = True
                df.at[index, "sub_cluster"] = 0
                # print("updated event name of the current post -")
                # print(df.at[index, "event_name"])
                # Load existing events
                if os.path.exists(events_file):
                    with open(events_file, "r", encoding="utf-8") as f:
                        try:
                            events_list = json.load(f)
                        except json.JSONDecodeError:
                            events_list = []  # Handle empty or corrupted JSON
                else:
                    events_list = []
                        
                # New event object
                new_event_name = {"event_name": df.at[index, "event_name"]}
                # print("new event name -")
                # print(new_event_name)
                
                # Append only if it doesn't already exist
                if new_event_name not in events_list:
                    events_list.append(new_event_name)
                
                # Save updated event list
                with open(events_file, "w", encoding="utf-8") as f:
                    json.dump(events_list, f, indent=4, ensure_ascii=False)

            
            # **Update clustered_disasters.json after processing the current row**
            current_row_df = df.loc[[index]]  # Convert the current row to DataFrame
        
            updated_clustered_df = pd.concat([new_clustered_df, current_row_df], ignore_index=True)
        
            json_data = updated_clustered_df.to_json(orient="records", indent=4)
            parsed_json = json.loads(json_data)
        
            with open("clustered_disasters.json", "w", encoding="utf-8") as f:
                json.dump(parsed_json, f, indent=4, ensure_ascii=False)

            print("post added succefully")
                
    
        print("✅ ALL Data successfully updated in clustered_disasters.json")

    except Exception as e:
        print(f"❌ Error occurred: {e}")

# Infinite loop to check file modification time
last_mtime = os.path.getmtime("all_processed_posts.json")  # Get initial modification time

while True:
    time.sleep(10)  # Wait 10 seconds

    try:
        current_mtime = os.path.getmtime("all_processed_posts.json")
            
        if current_mtime != last_mtime:
            print("\n🔄 Detected change in all_processed_posts.json. Processing new posts...\n")
            process_new_posts()
            last_mtime = current_mtime  # Update last modification time

        else:
            print("⏳ No changes detected. Waiting...")

    except Exception as e:
        print(f"❌ Error monitoring file: {e}")

⏳ No changes detected. Waiting...

🔄 Detected change in all_processed_posts.json. Processing new posts...

🔹 Found 0 new unclustered posts.
🔹 No new posts to process. Waiting for updates...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...

🔄 Detected change in all_processed_posts.json. Processing new posts...

🔹 Found 0 new unclustered posts.
🔹 No new posts to process. Waiting for updates...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...

🔄 Detected change in all_processed_posts.json. Processing new posts...

🔹 Found 0 new unclustered posts.
🔹 No new posts to process. Waiting for updates...
⏳ No changes detected. Waiting...

🔄 Detected change in all_processed_posts.json. Processing new posts...

🔹 Found 2 new unclustered posts.
post added succefully
post added succefully
✅ ALL Da

KeyboardInterrupt: 