In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import folium
from folium.plugins import HeatMap
import json
import os
from datetime import datetime
import re

# --- Configuration ---
# Input file from src/scrapers/article_scraper.py
TALIBAN_SCRAPED_ARTICLES_JSON = 'data/raw/english_extracted_articles.json'
# Pre-generated geocode cache file
GEOCODE_CACHE_JSON = 'data/intermediate/cleaned_location_coordinates.json' 

# Output files for this notebook
PROCESSED_TALIBAN_DIR = 'data/processed/'
PROCESSED_TALIBAN_CSV = os.path.join(PROCESSED_TALIBAN_DIR, 'taliban_extracted_events.csv')
VISUALIZATIONS_DIR = 'visualizations/'

# Date for regime shift analysis
TAKEOVER_DATE_STR = "2021-08-15"
TAKEOVER_DATE = pd.to_datetime(TAKEOVER_DATE_STR)

# Map Configuration
AFGHANISTAN_CENTER_LAT = 33.93911
AFGHANISTAN_CENTER_LON = 67.709953
DEFAULT_MAP_ZOOM = 5

# Plotting style
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (14, 7) # Default figure size
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.2f}'.format)
seed = 123 # For reproducibility in case of sampling

# Create output directories
os.makedirs(PROCESSED_TALIBAN_DIR, exist_ok=True)
os.makedirs(VISUALIZATIONS_DIR, exist_ok=True)

print("Cell 1: Imports and Configuration complete.")

Cell 1: Imports and Configuration complete.


In [2]:
# Attempt to load the primary scraped data file
input_file_path = TALIBAN_SCRAPED_ARTICLES_JSON
if not os.path.exists(input_file_path):
    # Fallback to a common data/raw location
    input_file_path_alt = os.path.join('data', 'raw', TALIBAN_SCRAPED_ARTICLES_JSON)
    if os.path.exists(input_file_path_alt):
        input_file_path = input_file_path_alt
    else:
        print(f"ERROR: Raw Taliban articles file '{TALIBAN_SCRAPED_ARTICLES_JSON}' not found in project root or data/raw/.")
        print("This notebook requires the output from 'src/scrapers/article_scraper.py'.")
        print("A minimal DUMMY DataFrame will be created for demonstration, but EDA will not be meaningful.")
        df_taliban_raw = pd.DataFrame([
            {'link': 'dummy1', 'title': 'Dummy Event 1', 'body': 'Content 1',
             'metadata': {'publication_date': 'January 01, 2020'}, 'extracted_location': 'KABUL'},
            {'link': 'dummy2', 'title': 'Dummy Event 2', 'body': 'Content 2',
             'metadata': {'publication_date': 'September 15, 2021'}, 'extracted_location': 'HELMAND'}
        ] * 100) # Multiply for some data volume
        # Add varied dummy dates
        np.random.seed(seed)
        date_range_dummy = pd.date_range(start="2016-06-01", end="2024-02-20", freq='D')
        random_dates_dummy = np.random.choice(date_range_dummy, size=len(df_taliban_raw))
        df_taliban_raw['metadata'] = df_taliban_raw['metadata'].apply(lambda x: x.copy())
        for i, date_obj in enumerate(random_dates_dummy):
            df_taliban_raw.loc[i, 'metadata']['publication_date'] = date_obj.strftime("%B %d, %Y")
else:
    try:
        df_taliban_raw = pd.read_json(input_file_path)
        print(f"Successfully loaded raw Taliban articles from '{input_file_path}'. Shape: {df_taliban_raw.shape}")
    except Exception as e:
        print(f"An error occurred loading '{input_file_path}': {e}")
        df_taliban_raw = pd.DataFrame() # Ensure df_taliban_raw exists

if df_taliban_raw.empty:
    print("Critical Error: Taliban raw data is empty. Cannot proceed.")
else:
    print("\nColumns:", df_taliban_raw.columns.tolist())
    print("\nFirst 2 rows (raw):")
    display(df_taliban_raw.head(2))

# Make a working copy
df_taliban = df_taliban_raw.copy()

ERROR: Raw Taliban articles file 'data/raw/english_extracted_articles.json' not found in project root or data/raw/.
This notebook requires the output from 'src/scrapers/article_scraper.py'.
A minimal DUMMY DataFrame will be created for demonstration, but EDA will not be meaningful.


AttributeError: 'numpy.datetime64' object has no attribute 'strftime'

In [None]:
def parse_taliban_date(date_str):
    """Parses various date string formats found in Taliban article metadata."""
    if pd.isna(date_str) or not isinstance(date_str, str):
        return pd.NaT
    date_str = date_str.strip()
    formats_to_try = [
        "%B %d, %Y", "%d %B %Y", "%b %d, %Y", "%d %b %Y",
        "%B %d. %Y", "%d. %B %Y", "%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y"
    ]
    for fmt in formats_to_try:
        try: return datetime.strptime(date_str, fmt)
        except ValueError: continue
    date_str_cleaned = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str, flags=re.IGNORECASE)
    if date_str_cleaned != date_str:
        for fmt in formats_to_try:
            try: return datetime.strptime(date_str_cleaned, fmt)
            except ValueError: continue
    match_slash_year = re.match(r"(\w+)\s+(\d+)\s+/\s+(\d{4})", date_str)
    if match_slash_year:
        month_str, day_str, year_str = match_slash_year.groups()
        try: return datetime.strptime(f"{month_str} {day_str} {year_str}", "%B %d %Y")
        except ValueError: pass # Try next if full month name fails
        try: return datetime.strptime(f"{month_str} {day_str} {year_str}", "%b %d %Y")
        except ValueError: pass
    return pd.NaT

if not df_taliban.empty:
    print("\n--- Parsing Dates ---")
    if 'metadata' in df_taliban.columns:
        df_taliban['publication_date_str'] = df_taliban['metadata'].apply(
            lambda x: x.get('publication_date') if isinstance(x, dict) else None
        )
        df_taliban['event_date'] = df_taliban['publication_date_str'].apply(parse_taliban_date)
        
        parsed_count = df_taliban['event_date'].notna().sum()
        total_dates_to_parse = df_taliban['publication_date_str'].notna().sum()
        print(f"Successfully parsed {parsed_count} of {total_dates_to_parse} non-null publication date strings.")
        if parsed_count < total_dates_to_parse:
            print(f"  ({total_dates_to_parse - parsed_count} dates could not be parsed and are NaT).")

        df_taliban['year_month'] = df_taliban['event_date'].dt.to_period('M')
        df_taliban['year'] = df_taliban['event_date'].dt.year # Add year column for filtering
        print("Added 'event_date', 'year_month', 'year' columns.")
    else:
        print("ERROR: 'metadata' column not found. Date processing skipped.")
        df_taliban['event_date'] = pd.NaT
        df_taliban['year_month'] = pd.NaT
        df_taliban['year'] = np.nan
else:
    print("DataFrame empty. Date parsing skipped.")

In [None]:
if not df_taliban.empty:
    print("\n--- Cleaning and Geocoding Taliban Locations (using cache) ---")

    location_coordinates_dict = {}
    geocode_cache_path = GEOCODE_CACHE_JSON
    if not os.path.exists(geocode_cache_path):
        # Fallback to data/intermediate if not in root
        geocode_cache_path_alt = os.path.join('data', 'intermediate', GEOCODE_CACHE_JSON)
        if os.path.exists(geocode_cache_path_alt):
            geocode_cache_path = geocode_cache_path_alt
        else:
             print(f"Warning: Geocode cache '{GEOCODE_CACHE_JSON}' not found in root or data/intermediate/. Heatmaps might be sparse.")

    if os.path.exists(geocode_cache_path):
        try:
            with open(geocode_cache_path, 'r', encoding='utf-8') as f:
                cached_data = json.load(f)
                for loc, coords in cached_data.items():
                    if isinstance(coords, list) and len(coords) == 2:
                        location_coordinates_dict[loc] = tuple(coords)
                    else: location_coordinates_dict[loc] = None
            print(f"Loaded geocode cache '{geocode_cache_path}' with {len(location_coordinates_dict)} entries.")
        except Exception as e:
            print(f"Error loading geocode cache: {e}")
    else:
        print(f"Geocode cache not found at '{geocode_cache_path}'. Locations will not be geocoded in this notebook.")

    # Comprehensive cleaning map (ensure this matches the one used to create the cache)
    cleaning_map = {
        "KABIL": "KABUL", "KAB": "KABUL", # ... (Your full cleaning_map) ...
        "NANGARHRA": "NANGARHAR", "HLEMNAD": "HELMAND", "PAKTKIA": "PAKTIKA",
        "NONE": "_IGNORE_", "L": "_IGNORE_", "KH": "_IGNORE_", "D": "_IGNORE_", "S": "_IGNORE_", "NA": "_IGNORE_"
    }

    def clean_df_location_name(raw_name):
        if pd.isna(raw_name) or not isinstance(raw_name, str) or not raw_name.strip(): return None
        name_upper = raw_name.strip().upper()
        if name_upper == 'NAN': return None
        cleaned = cleaning_map.get(name_upper, name_upper)
        return None if cleaned == "_IGNORE_" else cleaned

    if 'extracted_location' in df_taliban.columns:
        df_taliban['cleaned_location'] = df_taliban['extracted_location'].apply(clean_df_location_name)
        df_taliban['latitude'] = df_taliban['cleaned_location'].apply(
            lambda x: location_coordinates_dict.get(x)[0] if isinstance(location_coordinates_dict.get(x), tuple) else np.nan)
        df_taliban['longitude'] = df_taliban['cleaned_location'].apply(
            lambda x: location_coordinates_dict.get(x)[1] if isinstance(location_coordinates_dict.get(x), tuple) else np.nan)
        
        geocoded_count = df_taliban['latitude'].notna().sum()
        print(f"Applied location cleaning. Mapped coordinates for {geocoded_count} events from cache.")
    else:
        print("ERROR: 'extracted_location' column not found. Location processing skipped.")
        df_taliban['cleaned_location'] = None; df_taliban['latitude'] = np.nan; df_taliban['longitude'] = np.nan
else:
    print("DataFrame empty. Location processing skipped.")

In [None]:
if not df_taliban.empty and 'year_month' in df_taliban.columns and 'event_date' in df_taliban.columns:
    print("\n--- EDA: Figure 3 - Taliban Reports Over Time ---")
    
    df_plot_fig3 = df_taliban.dropna(subset=['event_date']).sort_values('event_date')

    if not df_plot_fig3.empty:
        monthly_reports = df_plot_fig3.groupby('year_month').size()
        if isinstance(monthly_reports.index, pd.PeriodIndex):
            monthly_reports.index = monthly_reports.index.to_timestamp()

        plt.figure(figsize=(16, 8))
        plt.plot(monthly_reports.index, monthly_reports.values, marker='o', linestyle='-', color='orangered', markersize=5)
        
        plt.title('Number of Taliban Events per Month', fontsize=15) # Adjusted from Figure caption
        plt.xlabel('Month', fontsize=12)
        plt.ylabel('Number of Events', fontsize=12)
        
        plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
        plt.gca().xaxis.set_major_locator(mdates.YearLocator(1))
        plt.gca().xaxis.set_minor_locator(mdates.MonthLocator(bymonth=[1,4,7,10])) # Quarterly minor ticks
        plt.tick_params(axis='x', which='major', labelsize=10, rotation=0) # Keep years unrotated
        plt.tick_params(axis='y', labelsize=10)

        min_date_plot, max_date_plot = monthly_reports.index.min(), monthly_reports.index.max()
        if pd.notna(min_date_plot) and pd.notna(max_date_plot) and min_date_plot <= TAKEOVER_DATE <= max_date_plot:
             plt.axvline(TAKEOVER_DATE, color='steelblue', linestyle='--', lw=1.5, label=f'Taliban Takeover ({TAKEOVER_DATE_STR})')
             plt.legend(fontsize=10)

        plt.grid(True, which='major', linestyle='-', linewidth='0.5', color='darkgrey')
        plt.grid(True, which='minor', linestyle=':', linewidth='0.3', color='lightgrey')
        
        plt.tight_layout()
        fig3_path = os.path.join(VISUALIZATIONS_DIR, 'figure3_taliban_reports_over_time.png')
        plt.savefig(fig3_path, dpi=300, bbox_inches='tight')
        print(f"Figure 3 saved to {fig3_path}")
        plt.show()
    else:
        print("No data with valid dates to plot for Figure 3.")
else:
    print("DataFrame empty or required date columns missing. Skipping Figure 3.")

In [None]:
if not df_taliban.empty and 'latitude' in df_taliban.columns and 'longitude' in df_taliban.columns and 'event_date' in df_taliban.columns:
    print("\n--- EDA: Figure 4 & 5 - Pre/Post Takeover Location Heatmaps ---")

    df_taliban_geocoded = df_taliban.dropna(subset=['latitude', 'longitude', 'event_date'])
    if df_taliban_geocoded.empty:
        print("No geocoded Taliban events with valid dates available for heatmaps.")
    else:
        df_pre_takeover = df_taliban_geocoded[df_taliban_geocoded['event_date'] < TAKEOVER_DATE]
        df_post_takeover = df_taliban_geocoded[df_taliban_geocoded['event_date'] >= TAKEOVER_DATE]

        print(f"Events for Pre-Takeover Heatmap: {len(df_pre_takeover)}")
        print(f"Events for Post-Takeover Heatmap: {len(df_post_takeover)}")

        def create_heatmap(df_period, period_name, figure_num_str, output_filename):
            if not df_period.empty:
                # Filter out potential outliers or data points far from Afghanistan for map focus
                df_map_filtered = df_period[
                    (df_period['latitude'] > 29) & (df_period['latitude'] < 39) &
                    (df_period['longitude'] > 60) & (df_period['longitude'] < 75)
                ]
                
                if df_map_filtered.empty:
                    print(f"No data points within Afghanistan's approximate bounds for {period_name} heatmap.")
                    return

                map_viz = folium.Map(location=[AFGHANISTAN_CENTER_LAT, AFGHANISTAN_CENTER_LON], 
                                     zoom_start=DEFAULT_MAP_ZOOM, 
                                     tiles="CartoDB positron") # Using a subtle base map
                folium.TileLayer('openstreetmap', name="Street Map View").add_to(map_viz)
                
                heat_data = [[row['latitude'], row['longitude']] for idx, row in df_map_filtered.iterrows()]
                
                if heat_data:
                    HeatMap(heat_data, 
                            name=f"{period_name} Event Density", 
                            radius=15, # Adjust for visual preference
                            blur=12,   # Adjust for visual preference
                            max_zoom=4 # Prevent heatmap from becoming too sparse on zoom
                           ).add_to(map_viz)
                    folium.LayerControl().add_to(map_viz)
                    
                    map_path = os.path.join(VISUALIZATIONS_DIR, output_filename)
                    map_viz.save(map_path)
                    print(f"Figure {figure_num_str} ({period_name} Heatmap) saved to {map_path}")
                    # display(map_viz) # Uncomment to display inline
                else:
                    print(f"No valid heat data for {period_name} heatmap after filtering.")
            else:
                print(f"No data for {period_name} to generate heatmap.")

        # --- Figure 4: Pre-Takeover Heatmap ---
        create_heatmap(df_pre_takeover, "Pre-Takeover", "4", 'figure4_heatmap_pre_takeover.html')

        # --- Figure 5: Post-Takeover Heatmap ---
        create_heatmap(df_post_takeover, "Post-Takeover", "5", 'figure5_heatmap_post_takeover.html')
else:
    print("DataFrame empty or coordinate/date columns missing. Skipping Heatmaps.")

In [None]:
if not df_taliban.empty:
    # Define relevant columns for the final processed output
    columns_to_save = [
        'link', 'title', 'body', # Core article info
        'publication_date_str', 'event_date', 'year_month', 'year', # Date info
        'extracted_location', 'cleaned_location', 'latitude', 'longitude', # Location info
        'metadata' # Keep raw metadata if needed for other analyses
    ]
    
    # Ensure all selected columns exist; create with NaNs if not (though they should be by now)
    df_taliban_processed = pd.DataFrame()
    for col in columns_to_save:
        if col in df_taliban.columns:
            df_taliban_processed[col] = df_taliban[col]
        else:
            df_taliban_processed[col] = np.nan
            print(f"Warning: Column '{col}' was missing from df_taliban, added with NaNs for saving.")
            
    try:
        df_taliban_processed.to_csv(PROCESSED_TALIBAN_CSV, index=False)
        print(f"\nProcessed Taliban data saved to '{PROCESSED_TALIBAN_CSV}'")
        # Optional: Save to Excel (requires openpyxl)
        # df_taliban_processed.to_excel(PROCESSED_TALIBAN_XLSX, index=False, engine='openpyxl')
        # print(f"Processed Taliban data also saved to '{PROCESSED_TALIBAN_XLSX}'")
    except Exception as e:
        print(f"Error saving processed Taliban data: {e}")
        
    print("\nFinal processed Taliban DataFrame sample:")
    display(df_taliban_processed.head(3))
    print(f"Shape: {df_taliban_processed.shape}")
else:
    print("DataFrame is empty. Nothing to save.")

print("\n--- Notebook 2: Taliban Exploration and Data Preparation Complete ---")