In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Authors:**
​
* [Natan Grayman (2344104)](https://www.kaggle.com/natangrayman)
* [Liad Peretz (2373287)](https://www.kaggle.com/liadperetz)
​
## **Introduction: 2021 South African Social Unrest**

---

In this notebook, we explore the intersection of tweets about the 2021 South African unrest with GDELT and Google Trends data. Our analysis draws comparisons between social media activity, search interest, and conventional media coverage of the unrest.

**The 2021 South African Unrest** — often referred to as the July 2021 riots — unfolded in South Africa's KwaZulu-Natal and Gauteng provinces from **July 9 to 18, 2021**. This wave of civil unrest was triggered by the imprisonment of former President Jacob Zuma for contempt of court.

For more detailed information about the 2021 South African Unrest, you can visit the [Wikipedia page](https://en.wikipedia.org/wiki/2021_South_African_unrest).

---


1. [Twitter Dataset Analysis](#section-one)

   * [Dataset Summary](#section-one-a)
     * Description of the Twitter Dataset used.
   * [Frequency of words in tweets](#section-one-b)
   * [Number of Tweets per Day](#section-one-c)
   * [Location Analysis](#section-one-d)
   
   
2. [GDELT Analysis](#section-two)
   * [GDELT Description](#section-two-a)
   * [GDELT Exploratory Data Analysis (EDA)](#section-two-b)
   * [Total Number of Mentions per day](#section-two-c)
   * [GDELT Sentiment Analysis](#section-two-d)
   
   
   
3. [Google Trends Data Analysis](#section-three)
    * [Google Trends Description](#section-three-a)
    * [Web Search](#section-three-b)
    * [News Search ](#section-three-c)
   
   
4. [Pattern of Interest Comparisons](#section-four)
    * [Comparison of Number of Tweets and GDELT Articles per Day](#section-four-a)
    
  
5. [Mathematical Framework](#section-five)


<a id="section-one"></a>
## **1. Twitter Dataset Analysis**

<a id="section-one-a"></a>
## Dataset Summary


**Source**: The dataset has been collected from an open-source repository available at this site: [Twitter Stream Archive](https://archive.org/search?query=collection%3Atwitterstream&sort=-publicdate).

**Date Range**: The dataset covers a substantial period, ranging from **July 9, 2021**, to **July 21, 2021**.

**Volume**: This dataset is extensive, with approximately **4,000 to 5,000 tweets recorded every minute** during the specified date range.

The dataset provides a comprehensive collection of tweets, capturing a snapshot of social media conversations during a significant time frame. It offers a valuable resource for analyzing and understanding online discourse during the specified period.





In [None]:
# Location of the extracted unrest tweets inside the attached Kaggle dataset
csv_file_path = '/kaggle/input/all-days-extracted-tweets-unrest-2021/Extracted_south_african_protest_7.csv'

# low_memory=False: parse the file in one pass instead of in chunks
unrest_tweets2 = pd.read_csv(filepath_or_buffer=csv_file_path, low_memory=False)

In [None]:
# Preview the first five rows (five is also DataFrame.head's default)
unrest_tweets2.head(n=5)

In [None]:
# Report the size of the tweet table
n_rows, n_cols = unrest_tweets2.shape
print(f'There are {n_rows} rows and {n_cols} columns in the twitter dataset')

<a id="section-one-c"></a>
## Number of Tweets per Day

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter

# Parse the raw "Tweet Created At" values into datetimes, stored in a new 'date' column
unrest_tweets2['date'] = pd.to_datetime(unrest_tweets2["Tweet Created At"])

# Daily tweet counts: bucket rows by calendar day of 'date' and count the rows in each bucket
tweets_per_day = unrest_tweets2.resample('D', on='date').size()

fig, ax = plt.subplots()
ax.plot(tweets_per_day.index, tweets_per_day.values, marker='o')

# Tick labels show the date only (no time component)
date_formatter = DateFormatter('%Y-%m-%d')
ax.xaxis.set_major_formatter(date_formatter)

# One tick per day from 9 July to 18 July 2021
date_range = pd.date_range(start='2021-07-09', end='2021-07-18', freq='D')
all_dates = date_range.tolist()
ax.set_xticks(all_dates)

# Vertical tick labels so the full dates do not overlap
plt.xticks(rotation=90)

ax.set(xlabel='Date', ylabel='Number of Tweets', title='Number of Tweets per Day')

plt.tight_layout()
plt.show()

<a id="section-one-d"></a>
# Location Analysis

In [None]:
import pandas as pd

# Flag the tweets that carry explicit coordinates, then count the flagged rows
has_coordinates = unrest_tweets2["Tweet Coordinates"].notna()
non_null_count = has_coordinates.sum()

print("Number of non-null values in 'Tweet Coordinates':", non_null_count)

In [None]:
import pandas as pd

# Work on the free-text profile location column
user_location = unrest_tweets2["User Location"]

# Show the first five entries
print("Sample of 'User Location' column:")
print(user_location.head())

# Report the pandas dtype the column was loaded with
data_type = user_location.dtype
print("\nDatatype of 'User Location' column:", data_type)


In [None]:
import pandas as pd

# Boolean mask of the tweets whose author filled in a profile location
has_user_location = unrest_tweets2["User Location"].notna()

# Number of tweets with a "User Location" value
non_null_count = has_user_location.sum()
print("Number of non-null values in 'User Location':", non_null_count)

# Keep only those tweets
non_null_user_location_df = unrest_tweets2.loc[has_user_location]

# Size of the filtered table
num_rows, num_columns = non_null_user_location_df.shape
print("Number of rows:", num_rows)
print("Number of columns:", num_columns)

# Sanity check: the filtered column should contain no missing values
null_values_count = non_null_user_location_df["User Location"].isna().sum()
print("Number of null values in 'User Location' column:", null_values_count)


**Test code for the API used to resolve locations into longitude and latitude:**

In [None]:
# import os
# import pandas as pd
# from geopy.geocoders import Nominatim

# # Define the geocoding function
# def geocode_location(location):
#     geolocator = Nominatim(user_agent="Investigation Project")
#     try:
#         location = geolocator.geocode(location)
#         return (location.longitude, location.latitude)
#     except AttributeError:
#         return None

# # Test the geocoding function with a sample location
# sample_location = "South Africa, Cape Town"

# # Call the geocoding function and print the result
# result = geocode_location(sample_location)

# if result is not None:
#     longitude, latitude = result
#     print(f"Longitude: {longitude}, Latitude: {latitude}")

#     # Create a DataFrame from the geocoding result
#     data = {'Location': [sample_location], 'Longitude': [longitude], 'Latitude': [latitude]}
#     df = pd.DataFrame(data)

#     # Define the output directory
#     output_dir = '/kaggle/working/'

#     # Save the DataFrame to a CSV file in the output directory
#     output_filename = 'geocoding_result.csv'
#     output_filepath = os.path.join(output_dir, output_filename)
#     df.to_csv(output_filepath, index=False)

#     print(f"Geocoding result saved to {output_filepath}")
# else:
#     print("Location not found or there was an issue with geocoding.")


**The following code is used to query the Nominatim API to resolve the locations to longitudes and latitude for plotting. However, within the data, errors arose from the queries, so the data has been split into blocks to isolate which part of the data is corrupted/producing the query error. Each block of the overall data is stored in a csv which in turn will be combined.**

In [None]:
# import os
# import pandas as pd
# from geopy.geocoders import Nominatim
# from geopy.exc import GeocoderQueryError

# def geocode_and_save_block(user_locations, block_number, batch_size=100):
#     # Define a function to geocode user locations to latitude and longitude
#     def geocode_location_batch(locations, batch_size=batch_size):
#         geolocator = Nominatim(user_agent="SA Tweets from Social Unrest")
#         total_locations = len(locations)
#         geocoded_data = []
        
#         # Define a counter to keep track of successful geocoded locations
#         geocoded_count = 0

#         for i in range(0, total_locations, batch_size):
#             batch_locations = locations[i:i + batch_size]
#             batch_results = []

#             for location in batch_locations:
#                 try:
#                     result = geolocator.geocode(location, timeout=10)  # Increase timeout
#                     if result:
#                         batch_results.append((location, result.longitude, result.latitude))
#                         geocoded_count += 1
#                 except (AttributeError, GeocoderQueryError) as e:
#                     print(f"Error geocoding location: {location} - {str(e)}")

#             geocoded_data.extend(batch_results)
#             print(f"Geocoded {geocoded_count} of {total_locations} locations.")

#         return geocoded_data

#     # Split the data into four blocks
#     total_data = len(user_locations)
#     block_size = total_data // 4  # Divide the data into 4 equal blocks

#     if block_number not in [1, 2, 3, 4]:
#         raise ValueError("Block number should be 1, 2, 3, or 4.")

#     # Choose the specified block
#     block = user_locations[(block_number - 1) * block_size:block_number * block_size]

#     # Batch geocode the specified block of user locations
#     geocoded_results = geocode_location_batch(block)

#     # Create a DataFrame from the geocoded results
#     geocoded_df = pd.DataFrame(geocoded_results, columns=["User Location", "Longitude", "Latitude"])

#     # Define the output CSV file path
#     output_directory = '/kaggle/working/'
#     output_filename = f'block{block_number}_geocoding_data.csv'
#     output_filepath = os.path.join(output_directory, output_filename)

#     # Save the geocoded data to a CSV file
#     geocoded_df.to_csv(output_filepath, index=False)
#     print(f"Geocoded data saved to {output_filepath}")

#     print(f"Geocoding completed successfully for block {block_number}.")

# # Extract the 'User Location' column as a list
# user_locations = non_null_user_location_df["User Location"].tolist()

# # Specify which block to geocode and save (e.g., block_number=1)
# geocode_and_save_block(user_locations, block_number=1)


In [None]:
import pandas as pd
import os

# Kaggle dataset folder that holds the per-block geocoding results
directory = '/kaggle/input/all-geocoded-data/'

# The geocoding results are stored as four files, block1 .. block4
file_names = [f'block{block_number}_geocoding_data.csv' for block_number in range(1, 5)]

# Will collect one DataFrame per block file
data_frames = list()

# Define a function to preprocess the CSV data
def preprocess_csv(file_path):
    """Extract the valid (longitude, latitude) pairs from a geocoding CSV.

    The first line is treated as a header and skipped. For every other line
    only the last two comma-separated fields are read (longitude, then
    latitude), so commas inside the leading location text do not matter.
    Lines whose trailing fields are not finite numbers are skipped.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``Longitude`` and ``Latitude``, one row per valid line.
    """
    import math  # local import: only needed for the finiteness check below

    valid_longitude = []
    valid_latitude = []

    # Decode explicitly as UTF-8 so non-ASCII place names are read the same way
    # on every platform; undecodable bytes are replaced rather than aborting,
    # because only the numeric trailing fields are used.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        next(file, None)  # skip the header line (no-op for an empty file)

        for line in file:
            # Only the last two fields are needed, so split from the right
            values = line.strip().rsplit(',', 2)
            if len(values) < 2:
                continue

            try:
                longitude = float(values[-2])
                latitude = float(values[-1])
            except ValueError:
                # Trailing fields are not numbers: not a usable coordinate row
                continue

            # float() accepts "nan"/"inf"; those are not usable coordinates
            if not (math.isfinite(longitude) and math.isfinite(latitude)):
                continue

            valid_longitude.append(longitude)
            valid_latitude.append(latitude)

    return pd.DataFrame({'Longitude': valid_longitude, 'Latitude': valid_latitude})

# Parse each block file and collect the resulting coordinate tables
for file_name in file_names:
    print(f"Reading file: {file_name}")
    file_path = f"{directory}{file_name}"
    df = preprocess_csv(file_path)
    data_frames.append(df)

# Stack the per-block tables into one, renumbering the rows from 0
combined_df = pd.concat(data_frames, ignore_index=True)

# Quick look at the combined result: first rows, then (rows, columns)
print("Head of the Combined DataFrame:", combined_df.head(), sep="\n")
print("\nNumber of Rows and Columns in the Combined DataFrame:", combined_df.shape, sep="\n")

# combined_df now holds the Longitude/Latitude pairs gathered from all block files.

In [None]:
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt


# One shapely Point (x=longitude, y=latitude) per geocoded row
geometry = [
    Point(longitude, latitude)
    for longitude, latitude in zip(combined_df['Longitude'], combined_df['Latitude'])
]

# Wrap the coordinates in a GeoDataFrame; coordinates are assumed to be WGS 84
crs = 'EPSG:4326'
geo_df = gpd.GeoDataFrame(combined_df, geometry=geometry, crs=crs)

# Country outlines from the low-resolution Natural Earth dataset bundled with GeoPandas
# NOTE(review): geopandas.datasets is deprecated in newer GeoPandas releases —
# confirm the installed version still provides it.
world_shapefile_path = gpd.datasets.get_path('naturalearth_lowres')
world = gpd.read_file(world_shapefile_path)

# Light-blue figure background with white land masses
fig, ax = plt.subplots(nrows=1, figsize=(16, 8), facecolor='lightblue')
world.plot(ax=ax, color='white', edgecolor='black')

# Overlay the geocoded user locations as small magenta dots
geo_df.plot(ax=ax, markersize=1, color='m', marker='o')

# Hide the axis frame and ticks
ax.set_axis_off()

plt.show()

In [None]:
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt


# One shapely Point (x=longitude, y=latitude) per geocoded row
geometry = [
    Point(longitude, latitude)
    for longitude, latitude in zip(combined_df['Longitude'], combined_df['Latitude'])
]

# Wrap the coordinates in a GeoDataFrame; coordinates are assumed to be WGS 84
crs = 'EPSG:4326'
geo_df = gpd.GeoDataFrame(combined_df, geometry=geometry, crs=crs)

# Natural Earth first-level administrative regions, narrowed to South Africa
sa_provinces = gpd.read_file('/kaggle/input/natural-earth-provinces/ne_10m_admin_1_states_provinces.shp')
is_south_africa = sa_provinces['admin'].eq('South Africa')
south_africa = sa_provinces.loc[is_south_africa]

# Per-province bounding boxes; their overall min/max give the country's extent
sa_bbox = south_africa.bounds
west, east = sa_bbox['minx'].min(), sa_bbox['maxx'].max()
south, north = sa_bbox['miny'].min(), sa_bbox['maxy'].max()

# The limits are set before anything is drawn, as in the original cell
fig, ax = plt.subplots(nrows=1, figsize=(16, 8), facecolor='white')
ax.set_xlim(west, east)
ax.set_ylim(south, north)

# Province outlines
south_africa.boundary.plot(ax=ax, linewidth=2, color='black')

# Geocoded user locations as small magenta dots
geo_df.plot(ax=ax, markersize=1, color='m', marker='o')

# Hide the axis frame and ticks
ax.set_axis_off()

plt.show()


<a id="section-two"></a>
# **2. Global Database of Events, Language, and Tone (GDELT)**

<a id="section-two-a"></a>
The Global Database of Events, Language, and Tone (GDELT) is a comprehensive and continuously updated dataset that monitors and records various global events, news articles, and media sources from around the world. GDELT's primary purpose is to provide a vast repository of structured data that researchers, analysts, and data scientists can use to analyze and gain insights into global events, trends, and sentiments.
​
Key features of GDELT include:
​
- **Event Data**: GDELT captures a wide range of events, including political, economic, social, and cultural events, across different countries and regions.
​
- **Media Monitoring**: GDELT scans thousands of news articles, broadcasts, and online sources in multiple languages to extract valuable information.
​
- **Sentiment Analysis**: It includes sentiment analysis and tone indicators, helping to understand the emotional context of news and events.
​
- **Temporal Coverage**: GDELT's data goes back several decades, allowing users to explore historical trends and patterns.
​
- **Geospatial Information**: The dataset includes geospatial information, enabling the mapping of events and their locations.
​
https://www.gdeltproject.org/
​

The Global Database of Events, Language, and Tone (GDELT) is a comprehensive and continuously updated dataset that monitors and records various global events, news articles, and media sources from around the world. GDELT's primary purpose is to provide a vast repository of structured data that researchers, analysts, and data scientists can use to analyze and gain insights into global events, trends, and sentiments.
                                                                                                                                                                                                               

<a id="section-two-b"></a>
# GDELT Exploratory Data Analysis (EDA)

In [None]:
import pandas as pd

 

# Location of the GDELT export inside the attached Kaggle dataset.
# The spelling "roits" is kept on purpose: it matches the path used by the
# other cells of this notebook.
directory = "/kaggle/input/south-africa-gdelt-riots-2021"
file_name = "Query_roits_2021_test6_allColumns.csv"

# Build the full path from the two pieces above so they cannot drift apart
file_path = f"{directory}/{file_name}"

# Load the dataset into a Pandas DataFrame
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # Print the hint, then re-raise: carrying on would silently reuse whatever
    # `df` an earlier cell left in the kernel.
    print(f"File '{file_path}' not found.")
    raise

# Display the main specifications of the dataset
print("Dataset Specifications:")
print(f"File Path: {file_path}")
print(f"Number of Rows: {len(df)}")
print(f"Number of Columns: {len(df.columns)}")

print("Column Names:")
for column in df.columns:
    print(f" - {column}")

print("Data Types:")
for column, dtype in df.dtypes.items():
    print(f" - {column}: {dtype}")

<a id="section-two-c"></a>
# Total Number of Mentions per day

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the GDELT export
file_path = "/kaggle/input/south-africa-gdelt-riots-2021/Query_roits_2021_test6_allColumns.csv"
df = pd.read_csv(file_path)

# Rows per SQLDATE value, ordered by date rather than by frequency
date_counts = df['SQLDATE'].value_counts().sort_index()

# Bar chart drawn on an explicitly created figure/axes pair
fig, ax = plt.subplots(figsize=(12, 6))
date_counts.plot.bar(ax=ax, color='skyblue')
ax.set_title('Number of articles per day')
ax.set_xlabel('DATE')
ax.set_ylabel('Count')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the GDELT export
file_path = "/kaggle/input/south-africa-gdelt-riots-2021/Query_roits_2021_test6_allColumns.csv"
df = pd.read_csv(file_path)

# Rows per SQLDATE value, ordered by date rather than by frequency
date_counts = df['SQLDATE'].value_counts().sort_index()

# Same counts as the bar chart, drawn as a line
fig, ax = plt.subplots(figsize=(12, 6))
date_counts.plot.line(ax=ax, color='skyblue')
ax.set_title('Number of articles per day')
ax.set_xlabel('DATE')
ax.set_ylabel('Count')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


**Number of Mentions of the Article per day**

**Definition of NumMentions:**
NumMentions=	(NULLABLE;	INTEGER)	This is the total number of mentions of this event across all source documents during the 15 minute update in which it was first seen. Multiple references to an event within a single document also contribute to this count. This can be used as a method of assessing the “importance” of an event: the more discussion of that event, the more likely it is to be significant. The total universe of source documents and the density of events within them vary over time, so it is recommended that this field be normalized by the average or other measure of the universe of events during the time period of interest. This field is actually a composite score of the total number of raw mentions and the number of mentions extracted from reprocessed versions of each article (see the discussion for the Mentions table). NOTE: this field refers only to the first news report to mention an event and is not updated if the event is found in a different context in other news reports. It is included for legacy purposes – for more precise information on the positioning of an event, see the Mentions table.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the GDELT export
file_path = "/kaggle/input/south-africa-gdelt-riots-2021/Query_roits_2021_test6_allColumns.csv"
df = pd.read_csv(file_path)

# Total NumMentions over all rows that share the same SQLDATE
mentions_by_date = df.groupby('SQLDATE')['NumMentions']
date_mentions = mentions_by_date.sum()

# Bar chart of the daily totals
fig, ax = plt.subplots(figsize=(12, 6))
date_mentions.plot.bar(ax=ax, color='skyblue')
ax.set_title('Total Number of NumMentions per Day')
ax.set_xlabel('SQLDATE')
ax.set_ylabel('Total NumMentions')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

<a id="section-two-d"></a>
# GDELT Sentiment Analysis 

**Distribution of sentiment of articles**
This uses the `AvgTone` column to gauge the sentiment.

**Definition of AvgTone:**
AvgTone=	(NULLABLE;	FLOAT)	This is the average “tone” of all documents containing one or more mentions of this event during the 15 minute update in which it was first seen. The score ranges from -100 (extremely negative) to +100 (extremely positive). Common values range between -10 and +10, with 0 indicating neutral. This can be used as a method of filtering the “context” of events as a subtle measure of the importance of an event and as a proxy for the “impact” of that event. For example, a riot event with a slightly negative average tone is likely to have been a minor occurrence, whereas if it had an extremely negative average tone, it suggests a far more serious occurrence. A riot with a positive score likely suggests a very minor occurrence described in the context of a more positive narrative (such as a report of an attack occurring in a discussion of improving conditions on the ground in a country and how the number of attacks per day has been greatly reduced). NOTE: this field refers only to the first news report to mention an event and is not updated if the event is found in a different context in other news reports. It is included for legacy purposes – for more precise information on the positioning of an event, see the Mentions table. NOTE: this provides only a basic tonal assessment of an article and it is recommended that users interested in emotional measures use the Mentions and Global Knowledge Graph tables to merge the complete set of 2,300 emotions and themes from the GKG GCAM system into their analysis of event records. (Source: GDELT database)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
file_path = "/kaggle/input/south-africa-gdelt-riots-2021/Query_roits_2021_test6_allColumns.csv"
df = pd.read_csv(file_path)

# Histogram of AvgTone with bins that are exactly `bin_width` wide.
# Explicit bin edges are used instead of a bin *count*: int((max - min) / width)
# truncates, which makes the bins wider than intended and yields 0 bins (an
# error) when the values span less than one unit.
bin_width = 1.0
avg_tone = df['AvgTone'].dropna()  # AvgTone is nullable; ignore missing values

# Edges run from the whole number at or below the minimum to the whole number
# at or above the maximum, always leaving room for at least one bin.
# `np` is the numpy alias imported in the notebook's first cell.
lower_edge = np.floor(avg_tone.min())
upper_edge = max(np.ceil(avg_tone.max()), lower_edge + bin_width)
# The half-step added to the stop value makes np.arange include the upper edge
bin_edges = np.arange(lower_edge, upper_edge + bin_width / 2, bin_width)

plt.figure(figsize=(12, 6))
plt.hist(avg_tone, bins=bin_edges, color='skyblue')
plt.title('Average sentiment of articles')
plt.xlabel('Average Sentiment')
plt.ylabel('Number of Articles')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset, parsing the SQLDATE column into datetimes at read time
file_path = "/kaggle/input/south-africa-gdelt-riots-2021/Query_roits_2021_test6_allColumns.csv"
df = pd.read_csv(file_path, parse_dates=['SQLDATE'])

# Mean AvgTone over all rows that share the same date
average_sentiment_per_day = df.groupby('SQLDATE')['AvgTone'].mean()

# Plot the average sentiment over time
plt.figure(figsize=(12, 6))
plt.plot(
    average_sentiment_per_day.index,
    average_sentiment_per_day.values,
    marker='o',
    linestyle='-',
    label='Mean AvgTone per day',  # gives plt.legend() below an entry to show
)
plt.title('Average Sentiment per Day Over Time')
plt.xlabel('Date')
plt.ylabel('Average Sentiment (AvgTone)')
plt.grid(True)

# One tick per day from 9 July to 21 July 2021
date_range = pd.date_range(start='2021-07-09', end='2021-07-21', freq='D')
all_dates = date_range.tolist()

# Tick positions, date-only labels and the label rotation are set in one call
plt.xticks(all_dates, [date.strftime('%Y-%m-%d') for date in all_dates], rotation=80)

# Show the plot
plt.tight_layout()
plt.legend()
plt.show()


<a id="section-three"></a>
# **3. Google Trends Data Analysis**

For the Google Trends analysis conducted, we focused on the search interest of three key keywords:

1. **Looting**
2. **State of Emergency**
3. **Protest**

These keywords were selected to examine the search interest and popularity trends surrounding the 2021 South African unrest.

<a id="section-three-b"></a>
## Web Search

[Google Trends Analysis: Looting,State of Emergency, Protest](https://trends.google.com/trends/explore?date=2021-07-09%202021-07-21&geo=ZA&q=Looting,State%20of%20Emergency,Protest)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO

# Daily Google Trends web-search interest, typed in as CSV text.
# Some cells hold the text "<1" instead of a number.
csv_data = """Day,Looting_web,State of emergency_web,Protest_web
2021-07-09,<1,<1,5
2021-07-10,1,0,5
2021-07-11,7,<1,11
2021-07-12,68,12,23
2021-07-13,100,16,13
2021-07-14,73,25,7
2021-07-15,44,5,2
2021-07-16,28,3,2
2021-07-17,17,1,1
2021-07-18,13,1,2
2021-07-19,11,<1,1
2021-07-20,8,<1,<1
2021-07-21,6,<1,1"""

# Create a DataFrame from the CSV data
df = pd.read_csv(StringIO(csv_data))

# Columns containing "<1" are read as text. Map "<1" to 0.5 and convert EVERY
# keyword column to numbers, so none of them is left holding numeric strings.
trend_columns = ['Looting_web', 'State of emergency_web', 'Protest_web']
for trend_column in trend_columns:
    df[trend_column] = pd.to_numeric(df[trend_column].replace('<1', 0.5))

# Convert "Day" column to datetime
df['Day'] = pd.to_datetime(df['Day'])

# One two-column frame (Day + one keyword) per search term
looting_web_df = pd.DataFrame({'Day': df['Day'], 'Looting Web': df['Looting_web']})
state_of_emergency_web_df = pd.DataFrame({'Day': df['Day'], 'State of Emergency Web': df['State of emergency_web']})
protest_web_df = pd.DataFrame({'Day': df['Day'], 'Protest Web': df['Protest_web']})

fig, ax = plt.subplots(figsize=(12, 6))

# One line per search term; the legend labels keep literal quotes around each term
web_series = [
    ('Looting_web', '"Looting"'),
    ('State of emergency_web', '"State of Emergency"'),
    ('Protest_web', '"Protest"'),
]
for column_name, legend_label in web_series:
    ax.plot(df['Day'], df[column_name], label=legend_label, marker='o')

# Fixed y-axis range from 0 to 105
ax.set_ylim(0, 105)

# One tick per day in the data
plt.xticks(df['Day'], rotation=45)

# Dashed vertical markers for key dates
dates_to_highlight = ["2021-07-09", "2021-07-12"]
title_of_dates = ["Start of Unrest", "SANDF deployed"]
colors = ["red", "green"]

for date_to_highlight, color, title in zip(dates_to_highlight, colors, title_of_dates):
    marker_position = pd.to_datetime(date_to_highlight)
    ax.axvline(marker_position, color=color, linestyle='--', label=title)

# Single legend covering both the trend lines and the date markers
legend = ax.legend(loc='upper left')

ax.set_xlabel('Date')
ax.set_ylabel('Search Interest')
ax.set_title('Web Search Trends Data (July 9, 2021 - July 21, 2021)')

plt.tight_layout()
plt.show()


<a id="section-three-c"></a>
## **News Search**

[Google Trends Analysis:Looting, State of Emergency, Protest](https://trends.google.com/trends/explore?date=2021-07-09%202021-07-21&geo=ZA&gprop=news&q=Looting,State%20of%20Emergency,Protest) 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO

# Daily Google Trends news-search interest, typed in as CSV text
csv_data = """Day,Looting_news,State of emergency_news,Protest_news
2021-07-09,0,0,9
2021-07-10,0,0,12
2021-07-11,0,6,15
2021-07-12,55,23,25
2021-07-13,100,21,25
2021-07-14,54,12,0
2021-07-15,30,0,39
2021-07-16,22,0,0
2021-07-17,0,0,7
2021-07-18,0,0,0
2021-07-19,13,0,0
2021-07-20,11,10,0
2021-07-21,12,0,14
"""

# Read the text into a DataFrame, parsing "Day" into datetimes while reading
df = pd.read_csv(StringIO(csv_data), parse_dates=['Day'])

# One two-column frame (Day + one keyword) per search term
looting_news_df = df[['Day', 'Looting_news']].rename(columns={'Looting_news': 'Looting News'})
state_of_emergency_news_df = df[['Day', 'State of emergency_news']].rename(
    columns={'State of emergency_news': 'State of Emergency News'}
)
protest_news_df = df[['Day', 'Protest_news']].rename(columns={'Protest_news': 'Protest News'})

fig, ax = plt.subplots(figsize=(12, 6))

# One line per search term
news_series = [
    ('Looting_news', 'Looting'),
    ('State of emergency_news', 'State of Emergency'),
    ('Protest_news', 'Protest'),
]
for column_name, legend_label in news_series:
    ax.plot(df['Day'], df[column_name], label=legend_label, marker='o')

# y-axis from 0 to a little above the largest value across the three series
largest_value = df[['Looting_news', 'State of emergency_news', 'Protest_news']].max().max()
ax.set_ylim(0, largest_value + 5)

# One tick per day in the data
plt.xticks(df['Day'], rotation=45)

# Dashed vertical markers for key dates
dates_to_highlight = ["2021-07-09", "2021-07-12"]
title_of_dates = ["Start of Unrest", "SANDF deployed"]
colors = ["red", "green"]

for date_to_highlight, color, title in zip(dates_to_highlight, colors, title_of_dates):
    marker_position = pd.to_datetime(date_to_highlight)
    ax.axvline(marker_position, color=color, linestyle='--', label=title)

# Single legend covering both the trend lines and the date markers
legend = ax.legend(loc='upper left')

ax.set_xlabel('Date')
ax.set_ylabel('Search Interest')
ax.set_title('News Search Trends Data (July 9, 2021 - July 21, 2021)')

plt.tight_layout()
plt.show()


<a id="section-four"></a>
# **4. Pattern of Interest Comparisons**


<a id="section-four-a"></a>
## Comparison of Number of Tweets and GDELT Articles per Day

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter

# tweets_per_day is expected to exist already (it is not created in this cell).

# Read the GDELT export.
file_path = "/kaggle/input/south-africa-gdelt-riots-2021/Query_roits_2021_test6_allColumns.csv"
df = pd.read_csv(file_path)

# Number of rows per SQLDATE, in chronological order.
date_counts = df['SQLDATE'].value_counts().sort_index()

# SQLDATE is parsed with the YYYYMMDD pattern so the index becomes real datetimes.
date_counts.index = pd.to_datetime(date_counts.index, format='%Y%m%d')

# One figure for both raw daily series.
fig, ax = plt.subplots(figsize=(12, 6))

# Daily tweet counts.
ax.plot(tweets_per_day.index, tweets_per_day.values, marker='o', label='Number of Tweets per Day', color='skyblue')

# Daily GDELT row counts.
ax.plot(date_counts.index, date_counts.values, marker='o', label='Number of GDELT Articles per Day', color='orange')

# Print tick labels as plain YYYY-MM-DD dates.
date_formatter = DateFormatter('%Y-%m-%d')
ax.xaxis.set_major_formatter(date_formatter)

# Place one tick on every day from 9 July to 18 July 2021.
specific_dates = [pd.Timestamp('2021-07-09')]
date_range = pd.date_range(start='2021-07-10', end='2021-07-18', freq='D')
all_dates = [*specific_dates, *date_range]
ax.set_xticks(all_dates)

# Vertical tick labels keep the dates readable.
ax.tick_params(axis='x', rotation=90)

# Axis captions, heading and legend.
ax.set(xlabel='Date', ylabel='Count', title='Number of Tweets and GDELT Articles per Day')
ax.legend()

fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter

# tweets_per_day is expected to exist already (it is not created in this cell).

# Read the GDELT export.
file_path = "/kaggle/input/south-africa-gdelt-riots-2021/Query_roits_2021_test6_allColumns.csv"
df = pd.read_csv(file_path)

# Rows per SQLDATE in chronological order, re-indexed by real datetimes
# (SQLDATE is parsed with the YYYYMMDD pattern).
date_counts = df['SQLDATE'].value_counts().sort_index()
date_counts.index = pd.to_datetime(date_counts.index, format='%Y%m%d')

# Divide each series by its own peak so that both top out at 1.
max_value_tweets = tweets_per_day.max()
normalized_tweets = tweets_per_day.div(max_value_tweets)

max_value_articles = date_counts.max()
normalized_date_counts = date_counts.div(max_value_articles)

# One figure; both normalized curves share the same y-axis.
fig, ax = plt.subplots(figsize=(12, 6))

ax.plot(tweets_per_day.index, normalized_tweets.values, marker='o', label='Normalized Number of Tweets per Day', color='skyblue')
ax.plot(date_counts.index, normalized_date_counts.values, marker='o', label='Normalized Number of GDELT Articles per Day', color='orange')

# One tick per day from 9 July to 21 July 2021, with vertical labels.
date_range = pd.date_range(start='2021-07-09', end='2021-07-21', freq='D')
ax.set_xticks(date_range)
ax.tick_params(axis='x', rotation=90)

# Key events, drawn as dashed vertical markers (extend these lists to add more).
dates_to_highlight = ["2021-07-09", "2021-07-12"]
title_of_dates = ["Start of Unrest", "SANDF deployed"]
colors = ["red", "green"]

for date_to_highlight, title, color in zip(dates_to_highlight, title_of_dates, colors):
    ax.axvline(pd.to_datetime(date_to_highlight), color=color, linestyle='--', label=title)

# Axis captions and heading.
ax.set(
    xlabel='Date',
    ylabel='Normalized Count',
    title='Comparison of Normalized Tweets and Normalized GDELT Articles per Day',
)

# Single legend covering both curves and the event markers.
ax.legend(loc='upper right')

fig.tight_layout()
plt.show()

<a id="section-four-b"></a>
## Comparison of Normalized Tweets and Normalized Google Trends Web Search Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO

# Daily web-search interest for three search terms, kept inline as CSV text.
# Readings reported as "<1" were replaced by hand with 0.5.
csv_data = """Day,Looting_web,State of emergency_web,Protest_web
2021-07-09,0.5,0.5,5
2021-07-10,1,0,5
2021-07-11,7,0.5,11
2021-07-12,68,12,23
2021-07-13,100,16,13
2021-07-14,73,25,7
2021-07-15,44,5,2
2021-07-16,28,3,2
2021-07-17,17,1,1
2021-07-18,13,1,2
2021-07-19,11,0.5,1
2021-07-20,8,0.5,0.5
2021-07-21,6,0.5,1"""

# Parse the text and turn the "Day" strings into datetimes in one step.
df = pd.read_csv(StringIO(csv_data)).assign(Day=lambda frame: pd.to_datetime(frame['Day']))

# Rescale every search-term column so that its own peak equals 1.
df['Looting_web'] = df['Looting_web'].div(df['Looting_web'].max())
df['State of emergency_web'] = df['State of emergency_web'].div(df['State of emergency_web'].max())
df['Protest_web'] = df['Protest_web'].div(df['Protest_web'].max())

# A separate (Day, normalized value) frame for each search term.
looting_web_df = pd.DataFrame({
    'Day': df['Day'],
    'Looting Web (Normalized)': df['Looting_web'],
})
state_of_emergency_web_df = pd.DataFrame({
    'Day': df['Day'],
    'State of Emergency Web (Normalized)': df['State of emergency_web'],
})
protest_web_df = pd.DataFrame({
    'Day': df['Day'],
    'Protest Web (Normalized)': df['Protest_web'],
})

# unrest_tweets2 is expected to exist already (it is not created in this cell).
# Parse its "Tweet Created At" values into a datetime column called 'date'.
unrest_tweets2['date'] = pd.to_datetime(unrest_tweets2["Tweet Created At"])

# Number of tweets in each calendar day.
tweets_per_day = unrest_tweets2.resample('D', on='date').size()

# Scale the daily counts so the busiest day equals 1.
tweets_per_day_normalized = tweets_per_day / tweets_per_day.max()

# One figure for the three web-search curves plus the tweet curve.
fig, ax = plt.subplots(figsize=(12, 6))

# All four series are already normalized to a peak of 1.
ax.plot(df['Day'], df['Looting_web'], label='Looting Web (Normalized)', marker='o')
ax.plot(df['Day'], df['State of emergency_web'], label='State of Emergency Web (Normalized)', marker='o')
ax.plot(df['Day'], df['Protest_web'], label='Protest Web (Normalized)', marker='o')
ax.plot(tweets_per_day_normalized.index, tweets_per_day_normalized.values, label='Tweets per Day (Normalized)', marker='o')

# Normalized values top out at 1; keep a little headroom above that.
ax.set_ylim(0, 1.1)

# One tick per day present in the web-search data, tilted for readability.
ax.set_xticks(df['Day'])
ax.tick_params(axis='x', rotation=45)

# Key events, drawn as dashed vertical markers (extend these lists to add more).
dates_to_highlight = ["2021-07-09", "2021-07-12"]
title_of_dates = ["Start of Unrest", "SANDF deployed"]
colors = ["red", "green"]

for date_to_highlight, title, color in zip(dates_to_highlight, title_of_dates, colors):
    ax.axvline(pd.to_datetime(date_to_highlight), color=color, linestyle='--', label=title)

# Curves and event markers share a single legend.
legend = ax.legend(loc='upper right')

# Axis captions and heading.
ax.set_xlabel('Date')
ax.set_ylabel('Normalized Values')
ax.set_title('Normalized Data (Web Search Trends and Tweets)')

# Render the figure.
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import ipywidgets as widgets
from ipywidgets import interact

# This cell parses inline CSV text with StringIO, so import it here with the
# cell's other imports instead of relying on an earlier cell having done so.
from io import StringIO

# Daily Google Trends web-search interest for three search terms, kept inline
# as CSV text. Readings reported as "<1" were replaced by hand with 0.5.
csv_data = """Day,Looting_web,State of emergency_web,Protest_web
2021-07-09,0.5,0.5,5
2021-07-10,1,0,5
2021-07-11,7,0.5,11
2021-07-12,68,12,23
2021-07-13,100,16,13
2021-07-14,73,25,7
2021-07-15,44,5,2
2021-07-16,28,3,2
2021-07-17,17,1,1
2021-07-18,13,1,2
2021-07-19,11,0.5,1
2021-07-20,8,0.5,0.5
2021-07-21,6,0.5,1"""

# Parse the inline CSV into a table.
df = pd.read_csv(StringIO(csv_data))

# Turn the "Day" strings into datetimes so they plot on a time axis.
df['Day'] = pd.to_datetime(df['Day'])

# Rescale every search-term column so that its own peak equals 1.
df['Looting_web'] = df['Looting_web'] / df['Looting_web'].max()
df['State of emergency_web'] = df['State of emergency_web'] / df['State of emergency_web'].max()
df['Protest_web'] = df['Protest_web'] / df['Protest_web'].max()

# A separate (Day, normalized value) frame for each search term.
looting_web_df = pd.DataFrame({'Day': df['Day'], 'Looting Web (Normalized)': df['Looting_web']})
state_of_emergency_web_df = pd.DataFrame({'Day': df['Day'], 'State of Emergency Web (Normalized)': df['State of emergency_web']})
protest_web_df = pd.DataFrame({'Day': df['Day'], 'Protest Web (Normalized)': df['Protest_web']})
# Create a function to update the plot based on the selected datasets
def update_plot(show_tweets, show_looting, show_state, show_protest):
    """Draw the normalized tweets / web-search comparison for the selected curves.

    A new 12x6 figure is created and shown on every call; nothing is returned.
    The data comes from the notebook-level ``tweets_per_day``,
    ``tweets_per_day_normalized`` and ``df`` objects.

    Args:
        show_tweets: truthy to draw the normalized tweets-per-day curve.
        show_looting: truthy to draw the normalized "Looting" web-search curve.
        show_state: truthy to draw the normalized "State of emergency" web-search curve.
        show_protest: truthy to draw the normalized "Protest" web-search curve.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # (enabled?, data lookup, legend label, line colour) for each optional curve.
    # Lookups are wrapped in lambdas so an unselected series is never touched.
    curves = (
        (show_tweets, lambda: (tweets_per_day.index, tweets_per_day_normalized),
         'Tweets per Day (Normalized)', 'skyblue'),
        (show_looting, lambda: (df['Day'], df['Looting_web']),
         'Looting Web (Normalized)', 'orange'),
        (show_state, lambda: (df['Day'], df['State of emergency_web']),
         'State of Emergency Web (Normalized)', 'green'),
        (show_protest, lambda: (df['Day'], df['Protest_web']),
         'Protest Web (Normalized)', 'red'),
    )
    for enabled, fetch_xy, curve_label, curve_color in curves:
        if enabled:
            xs, ys = fetch_xy()
            ax.plot(xs, ys, marker='o', label=curve_label, color=curve_color)

    # Key events, drawn as dashed vertical markers (add more tuples as needed).
    events = (
        ("2021-07-09", "purple", "Start of Unrest"),
        ("2021-07-12", "yellow", "SANDF deployed"),
    )
    for day, event_color, event_label in events:
        ax.axvline(pd.to_datetime(day), color=event_color, linestyle='--', label=event_label)

    # Curves and event markers share a single legend.
    ax.legend(loc='upper left')

    # One tick per day from 9 July to 21 July 2021, tilted for readability.
    ax.set_xticks(pd.date_range(start='2021-07-09', end='2021-07-21', freq='D'))
    ax.tick_params(axis='x', rotation=45)

    # Axis captions and heading.
    ax.set(
        xlabel='Date',
        ylabel='Normalized Values',
        title='Normalized Data (Web Search Trends and Tweets)',
    )

    fig.tight_layout()
    plt.show()

# One initially-unticked checkbox per curve that update_plot can draw.
show_tweets_checkbox = widgets.Checkbox(value=False, description='Show Tweets per day', disabled=False)
show_looting_checkbox = widgets.Checkbox(value=False, description='Show Looting Web Signature', disabled=False)
show_state_checkbox = widgets.Checkbox(value=False, description='Show State of Emergency Web Signature', disabled=False)
show_protest_checkbox = widgets.Checkbox(value=False, description='Show "Protest" Web Signature ', disabled=False)

# Hand the checkboxes to interact so update_plot is called with their values.
interact(
    update_plot,
    show_tweets=show_tweets_checkbox,
    show_looting=show_looting_checkbox,
    show_state=show_state_checkbox,
    show_protest=show_protest_checkbox,
)


<a id="section-four-c"></a>
## Comparison of Normalized GDELT Articles and Normalized Google Trends News Search Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO

# Daily news-search interest for three search terms, kept inline as CSV text.
csv_data = """Day,Looting_news,State of emergency_news,Protest_news
2021-07-09,0,0,9
2021-07-10,0,0,12
2021-07-11,0,6,15
2021-07-12,55,23,25
2021-07-13,100,21,25
2021-07-14,54,12,0
2021-07-15,30,0,39
2021-07-16,22,0,0
2021-07-17,0,0,7
2021-07-18,0,0,0
2021-07-19,13,0,0
2021-07-20,11,10,0
2021-07-21,12,0,14
"""

# Parse the text and turn the "Day" strings into datetimes in one step.
df = pd.read_csv(StringIO(csv_data)).assign(Day=lambda frame: pd.to_datetime(frame['Day']))

# One (Day, value) frame per search term, with the value column already
# divided by that term's own peak. The raw numbers stay untouched in df.
looting_news_df = pd.DataFrame({
    'Day': df['Day'],
    'Looting News': df['Looting_news'] / df['Looting_news'].max(),
})
state_of_emergency_news_df = pd.DataFrame({
    'Day': df['Day'],
    'State of Emergency News': df['State of emergency_news'] / df['State of emergency_news'].max(),
})
protest_news_df = pd.DataFrame({
    'Day': df['Day'],
    'Protest News': df['Protest_news'] / df['Protest_news'].max(),
})

# Read the GDELT export.
file_path = "/kaggle/input/south-africa-gdelt-riots-2021/Query_roits_2021_test6_allColumns.csv"
df_gdelt = pd.read_csv(file_path)

# Rows per SQLDATE in chronological order, re-indexed by real datetimes
# (SQLDATE is parsed with the YYYYMMDD pattern).
date_counts = df_gdelt['SQLDATE'].value_counts().sort_index()
date_counts.index = pd.to_datetime(date_counts.index, format='%Y%m%d')

# Scale the daily counts so the busiest day equals 1.
date_counts_normalized = date_counts.div(date_counts.max())

# One figure for the three news-search curves plus the GDELT curve.
fig, ax = plt.subplots(figsize=(12, 6))

# All four series are already normalized to a peak of 1.
ax.plot(df['Day'], looting_news_df['Looting News'], label='Looting News', marker='o')
ax.plot(df['Day'], state_of_emergency_news_df['State of Emergency News'], label='State of Emergency News', marker='o')
ax.plot(df['Day'], protest_news_df['Protest News'], label='Protest News', marker='o')
ax.plot(date_counts_normalized.index, date_counts_normalized.values, label='GDELT Articles (Normalized)', marker='o', color='blue')

# Normalized values top out at 1; keep a little headroom above that.
ax.set_ylim(0, 1.1)

# One tick per day present in the news-search data, tilted for readability.
ax.set_xticks(df['Day'])
ax.tick_params(axis='x', rotation=45)

# Key events, drawn as dashed vertical markers (extend these lists to add more).
dates_to_highlight = ["2021-07-09", "2021-07-12"]
title_of_dates = ["Start of Unrest", "SANDF deployed"]
colors = ["red", "green"]

for date_to_highlight, title, color in zip(dates_to_highlight, title_of_dates, colors):
    ax.axvline(pd.to_datetime(date_to_highlight), color=color, linestyle='--', label=title)

# Curves and event markers share a single legend.
legend = ax.legend(loc='upper left')

# Axis captions and heading.
ax.set_xlabel('Date')
ax.set_ylabel('Normalized Values')
ax.set_title('Comparison of Normalized Data (News Search Trends and GDELT Articles)')

# Render the figure.
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO
import ipywidgets as widgets
from ipywidgets import interact

# Daily news-search interest for three search terms, kept inline as CSV text.
csv_data = """Day,Looting_news,State of emergency_news,Protest_news
2021-07-09,0,0,9
2021-07-10,0,0,12
2021-07-11,0,6,15
2021-07-12,55,23,25
2021-07-13,100,21,25
2021-07-14,54,12,0
2021-07-15,30,0,39
2021-07-16,22,0,0
2021-07-17,0,0,7
2021-07-18,0,0,0
2021-07-19,13,0,0
2021-07-20,11,10,0
2021-07-21,12,0,14
"""

# Parse the text and turn the "Day" strings into datetimes in one step.
df = pd.read_csv(StringIO(csv_data)).assign(Day=lambda frame: pd.to_datetime(frame['Day']))

# One (Day, value) frame per search term, with the value column already
# divided by that term's own peak. The raw numbers stay untouched in df.
looting_news_df = pd.DataFrame({
    'Day': df['Day'],
    'Looting News': df['Looting_news'] / df['Looting_news'].max(),
})
state_of_emergency_news_df = pd.DataFrame({
    'Day': df['Day'],
    'State of Emergency News': df['State of emergency_news'] / df['State of emergency_news'].max(),
})
protest_news_df = pd.DataFrame({
    'Day': df['Day'],
    'Protest News': df['Protest_news'] / df['Protest_news'].max(),
})

# Read the GDELT export.
file_path = "/kaggle/input/south-africa-gdelt-riots-2021/Query_roits_2021_test6_allColumns.csv"
df_gdelt = pd.read_csv(file_path)

# Rows per SQLDATE in chronological order, re-indexed by real datetimes
# (SQLDATE is parsed with the YYYYMMDD pattern).
date_counts = df_gdelt['SQLDATE'].value_counts().sort_index()
date_counts.index = pd.to_datetime(date_counts.index, format='%Y%m%d')

# Scale the daily counts so the busiest day equals 1.
date_counts_normalized = date_counts.div(date_counts.max())

# Create a function to update the plot based on the selected dataframes
def update_plot(show_looting, show_state, show_protest, show_gdelt):
    """Draw the normalized news-search / GDELT comparison for the selected curves.

    A new 12x6 figure is created and shown on every call; nothing is returned.
    The data comes from the notebook-level ``df``, ``looting_news_df``,
    ``state_of_emergency_news_df``, ``protest_news_df`` and
    ``date_counts_normalized`` objects.

    Every curve and event marker has its own explicit colour, following the
    palette of the interactive web-search plot, so no two legend entries
    share a colour.

    Args:
        show_looting: truthy to draw the normalized "Looting" news-search curve.
        show_state: truthy to draw the normalized "State of emergency" news-search curve.
        show_protest: truthy to draw the normalized "Protest" news-search curve.
        show_gdelt: truthy to draw the normalized GDELT daily-count curve.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # (enabled?, data lookup, legend label, line colour) for each optional curve.
    # Lookups are wrapped in lambdas so an unselected series is never touched.
    curves = (
        (show_looting, lambda: (df['Day'], looting_news_df['Looting News']),
         'Looting News (Normalized)', 'orange'),
        (show_state, lambda: (df['Day'], state_of_emergency_news_df['State of Emergency News']),
         'State of Emergency News (Normalized)', 'green'),
        # Explicit colour: the default cycle colour is a blue that is easily
        # confused with the GDELT curve.
        (show_protest, lambda: (df['Day'], protest_news_df['Protest News']),
         'Protest News (Normalized)', 'red'),
        (show_gdelt, lambda: (date_counts_normalized.index, date_counts_normalized.values),
         'GDELT Articles (Normalized)', 'blue'),
    )
    for enabled, fetch_xy, curve_label, curve_color in curves:
        if enabled:
            xs, ys = fetch_xy()
            ax.plot(xs, ys, marker='o', label=curve_label, color=curve_color)

    # Normalized values top out at 1; keep a little headroom above that.
    ax.set_ylim(0, 1.1)

    # One tick per day present in the news-search data, tilted for readability.
    ax.set_xticks(df['Day'])
    ax.tick_params(axis='x', rotation=45)

    # Key events, drawn as dashed vertical markers (add more tuples as needed).
    # Purple/yellow keep the markers distinct from the red and green curves.
    events = (
        ("2021-07-09", "purple", "Start of Unrest"),
        ("2021-07-12", "yellow", "SANDF deployed"),
    )
    for day, event_color, event_label in events:
        ax.axvline(pd.to_datetime(day), color=event_color, linestyle='--', label=event_label)

    # Curves and event markers share a single legend.
    ax.legend(loc='upper left')

    # Axis captions and heading.
    ax.set(
        xlabel='Date',
        ylabel='Normalized Values',
        title='Comparison of Normalized Data (News Search Trends and GDELT Articles)',
    )

    fig.tight_layout()
    plt.show()

# One initially-unticked checkbox per curve that update_plot can draw.
show_looting_checkbox = widgets.Checkbox(value=False, description='Show Looting News', disabled=False)
show_state_checkbox = widgets.Checkbox(value=False, description='Show State of Emergency News', disabled=False)
show_protest_checkbox = widgets.Checkbox(value=False, description='Show Protest News', disabled=False)
show_gdelt_checkbox = widgets.Checkbox(value=False, description='Show GDELT Articles', disabled=False)

# Hand the checkboxes to interact so update_plot is called with their values.
interact(
    update_plot,
    show_looting=show_looting_checkbox,
    show_state=show_state_checkbox,
    show_protest=show_protest_checkbox,
    show_gdelt=show_gdelt_checkbox,
)


<a id="section-five"></a>
# **5. Mathematical Framework**

