# Vannevar Labs Dataset for NatSec Hackathon 2024

Thanks for participating in the hackathon! This notebook is available at https://vl-nat-sec-hackathon-may-2024.s3.us-east-2.amazonaws.com/vl-data-download.ipynb, and we will be updating it over the course of the week with additional data and resources.

The dataset we are providing are of Russian social media posts from Telegram and VK that are related to current geopolitical events, a lot of them specifically about events occurring in Ukraine. The data in `attachment_urls` are media files that we will be providing in the same s3 bucket later this week.

If there are any issues with data access please email charu@vannevarlabs.com

In [27]:
!pip install boto3 botocore pandas

In [1]:
import boto3
import pandas as pd
from io import BytesIO
from botocore import UNSIGNED
from botocore.config import Config


# # Create a boto3 session with an anonymous user
# s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

# bucket_name = 'vl-nat-sec-hackathon-may-2024'
# file_key = 'russia_social_media.csv'

# # Get the object from S3
# response = s3.get_object(Bucket=bucket_name, Key=file_key)

# file_content = response['Body'].read()

# Also available here: https://vl-nat-sec-hackathon-may-2024.s3.us-east-2.amazonaws.com/russia_social_media.csv



In [2]:
# Load the file content into a pandas DataFrame
readdata = pd.read_csv('../../deftech/russia_social_media.csv', nrows=10000)


In [None]:
# Write the first 20 rows of the DataFrame to a CSV file
readdata.head(20).to_csv('first_20_rows.csv', index=False)


In [None]:
res = readdata[readdata['translation'].str.contains('S-300', na=False)]['translation']
import json

# JSON encode the 'res' Series and print
# print(json.dumps(res.head(20).replace("\n", "", regex=True).to_list()))


In [None]:
[print(x, readdata['translation'][x].replace("\n", "") + "\n\n") for x in range(300, 330) ]


In [None]:
res = res.head(20)

In [None]:
[x for x in list(readdata['translation'][:10]) if x != nan]

NameError: name 'nan' is not defined

In [None]:
len(res['translation'])

TypeError: list indices must be integers or slices, not str

In [4]:
import math
text_data = list(readdata['translation'])
text_data = [x for x in text_data if isinstance(x, str) or not math.isnan(x)]



In [17]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import math

from llm import LLM
model = LLM()
# text_data = res
chunk_size = 1000
embeds = []
for start in range(0, len(text_data), chunk_size):
    end = start + chunk_size
    chunk = text_data[start:end]
    filtered_chunk = [x for x in chunk if isinstance(x, str) or not math.isnan(x)]
    embeds.extend([x.embedding for x in model.embed(filtered_chunk).data])
    

In [None]:
embeddings_list = [embed.embedding for embed in embeds.data]

In [19]:
info = cosine_similarity(embeds)



In [None]:
info

array([[1.        , 0.38237012, 0.19522703, ..., 0.33655378, 0.17889909,
        0.18209776],
       [0.38237012, 1.        , 0.32133631, ..., 0.40671039, 0.33983604,
        0.11419932],
       [0.19522703, 0.32133631, 1.        , ..., 0.27212207, 0.24539729,
        0.12478847],
       ...,
       [0.33655378, 0.40671039, 0.27212207, ..., 1.        , 0.26721802,
        0.24380792],
       [0.17889909, 0.33983604, 0.24539729, ..., 0.26721802, 1.        ,
        0.08409598],
       [0.18209776, 0.11419932, 0.12478847, ..., 0.24380792, 0.08409598,
        1.        ]])

In [None]:
res = list(res)

In [24]:
clustered = {}
used_js = set()
for i, row in enumerate(info):
    clustered[i] = []
    for j, value in enumerate(row):
        if value > 0.9 and i != j and i not in used_js:
            clustered[i].append(j)
            used_js.add(j)


In [26]:
for index, similar_indices in clustered.items():
    if len(similar_indices) > 2:
        print(f"Text at index {index}:")
        print(text_data[index].replace("\n", ""))
        for similar_index in similar_indices:
            print(f"Similar text at index {similar_index}:")
            print(text_data[similar_index].replace("\n", ""))
        print("\n" + "-"*80 + "\n")

Text at index 0:

--------------------------------------------------------------------------------

Text at index 1:

--------------------------------------------------------------------------------

Text at index 2:

--------------------------------------------------------------------------------

Text at index 3:

--------------------------------------------------------------------------------

Text at index 4:

--------------------------------------------------------------------------------

Text at index 5:

--------------------------------------------------------------------------------

Text at index 6:

--------------------------------------------------------------------------------

Text at index 7:

--------------------------------------------------------------------------------

Text at index 8:

--------------------------------------------------------------------------------

Text at index 9:

--------------------------------------------------------------------------------

