# Importing libraries and connecting to the database

In [1]:
from pymongo import MongoClient
import json
import re
import csv
from bson import ObjectId

# Connection string of the local MongoDB server and the client built from it
url = "mongodb://localhost:27017/"
client = MongoClient(url)

# Handle to the database used throughout the notebook
db = client["YT_db"]

# Round-trip a ping so that connection problems are reported here rather
# than at the first query; any failure is printed, not raised.
try:
    client.admin.command('ping')
except Exception as e:
    print(e)
else:
    print("Pinged your deployment. You successfully connected to MongoDB!")

Pinged your deployment. You successfully connected to MongoDB!


# Reading the previously created Twitter collection

In [2]:
# Handle to the existing tweet collection (nothing is queried until it is used)
tw_collection = db["tw_collection"]

# Creating a collection for the test YouTube videos

In [5]:
yt_test_collection = db.yt_test_collection

# Read every CSV row as a dict keyed by the header line.
# newline='' lets the csv module do its own line-ending handling, as its
# documentation recommends for files passed to a reader.
with open('y_test.csv', 'r', newline='') as file:
    rows = list(csv.DictReader(file))

# Load only into an empty collection: re-running this cell would otherwise
# insert every video a second time and leave duplicate documents behind.
if yt_test_collection.find_one() is not None:
    print("yt_test_collection already contains documents; skipping CSV load "
          "(drop the collection first to reload it).")
elif rows:
    # insert_many rejects an empty list, hence the `rows` check; a single
    # bulk insert also avoids one server round-trip per row.
    yt_test_collection.insert_many(rows)

# Matching each YouTube video to the list of tweets referring to it

In [7]:
# Collect the 'id' value of every document in yt_test_collection
# (only the 'id' field is projected).
id_values = [doc['id'] for doc in yt_test_collection.find({}, {'id': 1})]

# Number of ids combined into a single regex query
batch_size = 1000

# Split id_values into consecutive batches of at most batch_size ids
id_batches = [id_values[i:i+batch_size] for i in range(0, len(id_values), batch_size)]

# Map each id to the set of tweet ids whose URLs mention it
# (a set, so a tweet is recorded once per id even if several URLs match)
id_tweet_ids = {id_val: set() for id_val in id_values}

# Iterate over each batch and collect the matching tweet ids
for batch_index, id_batch in enumerate(id_batches):
    # One alternation per batch; ids are escaped so they are matched literally
    pattern = "|".join(map(re.escape, id_batch))

    # Find tweets whose expanded_url array has an element matching any id of the batch
    results = tw_collection.find(
        {"expanded_url": {"$elemMatch": {"$regex": pattern}}},
        {"tweetid": 1, "expanded_url": 1}
    )

    # The query only tells us that *some* id of the batch matched, so every
    # URL is re-checked against every id to attribute the tweet correctly.
    for doc in results:
        tweet_id = doc["tweetid"]
        for expanded_url in doc["expanded_url"]:
            # NOTE(review): entries are assumed to be strings; anything else
            # (e.g. None) would make the `in` test raise, so it is skipped.
            if not isinstance(expanded_url, str):
                continue
            for id_val in id_batch:
                if id_val in expanded_url:
                    id_tweet_ids[id_val].add(tweet_id)

    # Print progress
    print(f"Processed batch {batch_index+1}/{len(id_batches)}")

# Convert the sets to lists once; the same lists are written to MongoDB and
# to the JSON file (json.dump cannot serialise sets).
id_tweet_ids_list = {k: list(v) for k, v in id_tweet_ids.items()}

# Store the tweet ids on the video documents. update_many (rather than
# update_one) so that every document sharing an id receives the field,
# not just the first one found.
for id_val, tweet_ids in id_tweet_ids_list.items():
    yt_test_collection.update_many(
        {"id": id_val},
        {"$set": {"tweet_ids": tweet_ids}}
    )

# Print completion message
print("Matching tweet_ids updated in yt_collection.")

# Save the id -> tweet ids mapping to a file
with open("id_tweet_test_ids.json", "w") as file:
    json.dump(id_tweet_ids_list, file)

print("id_tweet_ids saved to file.")

Processed batch 1/2
Processed batch 2/2
Matching tweet_ids updated in yt_collection.
id_tweet_ids saved to file.


# Saving the test collection to a JSON file

In [8]:
# JSON encoder that knows how to serialise MongoDB ObjectId values
class JSONEncoder(json.JSONEncoder):
    """json.JSONEncoder subclass that writes ObjectId values as strings."""

    def default(self, o):
        """Return ``str(o)`` for an ObjectId; defer any other type to the base class."""
        return str(o) if isinstance(o, ObjectId) else super().default(o)

# Destination of the exported collection
output_file = "y_test_collection.json"

# Materialise the whole collection in memory before writing it out
all_documents = list(yt_test_collection.find())

# Serialise with the custom encoder so that the ObjectId `_id` values
# are written as plain strings
with open(output_file, "w") as json_out:
    json.dump(all_documents, json_out, cls=JSONEncoder)


In [9]:
# Show one stored document as a sanity check of the resulting schema
yt_test_collection.find_one({})

{'_id': ObjectId('6471c55c3286bb1167a527bf'),
 'id': '7A9D8QfCEKQ',
 'lista_url': "['https://youtu.be/7A9D8QfCEKQ']",
 'tweet_ids': ['1320856307394473984',
  '1321399262551265280',
  '1320914155566731265',
  '1321000433779904517',
  '1321616034999291904',
  '1321400816218050560',
  '1320810069714472966',
  '1320810766711312387']}