# Libraries import and database connection

In [None]:
from pymongo import MongoClient
import json
from bson import ObjectId
import csv
import pandas as pd
import ast
import sys
# Raise the csv module's per-field size limit as high as the platform allows.
# sys.maxsize does not fit in a C long on some platforms (e.g. 64-bit Windows),
# where csv.field_size_limit raises OverflowError, so shrink the value until
# it is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 10

# Never truncate long cell contents when pandas displays a DataFrame
pd.set_option('display.max_colwidth', None)
# Open a client against the MongoDB server running on this machine
url = "mongodb://localhost:27017/"
client1 = MongoClient(host=url)

# Handle on the database used by the rest of the notebook
db1 = client1["youtube_twitter_db1"]

# Ping the server to check the connection; a failure is printed, not raised
try:
    client1.admin.command("ping")
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as ping_error:
    print(ping_error)

# Reading the previously created Twitter collection

In [None]:
# Handle on the "tw_collection" collection of db1 (subscript form of db1.tw_collection)
tw_collection = db1["tw_collection"]

# Creating a new YouTube collection from the updated_merged CSV

In [3]:
# Handle on the collection that receives one document per CSV row
merged_yt_collection = db1.merged_yt_collection

path = "./updated_merged.csv"
# newline='' is what the csv module expects so that quoted fields containing
# line breaks are parsed correctly; the explicit encoding avoids depending on
# the platform default (assumes the file is UTF-8 -- TODO confirm).
with open(path, 'r', newline='', encoding='utf-8') as file:
    rows = list(csv.DictReader(file))

# One bulk insert instead of one round trip per row. insert_many rejects an
# empty list, so it is skipped when the CSV has no data rows.
# NOTE: re-running this cell inserts the same rows again.
if rows:
    merged_yt_collection.insert_many(rows)

# Creation of a field in the YouTube collection containing all the text and hashtags of the tweets referring to each video

In [122]:
def update_video_tweets(yt_collection, tw_collection):
    """Store on every video the concatenated text and hashtags of its tweets.

    For each document of ``yt_collection`` the ``tweet_ids`` field (a string
    holding a Python list literal, or an actual list) is resolved to a list of
    ids, the matching documents are looked up in ``tw_collection`` by
    ``tweetid``, and their ``text`` and ``hashtag`` values are joined into one
    string that is written back to the video document as ``tweet_text``.

    Args:
        yt_collection: collection of video documents; must provide ``find()``
            and ``update_one(filter, update)``. Every document needs an ``id``.
        tw_collection: collection of tweet documents; must provide
            ``find(filter, projection)``.

    Returns:
        None. The documents of ``yt_collection`` are updated in place.

    Raises:
        KeyError: if a video document has no ``id`` field.
        ValueError, SyntaxError: if a non-empty ``tweet_ids`` string is not a
            valid Python literal.
    """
    for i, video in enumerate(yt_collection.find()):
        # Lightweight progress indicator: print every 10th index
        if i % 10 == 0:
            print(i)

        video_id = video["id"]

        # A missing, None or blank tweet_ids means "no tweets";
        # ast.literal_eval("") would otherwise raise SyntaxError.
        raw_ids = video.get("tweet_ids")
        if isinstance(raw_ids, str):
            tweet_ids = ast.literal_eval(raw_ids) if raw_ids.strip() else []
        else:
            tweet_ids = list(raw_ids or [])

        # Retrieve the corresponding tweets; no query is needed without ids
        if tweet_ids:
            tweets = tw_collection.find(
                {"tweetid": {"$in": tweet_ids}},
                {"text": 1, "hashtag": 1},
            )
        else:
            tweets = []

        # Collect text, then hashtag, for each tweet. Either field may be
        # absent or None, which is treated as empty. Values of 4 characters or
        # fewer are skipped (presumably to drop empty "[]" hashtag lists).
        pieces = []
        for tweet in tweets:
            for field in ("text", "hashtag"):
                value = tweet.get(field) or ""
                if len(value) > 4:
                    pieces.append(value)

        # Same layout as before: one leading space, then " " + piece for each
        tweet_text = " " + "".join(" " + piece for piece in pieces)

        # Update the video document with the tweet_text field
        yt_collection.update_one(
            {"id": video_id},
            {"$set": {"tweet_text": tweet_text}}
        )


In [None]:
# Add the tweet_text field to every video document (videos first, tweets second)
update_video_tweets(merged_yt_collection, tw_collection)

In [5]:

# JSON encoder that knows how to serialise MongoDB ObjectId values
class JSONEncoder(json.JSONEncoder):
    """``json.JSONEncoder`` subclass that writes ``ObjectId`` values as strings."""

    def default(self, o):
        """Return ``str(o)`` for an ObjectId; defer to the base class otherwise."""
        if not isinstance(o, ObjectId):
            return super().default(o)
        return str(o)

# Pull every document of the collection into memory
all_documents = list(merged_yt_collection.find())

# Destination file for the JSON dump
output_file = "merged_yt_collection_text1.json"

# Serialise the documents; the custom encoder takes care of ObjectId values
with open(output_file, mode="w") as json_out:
    json.dump(obj=all_documents, fp=json_out, indent=4, cls=JSONEncoder)

print("Collection data saved as JSON.")

Collection data saved as JSON.


In [129]:
# Fetch and display a single document from merged_yt_collection
merged_yt_collection.find_one()

{'_id': ObjectId('646e576c1aeeb6496207f033'),
 'id': '2bFLr70bNzA',
 'moderationStatus': '0',
 'tweet_ids': "['1319671748170797062', '1319504474956845056', '1319431259194609664', '1319537482082492416', '1319452511527460864', '1319664735915159553', '1319807337243181062']",
 'tweet_text': "  Tony Bubulinski just made a public statement about the Biden family and someone told him if he goes public with this information it will bury all of them. Seems to be another item from Pastor Dana's dream(s) that is coming to fruition before our eyes.  https://t.co/zTQq4dBQQe RT @Wolf_Wolfman: Hunter Biden’s Ex Business Partner Tony Bobulinski Drops a Bombshell https://t.co/ucdwOMq2z3 via @YouTube My guess is tha… Biden cannot be trusted! Watch you back Mr. Bobulinski. #BidenCares #BidenCares @JoeBiden #Trump2020 @realDonaldTrump @FBI #CorruptJoeBiden https://t.co/vlx779ykJU ['BidenCares', 'BidenCares', 'Trump2020', 'CorruptJoeBiden'] BOOM!!!!!! 💥💥💥💥  Hunter Biden’s Ex Business Partner Tony Bobulinsk

# Saving the updated collection as a new CSV

In [128]:
# Load every document of the collection into a DataFrame
cursor = merged_yt_collection.find()
df = pd.DataFrame(list(cursor))

# Only these fields are exported (the MongoDB _id is left out)
columns = ['id', 'moderationStatus', 'tweet_ids', 'tweet_text']

# Write the selected columns to disk without the DataFrame index
df.to_csv(path_or_buf='merged_yt_collection_text.csv', index=False, columns=columns)