# Database connection

In [24]:
from pymongo import MongoClient
import json
import re
import csv
import pandas as pd
import json
from bson import ObjectId

# Connection settings: a MongoDB server on localhost, default port
url = "mongodb://localhost:27017/"
client1 = MongoClient(url)

# Database that holds the collections used in the rest of the notebook
db1 = client1["youtube_twitter_db1"]

# Ping the server to confirm the connection works; a failure is printed
# rather than raised, so the cell itself always completes.
try:
    client1.admin.command("ping")
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as exc:
    print(exc)

Pinged your deployment. You successfully connected to MongoDB!


# Twitter collection creation

In [None]:
# Build tw_collection from the tweets CSV: one document per row whose
# "urls_list" column can be parsed, with an extra "expanded_url" field that
# holds the list of expanded URLs extracted from it.
tw_collection = db1.tw_collection
rows_without_urls = 0

# newline='' is what the csv module asks for so that quoted fields containing
# line breaks are read correctly. The encoding is stated explicitly instead of
# depending on the platform default.
# NOTE(review): assumes df_youtube.csv is UTF-8 encoded - confirm.
with open('./df_youtube.csv', 'r', newline='', encoding='utf-8') as file:
    csv_data = csv.DictReader(file)
    for i, row in enumerate(csv_data):
        raw_urls = row["urls_list"]

        # DictReader yields '' (not None) for an empty cell, so test for
        # truthiness; otherwise every URL-less row would reach json.loads and
        # be reported as a parse error.
        if raw_urls:
            try:
                # The column is parsed as JSON after swapping single quotes
                # for double quotes.
                urls_list = json.loads(raw_urls.replace("'", '"'))
                expanded_urls = [item.get("expanded_url") for item in urls_list]
            except (json.JSONDecodeError, TypeError, AttributeError):
                # AttributeError: the value parsed, but an element is not a
                # dict. Report the row and keep going instead of aborting the
                # import halfway through.
                print(f"Error line {i} Url_list {row['urls_list']}")
            else:
                # New field containing the list of expanded URLs
                row["expanded_url"] = expanded_urls
                tw_collection.insert_one(row)
        else:
            rows_without_urls += 1

        # Progress indicator every 10000 rows
        if (i % 10000) == 0:
            print(i)

print(f"Rows skipped because urls_list was empty: {rows_without_urls}")

# Youtube collection creation

In [55]:
# Load every row of y_train.csv into yt_collection (one document per row).
yt_collection = db1.yt_collection

# Relative path instead of a machine-specific absolute one, matching the
# './y_train.csv' that this notebook reads with pandas further down.
# NOTE(review): assumes the notebook is run from the directory that contains
# y_train.csv - confirm.
# newline='' is what the csv module asks for; utf-8 is the encoding pandas
# uses by default when it reads the same file.
with open('./y_train.csv', 'r', newline='', encoding='utf-8') as file:
    csv_data = csv.DictReader(file)
    yt_rows = list(csv_data)

# A single bulk insert replaces one round trip per row. insert_many rejects an
# empty list, so it is only called when the CSV actually had rows.
if yt_rows:
    yt_collection.insert_many(yt_rows)

# Associate each YouTube video with the list of tweet ids that reference it

In [58]:
# Collect all id values from yt_collection
id_values = [doc['id'] for doc in yt_collection.find({}, {'id': 1})]

# Number of ids combined into a single regex query
batch_size = 1000

# Split id_values into batches
id_batches = [id_values[i:i+batch_size] for i in range(0, len(id_values), batch_size)]

# Mapping id -> set of tweet ids whose expanded URLs contain that id
id_tweet_ids = {id_val: set() for id_val in id_values}

# Iterate over each batch and collect the matching tweet_ids
for batch_index, id_batch in enumerate(id_batches):
    # One alternation of all (escaped) ids in the batch
    pattern = "|".join(map(re.escape, id_batch))

    # Find documents in tw_collection where at least one element of
    # expanded_url contains any of the id values in the batch
    results = tw_collection.find(
        {"expanded_url": {"$elemMatch": {"$regex": pattern}}},
        {"tweetid": 1, "expanded_url": 1}
    )

    # The query only says that *some* id of the batch matched, so check each
    # URL against each id to attribute the tweet to the right id(s).
    for doc in results:
        tweet_id = doc["tweetid"]
        for expanded_url in doc["expanded_url"]:
            # A document is returned as soon as one element matches, so the
            # list may still contain non-string entries (e.g. None when an
            # entry had no "expanded_url"); `id_val in None` would raise
            # TypeError, so skip them.
            if not isinstance(expanded_url, str):
                continue
            for id_val in id_batch:
                if id_val in expanded_url:
                    id_tweet_ids[id_val].add(tweet_id)

    # Print progress
    print(f"Processed batch {batch_index+1}/{len(id_batches)}")

Processed batch 1/18
Processed batch 2/18
Processed batch 3/18
Processed batch 4/18
Processed batch 5/18
Processed batch 6/18
Processed batch 7/18
Processed batch 8/18
Processed batch 9/18
Processed batch 10/18
Processed batch 11/18
Processed batch 12/18
Processed batch 13/18
Processed batch 14/18
Processed batch 15/18
Processed batch 16/18
Processed batch 17/18
Processed batch 18/18


In [59]:
# Store the collected tweet ids on the yt_collection document with the same id.
# Sets are converted to lists before being written.
for id_val, tweet_ids in id_tweet_ids.items():
    selector = {"id": id_val}
    change = {"$set": {"tweet_ids": list(tweet_ids)}}
    yt_collection.update_one(selector, change)

# Signal that every id has been processed
print("Matching tweet_ids updated in yt_collection.")

Matching tweet_ids updated in yt_collection.


In [61]:
# Same mapping as id_tweet_ids, but with each value turned into a list so that
# it can be serialised by json.
id_tweet_ids_list = dict(zip(id_tweet_ids, map(list, id_tweet_ids.values())))

# Persist the mapping next to the notebook
with open("id_tweet_ids.json", "w") as json_out:
    json.dump(id_tweet_ids_list, json_out)

print("id_tweet_ids saved to file.")

id_tweet_ids saved to file.


# Creation of the new y_train CSV with the updated list of tweet ids 

In [66]:
path_labels = "./y_train.csv"
youtube_df = pd.read_csv(path_labels)

# Integer encoding of the moderation label
status_codes = {"not moderated": 0, "moderated": 1}

# Derive the output frame in one chain:
#   1. add "tweet_ids" by looking each id up in id_tweet_ids_list
#   2. replace the textual moderationStatus with its integer code
#   3. drop the "lista_url" column
df = (
    youtube_df
    .assign(tweet_ids=lambda frame: frame["id"].map(id_tweet_ids_list))
    .assign(moderationStatus=lambda frame: frame["moderationStatus"].replace(status_codes))
    .drop(columns="lista_url")
)

# Write the result without the DataFrame index
df.to_csv("updated_merged.csv", index=False)

print("Updated dataset saved to file.")

Updated dataset saved to file.


In [99]:
# Preview the first five rows of the updated frame
df.head(n=5)

Unnamed: 0,id,moderationStatus,tweet_ids
0,2bFLr70bNzA,0,"['1319671748170797062', '1319504474956845056', '1319431259194609664', '1319537482082492416', '1319452511527460864', '1319664735915159553', '1319807337243181062']"
1,-gWzKGaj6Ss,1,"['1322068734652125184', '1321990250500104192', '1321989314784436225', '1322170572856528896', '1322013423954239489', '1322129221469249536', '1322156265087393793', '1322127027181334528', '1322055781940944897', '1322056591294816256', '1321981266367950849', '1322063743258398723', '1321972961310629889']"
2,BHnJp0oyOxs,0,"['1320793071206977537', '1320824145827950592', '1319972970299838464', '1320772899691745280', '1322224750631018497', '1320770467632852993', '1320804289808371716', '1320755036180213761', '1320848677938909185', '1318469485985148928', '1317074899287105540', '1320806383818809349', '1320960304872787974', '1320795016613273603', '1315132275034652673', '1320747645850292226', '1320789711158272000', '1315131864093515776']"
3,q77rJcVbDAI,0,"['1318484961788387328', '1318473684793831425', '1318437517016788992']"
4,RGhwHbp66P4,0,"['1321324715949002752', '1318460466595758080', '1317760528555692040', '1313146199453642755', '1311457744550064130']"


# Saving the YouTube collection as a JSON file

In [74]:
class JSONEncoder(json.JSONEncoder):
    """JSON encoder that serialises ObjectId values as their string form.

    Any other object the base encoder cannot handle is passed on to
    json.JSONEncoder.default.
    """

    def default(self, o):
        """Return str(o) for an ObjectId, otherwise defer to the base class."""
        return str(o) if isinstance(o, ObjectId) else super().default(o)

# Materialise every document of yt_collection in memory
all_documents = list(yt_collection.find({}))

# Destination of the export
output_file = "collection_data.json"

# JSONEncoder takes care of the ObjectId values in the documents
with open(output_file, "w") as json_out:
    json.dump(all_documents, json_out, cls=JSONEncoder)

print("Collection data saved as JSON.")


Collection data saved as JSON.
