# Importing libraries and database connection

In [1]:
import csv
import json
import re
import sys
from decimal import Decimal

import numpy as np
import pandas as pd
from bson import ObjectId
from pymongo import MongoClient

# Raise the csv module's per-field size cap as high as the platform allows, so
# very long fields do not abort parsing with "field larger than field limit".
# The limit is stored in a C long: where that type is only 32 bits wide,
# sys.maxsize does not fit and csv.field_size_limit raises OverflowError, so
# fall back to the largest 32-bit signed value.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2**31 - 1)

# Show full column contents when displaying DataFrames instead of truncating them
pd.set_option("display.max_colwidth", None)

# Open a client against the MongoDB server running on localhost
url = "mongodb://localhost:27017/"
client = MongoClient(url)

# Select the database that the following cells read from and write to
db = client["youtube_twitter_filtered"]

# Connectivity check: ping the server and report the outcome either way.
# A failure is only printed, not re-raised, so execution continues past it.
try:
    client.admin.command("ping")
except Exception as ping_error:
    print(ping_error)
else:
    print("Pinged your deployment. You successfully connected to MongoDB!")

Pinged your deployment. You successfully connected to MongoDB!


# Creating a collection that maps each YouTube video ID to its list of tweet IDs

In [4]:
yt_collection = db.yt_collection

# Load the merged CSV into the collection, one document per CSV row.
# csv.DictReader performs no type conversion, so every value is stored as the
# string read from the file (`tweet_ids` stays a single string, not an array).
# NOTE: re-running this cell inserts the same rows again.
path = "./updated_merged.csv"

# newline="" leaves line-ending handling to the csv module, as its
# documentation requires; without it, newlines embedded inside quoted fields
# are not interpreted correctly.
with open(path, "r", newline="") as file:
    csv_data = csv.DictReader(file)

    for row in csv_data:
        yt_collection.insert_one(row)


In [4]:
# Fetch a single document to check what the inserted records look like
yt_collection.find_one()

{'_id': ObjectId('6470c440ab1a9c0a9da6d1a6'),
 'id': '2bFLr70bNzA',
 'moderationStatus': '0',
 'tweet_ids': "['1319671748170797062', '1319504474956845056', '1319431259194609664', '1319537482082492416', '1319452511527460864', '1319664735915159553', '1319807337243181062']"}

# Creating the filtered Twitter collection

In [None]:
twitter_collection = db.twitter_collection


def _parse_numeric_id(raw_value):
    """Convert the text of a numeric ID (e.g. "1.3e+18" or "123.0") to an int.

    Parsing goes through Decimal rather than a 64-bit float: a float keeps only
    about 15-17 significant digits, so an ID with more digits would be silently
    rounded to a neighbouring integer. Decimal reads the text exactly as written.
    """
    return int(Decimal(raw_value))


# Columns holding the ID of a referenced tweet; parsed to int when populated
reference_id_fields = ("rt_tweetid", "qtd_tweetid", "reply_statusid")

# Read the CSV and insert one document per row.
# NOTE: re-running this cell inserts the same rows again.
path = "./tweets_filtrati.csv"

# newline="" leaves line-ending handling to the csv module, as its
# documentation requires; without it, newlines embedded inside quoted fields
# are not interpreted correctly.
with open(path, "r", newline="") as file:
    csv_data = csv.DictReader(file)
    for row_number, row in enumerate(csv_data):
        # Progress indicator: number of rows handled so far
        if row_number % 10000 == 0:
            print(row_number)

        # Only user IDs written in exponent notation are converted to int;
        # any other value is stored exactly as read from the file.
        user_id = row["userid"]
        if "e" in str(user_id):
            row["userid"] = _parse_numeric_id(user_id)

        # Values of three characters or fewer are left untouched
        # (presumably empty or "nan" placeholders - confirm against the data).
        for field in reference_id_fields:
            if len(row[field]) > 3:
                row[field] = _parse_numeric_id(row[field])

        twitter_collection.insert_one(row)

# User collection creation

In [None]:
user_collection = db.user_collection
distinct_user_ids = set()  # user IDs already handled during this run
i = 0  # number of tweets scanned so far

# Fields copied from a tweet document into its author's user document
user_fields = (
    "userid",
    "description",
    "verified",
    "friends_count",
    "listed_count",
    "statuses_count",
    "followers_count",
    "favourites_count",
    "date_first_tweet",
    "account_creation_date",
)

for tweet in twitter_collection.find():
    # Print a progress marker every 10000 tweets
    if not i % 10000:
        print(i)
    i += 1

    userid = tweet["userid"]
    # Only the first tweet seen for each user produces a user document
    if userid in distinct_user_ids:
        continue
    distinct_user_ids.add(userid)

    user = {field: tweet[field] for field in user_fields}
    user_collection.insert_one(user)

# Saving the three collections as JSON files

In [6]:
# Collect every tweet document, replacing its `_id` with the string form
# so the record can be serialised
collection_data = []
for document in twitter_collection.find():
    document["_id"] = str(document["_id"])
    collection_data.append(document)

# Write all documents to a single indented JSON file; default=str
# stringifies any remaining value the json module cannot encode itself
json_path = "twitter_collection.json"
with open(json_path, "w") as output_file:
    json.dump(collection_data, output_file, indent=4, default=str)

In [None]:
# Collect every user document, replacing its `_id` with the string form
# so the record can be serialised
collection_data = []
for document in user_collection.find():
    document["_id"] = str(document["_id"])
    collection_data.append(document)

# Write all documents to a single indented JSON file; default=str
# stringifies any remaining value the json module cannot encode itself
json_path = "user_collection.json"
with open(json_path, "w") as output_file:
    json.dump(collection_data, output_file, indent=4, default=str)

In [None]:
# Collect every YouTube document, replacing its `_id` with the string form
# so the record can be serialised
collection_data = []
for document in yt_collection.find():
    document["_id"] = str(document["_id"])
    collection_data.append(document)

# Write all documents to a single indented JSON file; default=str
# stringifies any remaining value the json module cannot encode itself
json_path = "youtube_collection.json"
with open(json_path, "w") as output_file:
    json.dump(collection_data, output_file, indent=4, default=str)