In [2]:
# import pandas as pd
# import json

# # -------------------------------------------------------------------------
# # Function to Convert JSON File to DataFrame
# # -------------------------------------------------------------------------
# def json_to_dataframe(json_file):
#     """
#     Reads a JSON file containing tweet data and converts it into a DataFrame.

#     Parameters:
#     json_file (str): Path to the JSON file.

#     Returns:
#     pd.DataFrame: DataFrame containing the tweet data.
#     """
#     with open(json_file, "r", encoding="utf-8") as f:
#         data = json.load(f)

#     # Normalize JSON into a DataFrame
#     df = pd.json_normalize(data)
#     return df

# # -------------------------------------------------------------------------
# # Process JSON Files for Each Politician
# # -------------------------------------------------------------------------
# def process_politicians_to_parquet(politicians, input_folder, output_folder):
#     """
#     Processes JSON files for each politician, converts them to DataFrames,
#     and saves them as Parquet files.

#     Parameters:
#     politicians (list): List of politician usernames.
#     input_folder (str): Path to the folder containing JSON files.
#     output_folder (str): Path to the folder to save Parquet files.
#     """
#     for politician in politicians:
#         json_file = f"{input_folder}/{politician}_tweets.json"
#         parquet_file = f"{output_folder}/{politician}_tweets.parquet"

#         try:
#             # Convert JSON to DataFrame
#             df = json_to_dataframe(json_file)

#             # Save DataFrame as Parquet
#             df.to_parquet(parquet_file, index=False)
#             print(f"Saved Parquet file for {politician}: {parquet_file}")

#         except FileNotFoundError:
#             print(f"File not found: {json_file}")
#         except json.JSONDecodeError:
#             print(f"Error decoding JSON for {politician}. Please check the file content.")

# # Example usage
# politicians = ["donaldtusk", "elonmusk", "SlawomirMentzen"]  # Replace with your list of politicians
# input_folder = "tweets_data"  # Folder containing JSON files
# output_folder = "parquet_data"  # Folder to save Parquet files

# process_politicians_to_parquet(politicians, input_folder, output_folder)


Saved Parquet file for donaldtusk: parquet_data/donaldtusk_tweets.parquet
Saved Parquet file for elonmusk: parquet_data/elonmusk_tweets.parquet
Saved Parquet file for SlawomirMentzen: parquet_data/SlawomirMentzen_tweets.parquet


In [7]:
import pandas as pd
import json
import os

# -------------------------------------------------------------------------
# Function to Convert JSON File to DataFrame
# -------------------------------------------------------------------------
def json_to_dataframe(json_file):
    """
    Reads a JSON file containing tweet data and converts it into a DataFrame.

    Parameters:
    json_file (str): Path to the JSON file.

    Returns:
    pd.DataFrame: DataFrame containing the tweet data.
    """
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Normalize JSON into a DataFrame
    df = pd.json_normalize(data)
    return df

# -------------------------------------------------------------------------
# Process JSON Files for Each Politician
# -------------------------------------------------------------------------
def process_politicians_to_parquet(input_folder, output_folder):
    """
    Processes JSON files for each politician, extracts usernames from filenames,
    converts them to DataFrames, and saves them as Parquet files.

    Parameters:
    input_folder (str): Path to the folder containing JSON files.
    output_folder (str): Path to the folder to save Parquet files.
    """
    # List all JSON files in the input folder
    json_files = [f for f in os.listdir(input_folder) if f.endswith("_tweets.json")]

    for json_file in json_files:
        # Extract politician's username from the filename
        politician = json_file.split("_tweets.json")[0]

        # Construct file paths
        input_file_path = os.path.join(input_folder, json_file)
        parquet_file = os.path.join(output_folder, f"{politician}_tweets.parquet")

        try:
            # Convert JSON to DataFrame
            df = json_to_dataframe(input_file_path)

            # Add a column for the politician's username
            df["username"] = politician

            # Save DataFrame as Parquet
            df.to_parquet(parquet_file, index=False)
            print(f"Saved Parquet file for {politician}: {parquet_file}")

        except FileNotFoundError:
            print(f"File not found: {input_file_path}")
        except json.JSONDecodeError:
            print(f"Error decoding JSON for {politician}. Please check the file content.")

# -------------------------------------------------------------------------
# Example Usage
# -------------------------------------------------------------------------
input_folder = "tweets_data"  # Folder containing JSON files
output_folder = "parquet_data"  # Folder to save Parquet files

process_politicians_to_parquet(input_folder, output_folder)


Saved Parquet file for donaldtusk: parquet_data\donaldtusk_tweets.parquet
Saved Parquet file for elonmusk: parquet_data\elonmusk_tweets.parquet
Saved Parquet file for SlawomirMentzen: parquet_data\SlawomirMentzen_tweets.parquet


In [8]:
import os
import pandas as pd

# -------------------------------------------------------------------------
# Function to Merge Parquet Files
# -------------------------------------------------------------------------
def merge_parquet_files(input_folder, output_file):
    """
    Merges multiple Parquet files into a single Parquet file.

    Parameters:
    input_folder (str): Path to the folder containing Parquet files.
    output_file (str): Path to save the merged Parquet file.
    """
    all_parquet_files = [
        os.path.join(input_folder, file) for file in os.listdir(input_folder) if file.endswith(".parquet")
    ]

    if not all_parquet_files:
        print("No Parquet files found to merge.")
        return

    # Read and concatenate all Parquet files
    dfs = [pd.read_parquet(file) for file in all_parquet_files]
    merged_df = pd.concat(dfs, ignore_index=True)

    # Save the merged DataFrame to a single Parquet file
    merged_df.to_parquet(output_file, index=False)
    print(f"Merged Parquet file saved to: {output_file}")

# Example usage
input_folder = "parquet_data"  # Folder containing Parquet files
output_file = "parquet_data/merged/merged_tweets.parquet"  # Path to save the merged Parquet file

merge_parquet_files(input_folder, output_file)


Merged Parquet file saved to: parquet_data/merged/merged_tweets.parquet


In [1]:
import pandas as pd

# -------------------------------------------------------------------------
# Function to Read Merged Parquet File
# -------------------------------------------------------------------------
def read_merged_parquet(parquet_file):
    """
    Reads a merged Parquet file into a Pandas DataFrame.

    Parameters:
    parquet_file (str): Path to the Parquet file.

    Returns:
    pd.DataFrame: DataFrame containing the merged data.
    """
    try:
        df = pd.read_parquet(parquet_file)
        print(f"Successfully loaded data from {parquet_file}")
        return df
    except FileNotFoundError:
        print(f"File not found: {parquet_file}")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error reading Parquet file: {e}")
        return pd.DataFrame()

# Example usage
merged_parquet_file = "parquet_data\merged\merged_tweets.parquet"  # Path to the merged Parquet file
df = read_merged_parquet(merged_parquet_file)

# Display the first few rows of the DataFrame
df.head()


  merged_parquet_file = "parquet_data\merged\merged_tweets.parquet"  # Path to the merged Parquet file


Successfully loaded data from parquet_data\merged\merged_tweets.parquet


Unnamed: 0,id,author_id,edit_history_tweet_ids,created_at,text,possibly_sensitive,context_annotations,reply_settings,conversation_id,lang,...,username,referenced_tweets,entities.urls,entities.annotations,attachments.media_keys,in_reply_to_user_id,entities.mentions,entities.hashtags,attachments.media_source_tweet_id,attachments.poll_ids
0,1874094682952880144,375146901,[1874094682952880144],2024-12-31T14:06:06.000Z,TO ❤️ BĘDZIE 🇵🇱 POLSKI 💥 ROK! Życzę Wam wiary ...,False,[{'domain': {'description': 'Named people in t...,everyone,1874094682952880144,pl,...,donaldtusk,,,,,,,,,
1,1873793926634885589,375146901,[1873793926634885589],2024-12-30T18:11:00.000Z,„Pieniędzy nie ma i nie będzie”. Na moje oko t...,False,[{'domain': {'description': 'Named people in t...,everyone,1873793926634885589,pl,...,donaldtusk,,,,,,,,,
2,1874221010939207978,44196397,[1874221010939207978],2024-12-31T22:28:05.000Z,😂 https://t.co/SQZKxNxpwV,False,[{'domain': {'description': 'Categories within...,everyone,1874221010939207978,art,...,elonmusk,"[{'id': '1874214472128032877', 'type': 'quoted'}]","[{'description': None, 'display_url': 'x.com/t...",,,,,,,
3,1874199431643619474,44196397,[1874199431643619474],2024-12-31T21:02:20.000Z,Brothers in Arms https://t.co/vIZ8ADrXbo,False,[{'domain': {'description': 'Categories within...,everyone,1874199431643619474,en,...,elonmusk,,"[{'description': None, 'display_url': 'pic.x.c...","[{'end': 15, 'normalized_text': 'Brothers in A...",[3_1874199428686356480],,,,,
4,1874086747178414133,3242182113,[1874086747178414133],2024-12-31T13:34:34.000Z,Rok 2024 kończę w znacznie lepszym nastroju ni...,False,,everyone,1874086747178414133,pl,...,SlawomirMentzen,,"[{'description': None, 'display_url': 'x.com/i...",,,,,,,


In [None]:
df