In [None]:
# Counting the total number of words across all tweets

import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv("../data/tweets.csv")

# Count whitespace-separated words per tweet and add them up.
# Missing tweets (NaN) are dropped first: str(NaN) is the literal "nan",
# which would otherwise be counted as a one-word tweet and inflate the total.
# Remaining values still go through str() so non-string cells are counted too.
total_word_count = sum(len(str(tweet).split()) for tweet in df["Tweet"].dropna())

print("Total number of words in all tweets:", total_word_count)


In [None]:
# Choosing random articles from a CSV file

import pandas as pd
import random

def save_random_articles(csv_file, output_csv, sample_size=40, random_state=42):
    """Write a random sample of rows from one CSV file to another.

    The input is read as UTF-8; if that fails with a UnicodeDecodeError the
    read is retried as cp1252. The output is always written as UTF-8 without
    the index column.

    Args:
        csv_file: Path of the CSV file to sample from.
        output_csv: Path of the CSV file to write the sampled rows to.
        sample_size: Number of rows to draw. Capped at the number of rows
            available in ``csv_file``.
        random_state: Seed forwarded to ``DataFrame.sample``. The default
            (42) keeps the selection reproducible between runs; pass a
            different value to draw a different sample.

    Returns:
        None. The function writes ``output_csv`` and prints a summary line.
    """
    try:
        df = pd.read_csv(csv_file, encoding='utf-8')
    except UnicodeDecodeError:
        print("UTF-8 decoding failed. Trying with cp1252 encoding...")
        df = pd.read_csv(csv_file, encoding='cp1252')

    # Never ask for more rows than the file contains (sampling is without
    # replacement, so a larger n would raise).
    sample_size = min(sample_size, len(df))

    sampled_df = df.sample(n=sample_size, random_state=random_state)

    sampled_df.to_csv(output_csv, index=False, encoding='utf-8')

    print(f"Saved {sample_size} random articles to '{output_csv}'")

# usage
# Source CSV, destination CSV, and the number of rows to draw.
csv_file = "../data/csv/final_dis.csv"
output_csv = "../data/csv/random_40_articles.csv"
sample_size = 40
save_random_articles(csv_file=csv_file, output_csv=output_csv, sample_size=sample_size)


In [5]:
# Dividing a CSV file into N roughly equal parts (the last part absorbs any remainder rows)

import pandas as pd
import sys
import os

def divide_csv(input_path, output_dir, num_batches):
    """Shuffle a tweets CSV and split it into ``num_batches`` CSV files.

    The rows are shuffled with a fixed seed (42), then cut into consecutive
    slices of ``len(df) // num_batches`` rows. The last slice also receives
    any remainder rows, so it can be larger than the others. Each slice is
    written to ``<output_dir>/tweets_<i>.csv`` with ``i`` starting at 1, and
    a summary line is printed per file.

    Args:
        input_path: Path of the UTF-8 encoded CSV file to split. Its columns
            must be exactly Username, Tweet, Date, Replies, Retweets, Likes,
            Bookmarks, Views, in that order.
        output_dir: Directory to write the batch files to. It is created if
            it does not exist yet.
        num_batches: Number of files to produce; must be at least 1.

    Raises:
        ValueError: If ``num_batches`` is smaller than 1.
        AssertionError: If the CSV columns do not match the expected list.
    """
    # Fail early with a clear message: 0 would otherwise surface as a
    # ZeroDivisionError and a negative value would silently write nothing.
    if num_batches < 1:
        raise ValueError(f"num_batches must be at least 1, got {num_batches}")

    # Step 1: Read the CSV file
    df = pd.read_csv(input_path, encoding='utf-8')

    # Step 2: Check that columns match expectations
    expected_columns = ['Username', 'Tweet', 'Date', 'Replies', 'Retweets', 'Likes', 'Bookmarks', 'Views']
    assert list(df.columns) == expected_columns, "Column names do not match expected format"

    # Step 3: Shuffle the dataframe (fixed seed keeps the split reproducible)
    df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Make sure the destination exists before writing. An empty output_dir
    # means "current directory" for os.path.join, so there is nothing to create.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Steps 4 and 5: Slice into N batches and save each one
    total_rows = len(df_shuffled)
    batch_size = total_rows // num_batches
    for i in range(num_batches):
        start_idx = i * batch_size
        # The final batch runs to the end so remainder rows are not dropped.
        end_idx = start_idx + batch_size if i != num_batches - 1 else total_rows
        batch = df_shuffled.iloc[start_idx:end_idx]

        filename = os.path.join(output_dir, f"tweets_{i + 1}.csv")
        batch.to_csv(filename, index=False)
        print(f"Saved {len(batch)} tweets to '{filename}'")

# Example usage
if __name__ == "__main__":
    # Batch count and locations for this run; the batch files are written
    # into the same folder that holds the source CSV.
    number_of_batches = 3
    output_folder = "../data/csv"
    input_file = "../data/csv/tweets.csv"

    divide_csv(
        input_path=input_file,
        output_dir=output_folder,
        num_batches=number_of_batches,
    )


Saved 1186 tweets to '../data/csv\tweets_1.csv'
Saved 1186 tweets to '../data/csv\tweets_2.csv'
Saved 1188 tweets to '../data/csv\tweets_3.csv'
