# X/Twitter Cleaning

In [9]:
import json
import csv
import os
import re

# Translation table applied in a single pass by clean_text().
# Space-like code points map to an ASCII space; zero-width code points map to
# None, which str.translate() treats as "delete this character".
_CLEAN_TEXT_TABLE = str.maketrans({
    # Unusual line terminators
    '\u2028': ' ',   # Line Separator
    '\u2029': ' ',   # Paragraph Separator
    # Unusual space characters
    '\u00A0': ' ',   # Non-breaking space
    '\u2000': ' ',   # En quad
    '\u2001': ' ',   # Em quad
    '\u2002': ' ',   # En space
    '\u2003': ' ',   # Em space
    '\u2004': ' ',   # Three-per-em space
    '\u2005': ' ',   # Four-per-em space
    '\u2006': ' ',   # Six-per-em space
    '\u2007': ' ',   # Figure space
    '\u2008': ' ',   # Punctuation space
    '\u2009': ' ',   # Thin space
    '\u200A': ' ',   # Hair space
    # Zero-width characters: not matched by \s, so they must be deleted explicitly
    '\u200B': None,  # Zero width space
    '\u200C': None,  # Zero width non-joiner
    '\u200D': None,  # Zero width joiner
    '\u2060': None,  # Word joiner
    '\uFEFF': None,  # Zero width no-break space / byte order mark
})


def clean_text(text):
    """Normalize whitespace in a string and drop invisible zero-width characters.

    Unusual line terminators and space characters are turned into plain
    spaces, zero-width characters (including a stray U+FEFF byte order mark)
    are removed, every run of whitespace is collapsed to a single space, and
    leading/trailing whitespace is stripped.

    Args:
        text: The value to clean. Only ``str`` instances are processed.

    Returns:
        The cleaned string, or ``text`` unchanged when it is not a ``str``
        (for example ``None`` or a float NaN).
    """
    if not isinstance(text, str):
        return text

    # One pass over the string instead of one .replace() scan per character
    text = text.translate(_CLEAN_TEXT_TABLE)

    # \s is Unicode-aware for str patterns, so this also collapses tabs,
    # newlines and any remaining Unicode whitespace into a single space
    return re.sub(r'\s+', ' ', text).strip()

In [10]:
import pandas as pd

# Load the tweet CSV into a DataFrame
tweets_csv_path = '../data/AAPL/AAPL_tweets_2020-10-01_2025-10-01.csv'
df = pd.read_csv(tweets_csv_path)

# Show the frame's dimensions and column names as a quick sanity check
print(f"Original dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")


Original dataset shape: (38553, 17)
Columns: ['symbol', 'tweet_id', 'text', 'created_at', 'user_id', 'username', 'user_name', 'retweet_count', 'like_count', 'reply_count', 'quote_count', 'view_count', 'is_reply', 'conversation_id', 'url', 'lang', 'search_date']


In [11]:
# Run clean_text over every entry of 'text' and store the result in a new
# 'text_cleaned' column, leaving the original 'text' column untouched for now
print("Cleaning text column...")
cleaned_text = df['text'].apply(clean_text)
df['text_cleaned'] = cleaned_text
print("Cleaning completed!")

Cleaning text column...
Cleaning completed!


In [12]:
# Columns retained in the exported file, in output order
columns_to_keep = ['created_at', 'text', 'like_count', 'retweet_count']

# Overwrite the raw text with its cleaned counterpart, then discard the
# temporary 'text_cleaned' column
df['text'] = df['text_cleaned']
df = df.drop(columns='text_cleaned')

# Narrow the frame to the retained columns
df = df[columns_to_keep]

# Write the cleaned and filtered frame to disk without the index column
output_path = '../data/AAPL/AAPL_tweets_clean.csv'
df.to_csv(output_path, index=False)

# Summarise what was written (size is bytes divided by 1024*1024)
print(f"Cleaned dataset saved to: {output_path}")
print(f"Final columns: {df.columns.tolist()}")
print(f"File size: {os.path.getsize(output_path) / (1024*1024):.2f} MB")


Cleaned dataset saved to: ../data/AAPL/AAPL_tweets_clean.csv
Final columns: ['created_at', 'text', 'like_count', 'retweet_count']
File size: 7.43 MB


# Stock Market CSV Cleaning

In [13]:
# Read the raw stock market CSV
market_df = pd.read_csv('../data/AAPL/AAPL_2020-10-17_2025-10-16.csv')

# Show what was loaded before any columns are dropped
print(f"Original market data shape: {market_df.shape}")
print(f"Original market data columns: {market_df.columns.tolist()}")

# Select the three columns of interest and lowercase their headers in one step
market_df = market_df[['Date', 'Open', 'Close']].rename(columns=str.lower)

# Write the reduced frame to disk without the index column
market_output_path = '../data/AAPL/AAPL_market_clean.csv'
market_df.to_csv(market_output_path, index=False)

# Summarise what was written (size is bytes divided by 1024*1024)
print(f"Cleaned market dataset saved to: {market_output_path}")
print(f"Final market data columns: {market_df.columns.tolist()}")
print(f"Market data file size: {os.path.getsize(market_output_path) / (1024*1024):.2f} MB")

Original market data shape: (1254, 6)
Original market data columns: ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']
Cleaned market dataset saved to: ../data/AAPL/AAPL_market_clean.csv
Final market data columns: ['date', 'open', 'close']
Market data file size: 0.06 MB
