In [1]:
import pandas as pd
import re
import emoji

In [2]:
# Location of the raw tweet export; adjust if the file lives elsewhere.
file_path = "financial_tweets_21_to_24.csv"
# Read the CSV as UTF-8 and report the raw frame's shape as a sanity check.
df = pd.read_csv(filepath_or_buffer=file_path, encoding="utf-8")
print("Original shape:", df.shape)

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [4]:
# Image, thumbnail, URL and tweet-type columns are removed before cleaning.
columns_to_drop = [
    'image_url',
    'proxy_image_url',
    'image_dimensions',
    'thumbnail_url',
    'proxy_thumbnail_url',
    'thumbnail_dimensions',
    'url',
    'tweet_type',
]
# `columns=` already targets the column axis, so no separate axis argument is needed.
df = df.drop(columns=columns_to_drop)

In [5]:
# Print column dtypes and non-null counts for the frame after the column drop.
df.info()

In [6]:
# Keep only rows that have both a timestamp and a description, parse the
# timestamp column to datetimes (unparseable values become NaT because of
# errors="coerce"), then discard the rows whose timestamp failed to parse.
df = (
    df.dropna(subset=["timestamp", "description"])
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"], errors="coerce"))
    .dropna(subset=["timestamp"])
)

In [7]:
# Display the frame after dropping incomplete rows and parsing timestamps.
df

In [8]:
# Normalise the tweet descriptions into a `clean_text` column.
def clean_text(text):
    """Return a normalised version of ``text``.

    The value is coerced to ``str`` and lower-cased, then, in order:
    anything starting with ``http`` up to the next whitespace is removed,
    emojis are removed, every character that is neither a word character
    nor whitespace is removed, and runs of whitespace are collapsed to a
    single space with leading/trailing whitespace trimmed.
    """
    lowered = str(text).lower()
    without_urls = re.sub(r"http\S+", "", lowered)
    without_emoji = emoji.replace_emoji(without_urls, replace='')
    # \w keeps letters, digits and underscores; everything else except whitespace goes.
    without_punct = re.sub(r"[^\w\s]", "", without_emoji)
    return re.sub(r"\s+", " ", without_punct).strip()

df["clean_text"] = df["description"].apply(clean_text)

In [9]:
# Display the frame with the new clean_text column.
df

In [10]:
# Discard rows whose cleaned text is empty, then keep only the first row for
# each distinct cleaned text.
df = (
    df.loc[lambda frame: frame["clean_text"].str.strip().ne("")]
    .drop_duplicates(subset=["clean_text"])
)

In [11]:
# Display the frame after removing empty and duplicate clean_text rows.
df

In [12]:
# Inspect the raw financial_info value of the row labelled 0.
df.loc[0, 'financial_info']

In [13]:
# Inspect the raw financial_info value of the row labelled 10.
df.loc[10, 'financial_info']

In [14]:
# Inspect the raw financial_info value of the row labelled 314731.
df.loc[314731, 'financial_info']

In [15]:
import ast
def parse_financial_info(val):
    """Parse a stringified Python literal and return it only if it is a list.

    Args:
        val: The raw cell value, expected to be the text of a Python list
            literal (e.g. ``"[{'k': 'v'}]"``). Any other value is tolerated.

    Returns:
        The parsed list when ``val`` evaluates to a list; otherwise ``[]``
        (the value could not be parsed, or it parsed to a non-list).
    """
    try:
        parsed = ast.literal_eval(val)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the failures ast.literal_eval is documented to raise on
        # malformed input. Catching only these (instead of a bare `except:`)
        # lets KeyboardInterrupt/SystemExit propagate, so a long-running
        # apply over the frame can still be interrupted.
        return []
    return parsed if isinstance(parsed, list) else []

# Parse every raw financial_info cell into a list, stored in a new column.
df = df.assign(parsed_financial_info=df["financial_info"].apply(parse_financial_info))

# Collapse each parsed list of dicts into one {key: joined values} mapping.
def flatten_row(dict_list):
    """Merge a list of dicts into a single dict of comma-joined strings.

    Non-dict items are skipped, as are values that are ``None`` or ``""``.
    The first value seen for a key is stored as ``str(value)``; each later
    value for the same key is appended after ``", "``. A falsy ``dict_list``
    yields an empty dict.
    """
    merged = {}
    for entry in dict_list or ():
        if not isinstance(entry, dict):
            continue
        for field, raw in entry.items():
            if raw is None or raw == "":
                continue
            if field not in merged:
                merged[field] = str(raw)
            else:
                merged[field] += f", {raw}"
    return merged

# Flatten every parsed list and expand the resulting dicts into columns.
# The frame is built once from the list of dicts rather than with
# `.apply(pd.Series)`, which constructs a separate Series per row and is very
# slow on large frames (and yields a Series, not a DataFrame, when there are
# no rows). Passing `index=df.index` keeps the new columns aligned with `df`
# for the concat below; keys missing from a row become NaN.
expanded_df = pd.DataFrame(
    df["parsed_financial_info"].apply(flatten_row).tolist(),
    index=df.index,
)

# Attach the expanded columns to the original frame, leaving out the
# intermediate parsed-list column.
df_cleaned = pd.concat([df.drop(columns=["parsed_financial_info"]), expanded_df], axis=1)

In [16]:
# Display the frame with the expanded financial_info columns attached.
df_cleaned

In [18]:
# DataFrame.drop returns a new frame, so the result must be assigned back;
# otherwise the raw `financial_info` column is still present when the frame
# is exported. errors="ignore" keeps this cell safe to re-run after the
# column has already been removed.
df_cleaned = df_cleaned.drop(columns='financial_info', errors="ignore")
df_cleaned

In [19]:
# Write the cleaned frame to disk without an index column; because the index
# is not written, it does not need to be reset before exporting.
df_cleaned.to_csv('twitter_data_cleaned.csv', index=False)