In [10]:
from pathlib import Path
import json
import re
import pandas as pd
import numpy as np

BASE_DIR = Path("tweet") 
FILE_EXT = ".txt"
RAW_DIR = BASE_DIR / "raw"

In [25]:
all_dfs = []

for company_dir in RAW_DIR.iterdir():
    if not company_dir.is_dir():
        continue

    ticker = company_dir.name.upper()

    # Load ALL files inside the company folder
    files = sorted(f for f in company_dir.iterdir() if f.is_file())

    for f in files:
        records = []
        with f.open("r", encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                    records.append(obj)
                except json.JSONDecodeError:
                    continue

        if not records:
            continue

        df = pd.json_normalize(records, sep=".")
        df["folder_ticker"] = ticker
        df["source_file"] = f.name
        all_dfs.append(df)

tweets_raw = pd.concat(all_dfs, ignore_index=True)
tweets_raw.columns = tweets_raw.columns.str.lower().str.strip()

tweets_raw.shape, tweets_raw.head()

  tweets_raw = pd.concat(all_dfs, ignore_index=True)


((119844, 319),
                        created_at                  id              id_str  \
 0  Fri Jan 03 11:01:06 +0000 2014  419060848288464897  419060848288464897   
 1  Thu Jan 02 23:38:26 +0000 2014  418889049609625600  418889049609625600   
 2  Sun Jan 05 20:12:05 +0000 2014  419924283498831872  419924283498831872   
 3  Thu Jan 09 05:17:01 +0000 2014  421148584059604992  421148584059604992   
 4  Fri Jan 10 00:14:17 +0000 2014  421434786599546880  421434786599546880   
 
                                                 text  \
 0  Weekly Dow #Stocks Trend $DIS $WMT $HD $GS $V ...   
 1  $VZ - A New Year means time for new Dogs of th...   
 2  $VZ The S&amp;Ps Worst Sectors in 2013 http://...   
 3  $VZ - Why T-Mobile Bought Verizons Spectrum -&...   
 4  #VIDEO #AccumulationDistribution in #EXCEL htt...   
 
                                               source  truncated  \
 0  <a href="http://12stocks.com" rel="nofollow">d...      False   
 1  <a href="http://yahoo.com" rel

In [12]:
def extract_symbols(symbol_list):
    if not isinstance(symbol_list, list):
        return []
    out = []
    for s in symbol_list:
        t = s.get("text", "").upper()
        if t:
            out.append(t)
    return out

tweets_raw["symbols"] = tweets_raw["entities.symbols"].apply(extract_symbols)

  tweets_raw["symbols"] = tweets_raw["entities.symbols"].apply(extract_symbols)


In [13]:
tweets_long = tweets_raw.explode("symbols").rename(columns={"symbols": "ticker"})
tweets_long = tweets_long[~tweets_long["ticker"].isna()]
tweets_long.shape

(398104, 320)

In [14]:
def choose_base_text(row):
    rt = row.get("retweeted_status.text")
    if isinstance(rt, str) and rt.strip():
        return rt
    return row.get("text", "")

tweets_long["base_text"] = tweets_long.apply(choose_base_text, axis=1)

In [15]:
tweets_long["created_at"] = pd.to_datetime(tweets_long["created_at"], errors="coerce")
tweets_long = tweets_long.dropna(subset=["created_at"])
tweets_long["date"] = tweets_long["created_at"].dt.date

  tweets_long["created_at"] = pd.to_datetime(tweets_long["created_at"], errors="coerce")


In [16]:
tweets_long["is_retweet"] = np.where(
    tweets_long["retweeted_status.id"].notna(), 1, 0
)

In [17]:
def clean_text(s):
    if not isinstance(s, str):
        return ""
    s = s.lower()
    s = re.sub(r"http\S+", "", s)
    s = re.sub(r"@\w+", "", s)
    s = re.sub(r"\$\w+", "", s)
    s = s.replace("#", "")
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()

tweets_long["clean_text"] = tweets_long["base_text"].apply(clean_text)

In [18]:
tweets_long[["ticker", "date", "clean_text"]].head()

Unnamed: 0,ticker,date,clean_text
0,DIS,2014-01-03,weekly dow stocks trend
0,WMT,2014-01-03,weekly dow stocks trend
0,HD,2014-01-03,weekly dow stocks trend
0,GS,2014-01-03,weekly dow stocks trend
0,V,2014-01-03,weekly dow stocks trend


In [None]:
tweets_long.shape
tweets_long.head()

In [21]:
keep_cols = [
    "ticker",
    "date",
    "created_at",
    "clean_text",
    "base_text",
    "is_retweet",
    "retweet_count",
    "favorite_count",
    "user.followers_count"
]

keep_cols = [c for c in keep_cols if c in tweets_long.columns]

tweets_clean = tweets_long[keep_cols].copy()
tweets_clean.shape, tweets_clean.head()

((398104, 9),
   ticker        date                created_at               clean_text  \
 0    DIS  2014-01-03 2014-01-03 11:01:06+00:00  weekly dow stocks trend   
 0    WMT  2014-01-03 2014-01-03 11:01:06+00:00  weekly dow stocks trend   
 0     HD  2014-01-03 2014-01-03 11:01:06+00:00  weekly dow stocks trend   
 0     GS  2014-01-03 2014-01-03 11:01:06+00:00  weekly dow stocks trend   
 0      V  2014-01-03 2014-01-03 11:01:06+00:00  weekly dow stocks trend   
 
                                            base_text  is_retweet  \
 0  Weekly Dow #Stocks Trend $DIS $WMT $HD $GS $V ...           0   
 0  Weekly Dow #Stocks Trend $DIS $WMT $HD $GS $V ...           0   
 0  Weekly Dow #Stocks Trend $DIS $WMT $HD $GS $V ...           0   
 0  Weekly Dow #Stocks Trend $DIS $WMT $HD $GS $V ...           0   
 0  Weekly Dow #Stocks Trend $DIS $WMT $HD $GS $V ...           0   
 
    retweet_count  favorite_count  user.followers_count  
 0              0               0                   54

In [24]:
import numpy as np
import re

# start from the cleaned tweets dataframe
df = tweets_clean.copy()

# -------------------------
# 1. ACCOUNT / ENGAGEMENT FEATURES
# -------------------------
df["followers"] = df["user.followers_count"].fillna(0)
df["retweets"]  = df["retweet_count"].fillna(0)
df["likes"]     = df["favorite_count"].fillna(0)

# -------------------------
# 2. EVENT / NEWS KEYWORDS
# -------------------------
EVENT_WORDS = [
    "earnings", "eps", "guidance", "forecast",
    "revenue", "downgrade", "upgrade",
    "merger", "acquisition", "acquire",
    "lawsuit", "sec", "fined", "regulation",
    "bankruptcy", "chapter 11",
    "report", "announcement",
    "dividend", "split",
    "plunge", "surge", "breakout", "crash"
]
event_regex = re.compile("|".join(EVENT_WORDS), re.IGNORECASE)

df["has_event_word"] = df["clean_text"].apply(
    lambda x: bool(event_regex.search(x))
)

# -------------------------
# 3. REMOVE NOISE / SPAM
# -------------------------
noise_regex = re.compile(r"(penny|alert|follow|giveaway|\$\$\$)", re.IGNORECASE)
df["is_noise"] = df["clean_text"].apply(lambda x: bool(noise_regex.search(x)))

# keep only non-noise
df = df[df["is_noise"] == False]

# drop very short tweets
df = df[df["clean_text"].str.len() >= 15]

# drop retweets (keep originals only)
df = df[df["is_retweet"] == 0]

# -------------------------
# 4. NOW build the account mask (AFTER filtering)
# -------------------------
account_mask = (
    (df["followers"] >= 1000) |
    (df["retweets"] >= 50)   |
    (df["likes"]    >= 500)
)

# -------------------------
# 5. FINAL HIGH-IMPACT SELECTION
# -------------------------
high_impact = df[account_mask | df["has_event_word"]].copy()

print("Original tweets:", len(tweets_clean))
print("After basic noise filters:", len(df))
print("High-impact tweets:", len(high_impact))
print("Retention rate:", round(len(high_impact) / len(tweets_clean), 4))

high_impact.head()

Original tweets: 398104
After basic noise filters: 249021
High-impact tweets: 81803
Retention rate: 0.2055


Unnamed: 0,ticker,date,created_at,clean_text,base_text,is_retweet,retweet_count,favorite_count,user.followers_count,followers,retweets,likes,has_event_word,is_noise
2,VZ,2014-01-05,2014-01-05 20:12:05+00:00,the s amp ps worst sectors in 2013,$VZ The S&amp;Ps Worst Sectors in 2013 http://...,0,0,0,9,9,0,0,True,False
11,VZ,2014-01-15,2014-01-15 09:28:46+00:00,ahah had to search ticker as well after i saw ...,@maoxian ahah had to search ticker as well aft...,0,0,0,6069,6069,0,0,False,False
11,T,2014-01-15,2014-01-15 09:28:46+00:00,ahah had to search ticker as well after i saw ...,@maoxian ahah had to search ticker as well aft...,0,0,0,6069,6069,0,0,False,False
17,VZ,2014-01-21,2014-01-21 17:04:22+00:00,sorry t mobile verizon is still the mightiest ...,"$VZ - Sorry T-Mobile, Verizon Is Still the Mig...",0,0,0,1461,1461,0,0,False,False
20,IBM,2014-01-21,2014-01-21 15:24:20+00:00,hoy reportaran al cierre de mercado entre otra...,"Hoy reportaran al cierre de mercado $IBM, $TXN...",0,0,0,1073,1073,0,0,True,False


In [26]:
tweets_clean.to_pickle("tweets_clean.pkl")
tweets_clean.to_csv("tweets_clean.csv", index=False)

In [27]:
high_impact.to_pickle("tweets_high_impact.pkl")
high_impact.to_csv("tweets_high_impact.csv", index=False)