In [51]:
import pandas as pd
from pathlib import Path

In [52]:
EXTERNAL_NEWS_DATA = Path("./data/external/labeled/snn_news_messages_combined.csv")
EXTERNAL_PUMP_DATA = Path("./data/external/labeled/snn_pump_messages_combined.csv")

INTERNAL_DATA = Path("./data/internal/labeled/labeled_messages_combined.csv")
SYNTHETIC_DATA = Path("./data/internal/synthetic/synthetic_messages_combined.csv")

TARGET_DIR = Path("./data/internal/training_data")

In [53]:
df_a = pd.read_csv(EXTERNAL_NEWS_DATA)
df_b = pd.read_csv(EXTERNAL_PUMP_DATA)
df_c = pd.read_csv(INTERNAL_DATA)
df_d = pd.read_csv(SYNTHETIC_DATA)

In [54]:
df_a['source'] = 'snn'
df_b['source'] = 'snn'
df_c['source'] = 'telegram'
df_d['source'] = 'synth'

In [55]:
df_real = pd.concat([df_a, df_b, df_c], axis=0)

In [56]:
noisy_examples = df_d[df_d.label == 5]

grouped = df_d[df_d.label != 5].groupby('label')
synth_reduced = grouped.apply(lambda x: x.sample(n=300)).reset_index(drop=True)

  synth_reduced = grouped.apply(lambda x: x.sample(n=300)).reset_index(drop=True)


In [57]:
df_final = pd.concat([df_real, noisy_examples, synth_reduced], axis=0)
display(df_final[df_final.duplicated(subset=['message'])])
df_final = df_final.drop_duplicates(subset=['channel_id', 'message'], keep='first')
df_final

Unnamed: 0,label,id,date,channel_id,message,source
8,5,1497.0,2024-04-05 16:40:18+00:00,1.066638e+09,Waiting ... 🕢 🕗,telegram
29,5,2167.0,2024-07-21 17:05:27+00:00,1.066638e+09,THAT'S AMAZING 🍿,telegram
30,5,2166.0,2024-07-21 17:03:50+00:00,1.066638e+09,SELL NOW 🔻 🔺,telegram
31,5,2164.0,2024-07-21 17:02:14+00:00,1.066638e+09,LET'S GOO ...,telegram
33,1,2162.0,2024-07-21 16:59:01+00:00,1.066638e+09,1 Minute Left 🕦,telegram
...,...,...,...,...,...,...
1192,4,,,,The pump is not happening today. We’ve run int...,synth
1194,4,,,,Pump delayed! There are some factors beyond ou...,synth
1197,4,,,,"Due to some unexpected challenges, today's pum...",synth
1198,4,,,,Important update: today's pump is delayed due ...,synth


Unnamed: 0,label,id,date,channel_id,message,source
0,5,461.0,2019-11-27 07:05:08+00:00,1.412636e+09,Binance_CEX MANA / BTC Take-Profit target 1 ✅ ...,snn
1,5,719.0,2021-04-04 04:55:51+00:00,1.268025e+09,SKL / USDT Long Signal ⬆ ️ Long / Buy ⏺ Entry ...,snn
2,5,6543.0,2018-08-30 11:17:54+00:00,1.392778e+09,LINK / BTC ( BINANCE_CEX ) BUY - 4200 SELL - 4...,snn
3,5,7729.0,2021-10-05 19:12:57+00:00,1.243434e+09,💰 💰 These are all great Live Binance_CEX Futur...,snn
4,5,14844.0,2021-11-12 09:21:50+00:00,1.381991e+09,🚀 🚀 20 % Profit on DGB for our Premium Members...,snn
...,...,...,...,...,...,...
1187,4,,,,"⏳ 💥 Due to some unexpected challenges, today's...",synth
1188,4,,,,Pump delayed! There are some factors beyond ou...,synth
1193,4,,,,🚨 The pump is not happening today. We’ve run i...,synth
1195,4,,,,⚠️ Sorry to inform you that the pump will not ...,synth


In [58]:
df_final.to_csv(TARGET_DIR / 'training_data_no_duplicates_per_channel.csv', index=False)

In [59]:
df_final.value_counts('label')

label
5    2767
1    1617
0     818
2     754
3     654
4     481
Name: count, dtype: int64

In [41]:
processed_files = Path("./data/internal/processed")
labeled_files = Path("./data/internal/labeled")

In [62]:
import pickle

dir = Path('data/external/raw/pkl')
for file in [file for file in dir.iterdir() if str(file).endswith('.pkl')]:
    with open(file, 'rb') as f:
        x = pickle.load(f)

    message = []
    id = []
    date = []
    channel_id = []
    for v in x:
        try:
            message.append(v['message'])
            id.append(v['id'])
            date.append(v['date'])
            channel_id.append(v['peer_id']['channel_id'])
        except Exception as e:
            continue

    df = pd.DataFrame({'id': id, 'date': date, 'channel_id': channel_id, 'message': message})
    df.to_csv(Path('data/external/raw') / f"{file.name.removesuffix('pkl')}.csv", index=False)

In [59]:
df

Unnamed: 0,id,date,channel_id,message
0,4048,2022-01-23 12:33:43+00:00,1512933614,Crypto me kabhi bhi kuch bhi ho sakta wo sabhi...
1,4047,2022-01-23 12:33:16+00:00,1512933614,@cryptokingszz 👈 dekho
2,4046,2022-01-23 12:33:10+00:00,1512933614,@cryptokingszz 👈 dekho
3,4045,2022-01-23 12:32:45+00:00,1512933614,1 mahina to bilkul nhi jaega abhi filhaal
4,4044,2022-01-23 12:32:12+00:00,1512933614,42 to 45 and next target 50 to 52
...,...,...,...,...
1718,112,2021-11-10 16:28:25+00:00,1512933614,Buy AXS ..it will pump soon🚀
1719,109,2021-11-10 14:56:18+00:00,1512933614,@HP_9849 welcome
1720,82,2021-11-10 12:54:44+00:00,1512933614,No
1721,80,2021-11-10 12:52:23+00:00,1512933614,Any coin to buy now?


In [42]:
def load_data(dir: Path):
    """
    Reads all .csv files from the specified folder into a single pandas DataFrame.

    Args:
        folder_path (str or Path): The path to the folder containing .csv files.

    Returns:
        pd.DataFrame: A DataFrame containing the concatenated contents of all .csv files.
    """
    all_files = [file for file in dir.iterdir() if str(file).endswith('.csv')]
    dataframes = [pd.read_csv(file) for file in all_files]
    combined_df = pd.concat(dataframes, ignore_index=True)

    return combined_df

In [45]:
df_p = load_data(processed_files)
df_l = load_data(labeled_files)

In [37]:
df_p = load_data(processed_files)


In [38]:
df_p = df_p.drop_duplicates(subset=['channel_id', 'id'], keep='first')
df_p = df_p.drop_duplicates(subset=['message'], keep='first')
df_p['label'] = 5

Unnamed: 0,id,date,channel_id,message,label
776414,12792,2019-06-18 17:49:55+00:00,1141221379,EDO for premiums ! ❤ ️,5
60762,2215,2019-01-18 07:17:38+00:00,1381990652,Buy OAX around 2325 ( Binance_CEX ) Sell Targe...,5
285379,329,2020-09-27 21:15:09+00:00,1276304238,LINK USDT @USER,5
181230,5233,2021-01-06 23:22:43+00:00,1285164948,UNKNOWN_URL: trdr,5
320708,2953,2022-01-15 07:11:33+00:00,1512933614,What does it mean ?,5
...,...,...,...,...,...
409175,4797,2021-03-30 23:28:14+00:00,1080057698,"Huobi , Binance_CEX ARPA / BTC All entry targe...",5
1126,991,2021-10-22 14:07:20+00:00,1173711569,Ripple CEO Accuses SEC of Preferential Treatme...,5
43610,8250,2021-09-04 08:32:02+00:00,1126647644,There's a nice bull pennant on $ COS,5
73686,404,2018-02-07 14:40:51+00:00,1354862145,"🌋 Total Market Cap : 🌋 $ 394,126 , 471,034 . 0...",5


In [39]:
df_sample = df_p.sample(n=2000)
df_sample

Unnamed: 0,id,date,channel_id,message,label
432151,461,2019-11-27 07:05:08+00:00,1412636305,Binance_CEX MANA / BTC Take-Profit target 1 ✅ ...,5
400774,719,2021-04-04 04:55:51+00:00,1268024695,SKL / USDT Long Signal ⬆ ️ Long / Buy ⏺ Entry ...,5
523000,6543,2018-08-30 11:17:54+00:00,1392777632,LINK / BTC ( BINANCE_CEX ) BUY - 4200 SELL - 4...,5
13220,7729,2021-10-05 19:12:57+00:00,1243433527,💰 💰 These are all great Live Binance_CEX Futur...,5
50061,14844,2021-11-12 09:21:50+00:00,1381990652,🚀 🚀 20 % Profit on DGB for our Premium Members...,5
...,...,...,...,...,...
88007,6651,2020-12-27 04:03:18+00:00,1457621071,┌ 1INCH / USDT ✳ ️ Buying Volume ├ 500.25 K ₮ ...,5
717127,1460,2021-02-05 09:10:48+00:00,1191140932,Stoploss Education : Manual vs Automatic ( imp...,5
188255,1973,2021-07-23 07:17:01+00:00,1497767689,CTK breakout on 4h + 1d . Possible short term ...,5
446631,3086,2018-12-07 18:10:18+00:00,1312345502,DNT Hit Our Lower Buy Zone Below 300 Satoshi N...,5


In [40]:
df_sample = df_sample[['label', 'id', 'date', 'channel_id', 'message']]
df_sample.to_csv('data/external/labeled/snn_news_messages_combined.csv', index=False)

In [None]:
df_sample

In [11]:
df_l = load_data(labeled_files)

In [20]:
df_p

Unnamed: 0,id,date,channel_id,message
0,769,2022-01-21 15:09:02+00:00,1249499797,"📣 Hello everyone , we are aware that our commu..."
1,768,2022-01-21 15:02:35+00:00,1249499797,"ACCORDING TO A THAI GOVERNING PARTY MP , CRYPT..."
2,767,2022-01-21 15:02:17+00:00,1249499797,BITCOIN PLUMMETS TO SIX-MONTH LOWS AT $ 38K . ...
3,766,2022-01-20 17:20:03+00:00,1249499797,"ACCORDING TO A SURVEY , 67 PERCENT OF CANADIAN..."
4,765,2022-01-20 17:18:42+00:00,1249499797,SINGAPORE CRYPTO ATMS HAVE BEEN SHUT DOWN FOLL...
...,...,...,...,...
261563,11,2018-07-17 17:42:07+00:00,1365959417,Qkc going up .. wait and hold
261564,9,2018-07-17 11:56:09+00:00,1365959417,QKC will toch its all target soon
261565,5,2018-07-17 11:51:29+00:00,1365959417,🌐 QKC 🌐 Buy under 1200 🎯 Target 1 : 1300 🎯 Tar...
261566,4,2018-07-17 08:14:24+00:00,1365959417,Welcome to all !


In [21]:
df_l[df_l.label.isna()]

Unnamed: 0,label,id,date,channel_id,message


In [44]:
df_l = df_l.drop_duplicates(subset=['channel_id', 'id'], keep='first')
df_l = df_l.drop_duplicates(subset=['message'], keep='first')
df_l['label'] = df_l['label'].astype(int)
df_l

Unnamed: 0,label,id,date,channel_id,message,views,image
0,5,1507,2024-04-05 17:06:27+00:00,1066638082,THAT'S AMAZING 🍿,64533.0,
1,5,1506,2024-04-05 17:04:57+00:00,1066638082,SELL NOW 🔻 🔺,61012.0,
2,5,1504,2024-04-05 17:02:49+00:00,1066638082,LET'S GOO ...,62428.0,
3,2,1503,2024-04-05 17:00:47+00:00,1066638082,COIN IS : $ STRIKE,64447.0,
4,1,1502,2024-04-05 16:59:01+00:00,1066638082,1 Minute Left 🕦,63993.0,
...,...,...,...,...,...,...,...
6302,5,166,2023-08-24 18:20:27+00:00,1984936323,❌ VIP GROUP IS NOW CLOSED ❌ We've now reached ...,6668.0,
6304,0,164,2023-08-23 09:34:30+00:00,1984936323,‼ ️ PUMP ANNOUNCEMENT ‼ ️ Here's the details :...,7490.0,
6306,5,158,2023-08-20 20:11:31+00:00,1984936323,OK everyone we are opening more VIP slots ! Th...,7342.0,
6407,5,2,2024-08-02 12:42:19+00:00,2233767182,LINK TO TELEGRAM GROUP: t Join our main 2mln s...,10632.0,


In [15]:
df_l.value_counts('label')

label
5    768
2    384
0    287
1    265
4    157
3    145
Name: count, dtype: int64

In [16]:
df_l.to_csv(labeled_files / 'labeled_pump_messages.csv', index=False)

In [46]:
df_l = df_l.drop_duplicates(subset=['channel_id', 'id'], keep='first')
df_p = df_p.drop_duplicates(subset=['channel_id', 'id'], keep='first')
df_merged = df_l.rename(columns={'message': 'message_old', 'date': 'date_old'}).merge(df_p[['channel_id', 'id', 'message', 'date']], on=['channel_id', 'id'], how='left')

In [47]:
df_merged

Unnamed: 0,label,id,date_old,channel_id,message_old,views,image,message,date
0,5.0,1507,2024-04-05 17:06:27+00:00,1066638082,THAT'S AMAZING 🍿,64533.0,,THAT'S AMAZING 🍿,2024-04-05 17:06:27+00:00
1,5.0,1506,2024-04-05 17:04:57+00:00,1066638082,SELL NOW 🔻 🔺,61012.0,,SELL NOW 🔻 🔺,2024-04-05 17:04:57+00:00
2,5.0,1504,2024-04-05 17:02:49+00:00,1066638082,LET'S GOO ...,62428.0,,LET'S GOO ...,2024-04-05 17:02:49+00:00
3,2.0,1503,2024-04-05 17:00:47+00:00,1066638082,COIN IS : $ STRIKE,64447.0,,COIN IS : $ STRIKE,2024-04-05 17:00:47+00:00
4,1.0,1502,2024-04-05 16:59:01+00:00,1066638082,1 Minute Left 🕦,63993.0,,1 Minute Left 🕦,2024-04-05 16:59:01+00:00
...,...,...,...,...,...,...,...,...,...
4016,0.0,164,2023-08-23 09:34:30+00:00,1984936323,‼ ️ PUMP ANNOUNCEMENT ‼ ️ Here's the details :...,7490.0,,‼ ️ PUMP ANNOUNCEMENT ‼ ️ Here's the details :...,2023-08-23 09:34:30+00:00
4017,5.0,163,2023-08-23 06:28:09+00:00,1984936323,"We're announcing a new pump today , and it's g...",6823.0,,"We're announcing a new pump today , and it's g...",2023-08-23 06:28:09+00:00
4018,5.0,158,2023-08-20 20:11:31+00:00,1984936323,OK everyone we are opening more VIP slots ! Th...,7342.0,,OK everyone we are opening more VIP slots ! Th...,2023-08-20 20:11:31+00:00
4019,5.0,2,2024-08-02 12:42:19+00:00,2233767182,LINK TO TELEGRAM GROUP: t Join our main 2mln s...,10632.0,,LINK TO TELEGRAM GROUP: Trading_Signals8 Join ...,2024-08-02 12:42:19+00:00


In [48]:
df_l = df_l.drop_duplicates(subset=['channel_id', 'id'], keep='first')
df_p = df_p.drop_duplicates(subset=['channel_id', 'id'], keep='first')
df_merged = df_l.rename(columns={'message': 'message_old', 'date': 'date_old', 'image': 'image_old', 'views': 'views_old'}).merge(df_p[['channel_id', 'id', 'message', 'date', 'views', 'image']], on=['channel_id', 'id'], how='left')

In [49]:
df_merged

Unnamed: 0,label,id,date_old,channel_id,message_old,views_old,image_old,message,date,views,image
0,5.0,1507,2024-04-05 17:06:27+00:00,1066638082,THAT'S AMAZING 🍿,64533.0,,THAT'S AMAZING 🍿,2024-04-05 17:06:27+00:00,64533.0,
1,5.0,1506,2024-04-05 17:04:57+00:00,1066638082,SELL NOW 🔻 🔺,61012.0,,SELL NOW 🔻 🔺,2024-04-05 17:04:57+00:00,61012.0,
2,5.0,1504,2024-04-05 17:02:49+00:00,1066638082,LET'S GOO ...,62428.0,,LET'S GOO ...,2024-04-05 17:02:49+00:00,62428.0,
3,2.0,1503,2024-04-05 17:00:47+00:00,1066638082,COIN IS : $ STRIKE,64447.0,,COIN IS : $ STRIKE,2024-04-05 17:00:47+00:00,64447.0,
4,1.0,1502,2024-04-05 16:59:01+00:00,1066638082,1 Minute Left 🕦,63993.0,,1 Minute Left 🕦,2024-04-05 16:59:01+00:00,63993.0,
...,...,...,...,...,...,...,...,...,...,...,...
4016,0.0,164,2023-08-23 09:34:30+00:00,1984936323,‼ ️ PUMP ANNOUNCEMENT ‼ ️ Here's the details :...,7490.0,,‼ ️ PUMP ANNOUNCEMENT ‼ ️ Here's the details :...,2023-08-23 09:34:30+00:00,7490.0,
4017,5.0,163,2023-08-23 06:28:09+00:00,1984936323,"We're announcing a new pump today , and it's g...",6823.0,,"We're announcing a new pump today , and it's g...",2023-08-23 06:28:09+00:00,6823.0,
4018,5.0,158,2023-08-20 20:11:31+00:00,1984936323,OK everyone we are opening more VIP slots ! Th...,7342.0,,OK everyone we are opening more VIP slots ! Th...,2023-08-20 20:11:31+00:00,7342.0,
4019,5.0,2,2024-08-02 12:42:19+00:00,2233767182,LINK TO TELEGRAM GROUP: t Join our main 2mln s...,10632.0,,LINK TO TELEGRAM GROUP: Trading_Signals8 Join ...,2024-08-02 12:42:19+00:00,10632.0,


In [50]:
df_merged['label'] = df_merged['label'].astype(int)
df_merged = df_merged[['label', 'id', 'date', 'channel_id', 'message']]

In [51]:
df_merged

Unnamed: 0,label,id,date,channel_id,message
0,5,1507,2024-04-05 17:06:27+00:00,1066638082,THAT'S AMAZING 🍿
1,5,1506,2024-04-05 17:04:57+00:00,1066638082,SELL NOW 🔻 🔺
2,5,1504,2024-04-05 17:02:49+00:00,1066638082,LET'S GOO ...
3,2,1503,2024-04-05 17:00:47+00:00,1066638082,COIN IS : $ STRIKE
4,1,1502,2024-04-05 16:59:01+00:00,1066638082,1 Minute Left 🕦
...,...,...,...,...,...
4016,0,164,2023-08-23 09:34:30+00:00,1984936323,‼ ️ PUMP ANNOUNCEMENT ‼ ️ Here's the details :...
4017,5,163,2023-08-23 06:28:09+00:00,1984936323,"We're announcing a new pump today , and it's g..."
4018,5,158,2023-08-20 20:11:31+00:00,1984936323,OK everyone we are opening more VIP slots ! Th...
4019,5,2,2024-08-02 12:42:19+00:00,2233767182,LINK TO TELEGRAM GROUP: Trading_Signals8 Join ...


In [52]:
df_merged.to_csv("./data/internal/labeled/labeled_messages_combined.csv", index=False)