In [1]:
import pandas as pd
from pyspark.sql.functions import input_file_name, regexp_extract, explode, desc, size
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, StructType, LongType, IntegerType

In [2]:
file_path = 'telegram_2024/urls_per_message'

In [3]:
df = spark.read.parquet(file_path)

                                                                                

In [4]:
df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+--------------------+
|                 url|         occurrences|
+--------------------+--------------------+
|HTTPS://VIGILANTN...|[{7966, channel_1...|
|Https://Freedomfo...|[{298301, channel...|
|Https://Twitter.c...|[{233321, channel...|
|Https://blackcz.best|[{565478, channel...|
|Https://fusu.me/i...|[{160001, channel...|
|  Https://ilycat.com|[{294245, channel...|
|Https://onlinelib...|[{39328, channel_...|
|Https://twitter.c...|[{137097, channel...|
|Https://www.Ahega...|[{277461, channel...|
| Https://www.Kek.com|[{302271, channel...|
|  Https://x.com/Beat|[{305701, channel...|
|Https://x.com/fra...|[{585690, channel...|
|       WWW.88PRO.BET|[{251805, channel...|
|     WWW.AMAZON.COM,|[{98311, channel_...|
|      WWW.CARROT.COM|[{705361, channel...|
|WWW.OBITDATA.COM/...|[{1315, channel_1...|
|WWW.OBITDATA.COM/...|[{1313, channel_1...|
|   WWW.TRIMPANCE.COM|[{711839, channel...|
|       WWW.VANCE.COM|[{711230, channel...|
|Www.KnowYourRight...|[{319063, 

                                                                                

In [5]:
df_occurrences_count = df.withColumn("occurrences_count", size("occurrences"))

## 1. Ocurrences

In [6]:
def show_more_occurrences(df):
    print("As 20 URLs com maior número de ocorrências:")
    df \
    .select("url", "occurrences_count") \
    .orderBy(desc("occurrences_count")) \
    .show(20, truncate=False)

In [7]:
show_more_occurrences(df_occurrences_count)

As 20 URLs com maior número de ocorrências:




+------------------------------------------------------------------+-----------------+
|url                                                               |occurrences_count|
+------------------------------------------------------------------+-----------------+
|https://236779.com/#/register?ic=344726                           |5022816          |
|https://trontop.club/#/register?ic=990284                         |2687340          |
|https://www.facebook.com/share/bKvVhdfEndsKtx6c/?mibextid=A7sQZp  |2078453          |
|https://www.facebook.com/share/g/7unKemtXNRMnKvoE/?mibextid=A7sQZp|1877365          |
|https://www.instagram.com/referrealponzi?igsh=MWhmbDNob2M4c2dkcQ==|1817363          |
|https://bit.ly/4cJ4t3k                                            |1781001          |
|https://tronera.art/index.html#/register/555431                   |1595784          |
|https://antblockchain.vip/#/reg?ic=117174                         |1472252          |
|https://tronsvip.com/#/register?ic=838208 

                                                                                

In [8]:
df_occurrences_count.select("url","occurrences","occurrences_count").createOrReplaceTempView('view_links')

## 2. Social Media links

### Twitter

In [13]:
x_links = spark.sql("""
    SELECT *
    FROM view_links
    WHERE url LIKE '%.x.%' OR url LIKE '%.twitter.%'
""")

In [14]:
show_more_occurrences(x_links)

As 20 URLs com maior número de ocorrências:




+---------------------------------------------+-----------------+
|url                                          |occurrences_count|
+---------------------------------------------+-----------------+
|https://www.x.com/catwolfonsol               |21440            |
|www.x.com/nuestraamericaz                    |4236             |
|https://www.twitter.com/BasedMoonCoin        |3972             |
|https://www.twitter.com/GoyDescontrola2      |2339             |
|www.x.com/EnviroTrackPlus                    |1903             |
|http://www.twitter.com/GilletteCatSol        |1681             |
|www.twitter.com/GilletteCatSol               |1681             |
|www.twitter.com/SquidOnEth                   |1495             |
|https://www.x.com/stingonsol                 |1197             |
|https://www.x.com/bowie_musk                 |1148             |
|www.x.com/verein_wir                         |697              |
|https://www.twitter.com/SupereumOnSol        |562              |
|https://w

                                                                                

In [11]:
x_links_count = spark.sql("""
    SELECT COUNT(*) AS count_links
    FROM view_links
    WHERE url LIKE '%.x.%' OR url LIKE '%.twitter.%'
""").show()



+-----------+
|count_links|
+-----------+
|       4324|
+-----------+



                                                                                

In [15]:
from pyspark.sql.functions import regexp_extract

perfis = x_links.withColumn(
    "perfil",
    regexp_extract("url", r"\.com/([^/?]+)", 1)
)

perfis.show()

[Stage 8:>                                                          (0 + 1) / 1]

+--------------------+--------------------+-----------------+---------------+
|                 url|         occurrences|occurrences_count|         perfil|
+--------------------+--------------------+-----------------+---------------+
|Www.twitter.com/f...|[{483739, channel...|                1|      fckmensol|
|http://www.twitte...|[{528298, channel...|                7|       newshour|
|https://mobile.tw...|[{91615, channel_...|                4|      YahooNews|
|https://mobile.tw...|[{418890, channel...|                8|       iol_chat|
|https://www.twitt...|[{211592, channel...|                2|    dogagentsol|
|https://www.twitt...|[{95554, channel_...|                1|           katy|
|https://www.twitt...|[{393264, channel...|                1|   olympiacosfc|
|https://www.twitt...|[{379166, channel...|                1|    pepegotrekt|
|https://www.twitt...|[{86362, channel_...|                1|  pikachuonsol_|
|https://www.twitt...|[{572466, channel...|                2|   

                                                                                

In [23]:
perfis.select("perfil","occurrences_count").createOrReplaceTempView("view_perfil")
perfil_activty = spark.sql("""
    SELECT perfil, SUM(occurrences_count) AS total
    FROM view_perfil
    GROUP BY perfil
    ORDER BY total DESC
""")
perfil_activty.show()

                                                                                

+---------------+-----+
|         perfil|total|
+---------------+-----+
|   catwolfonsol|21440|
|nuestraamericaz| 4241|
|  BasedMoonCoin| 3972|
| GilletteCatSol| 3362|
|GoyDescontrola2| 2339|
|EnviroTrackPlus| 1903|
|     SquidOnEth| 1495|
|     stingonsol| 1197|
|     bowie_musk| 1148|
|               |  887|
|     verein_wir|  697|
|  SupereumOnSol|  562|
|  TrumpsFight26|  554|
|   Bonk1776bnwo|  408|
|    clippy_coin|  339|
|     rocko_meme|  292|
|      EvinToken|  276|
|             en|  255|
|nuestraamericaZ|  170|
|  FrontWolf2020|  129|
+---------------+-----+
only showing top 20 rows



### Instagram

In [9]:
instagram_links = spark.sql("""
    SELECT *
    FROM view_links
    WHERE url LIKE '%.instagram.%'
""")

In [10]:
show_more_occurrences(instagram_links)

As 20 URLs com maior número de ocorrências:




+--------------------------------------------------------------------------+-----------------+
|url                                                                       |occurrences_count|
+--------------------------------------------------------------------------+-----------------+
|https://www.instagram.com/referrealponzi?igsh=MWhmbDNob2M4c2dkcQ==        |1817363          |
|https://www.instagram.com/aya4.54444?igsh=endjZDRwaWp5bjNh                |480820           |
|https://www.instagram.com/cryptotradingsites                              |231791           |
|https://www.instagram.com/tpusdt                                          |110988           |
|https://www.instagram.com/ahmed199815111?igsh=YzljYTk1ODg3Zg==‌‏          |82776            |
|https://www.instagram.com/ahmed199815111?igsh=YzljYTk1ODg3Zg              |81750            |
|https://www.instagram.com/deloreanlabs                                    |74818            |
|https://www.instagram.com/highprofit2024?igsh=NzB

                                                                                

In [11]:
instagram_links_counts = spark.sql("""
    SELECT COUNT(*) as count_links
    FROM view_links
    WHERE url LIKE '%.instagram.%'
""").show()



+-----------+
|count_links|
+-----------+
|     287578|
+-----------+



                                                                                

In [13]:
from pyspark.sql.functions import regexp_extract

perfis = instagram_links.withColumn(
    "perfil",
    regexp_extract("url", r"\.com/([^/?]+)", 1)
)

perfis.show()

[Stage 8:>                                                          (0 + 1) / 1]

+--------------------+--------------------+-----------------+-------------------+
|                 url|         occurrences|occurrences_count|             perfil|
+--------------------+--------------------+-----------------+-------------------+
|http://www.instag...|[{140662, channel...|                1|TheAdrenalineDealer|
|https://l.faceboo...|[{293671, channel...|                1|              l.php|
|https://l.faceboo...|[{294090, channel...|                1|              l.php|
|https://www.insta...|[{337260, channel...|                2|          134shirts|
|https://www.insta...|[{2662, channel_2...|                1|          13newsnow|
|https://www.insta...|[{22251, channel_...|                1|            716111k|
|https://www.insta...|[{5646, channel_2...|                3| RCOFinanceofficial|
|https://www.insta...|[{299867, channel...|                2|        __adi__0_7_|
|https://www.insta...|[{15558, channel_...|                1|          __seul777|
|https://www.ins

                                                                                

In [13]:
perfis.select("perfil","occurrences_count").createOrReplaceTempView("view_perfil")
perfil_activty = spark.sql("""
    SELECT perfil, SUM(occurrences_count) AS total
    FROM view_perfil
    GROUP BY perfil
    ORDER BY total DESC
""")
perfil_activty.show()



+-------------------+-------+
|             perfil|  total|
+-------------------+-------+
|                  p|1943976|
|     referrealponzi|1817363|
|         aya4.54444| 485697|
|     ahmed199815111| 254996|
| cryptotradingsites| 231791|
|               reel| 115406|
|             tpusdt| 110988|
|       deloreanlabs|  74818|
|     highprofit2024|  49484|
|         guard_gems|  45916|
|        lattesolana|  39237|
|     usdt_advetiser|  35075|
|          elboss318|  32147|
|          crypt0_c0|  28284|
|   team_vendetta199|  28235|
|            arbcapx|  24786|
|      arsen.bravado|  19139|
|          richy.cto|  18680|
|super_doge_official|  18630|
|theofficialdopameme|  17604|
+-------------------+-------+
only showing top 20 rows



                                                                                

### Youtube

In [24]:
youtube_links = spark.sql("""
    SELECT *
    FROM view_links
    WHERE url LIKE '%.youtube.%'
""")

In [25]:
show_more_occurrences(youtube_links)

As 20 URLs com maior número de ocorrências:




+-------------------------------------------------------------+-----------------+
|url                                                          |occurrences_count|
+-------------------------------------------------------------+-----------------+
|https://www.youtube.com/channel/UCgla8DhgElOOjWnjdmKSDRA     |69356            |
|https://www.youtube.com/watch?v=e0Ws9DGoc6I                  |11370            |
|https://www.youtube.com/watch?v=RyQmife0bfA&t=7s             |9978             |
|https://www.youtube.com/watch?v=lWv9Cjhgs6U                  |7169             |
|www.youtube.com/c/Lexcentoundici                             |5546             |
|https://www.youtube.com/watch?v=169Xaec3--w                  |4591             |
|https://www.youtube.com/watch?v=DmnlNoD6FIk&t=38s            |4124             |
|https://www.youtube.com/watch?v=SkjC_Uj_akI&t=21             |3801             |
|https://www.youtube.com/watch?v=4PztzgggXAI                  |3609             |
|https://www.you

                                                                                

In [29]:
youtube_links.select("url","occurrences_count").createOrReplaceTempView('view_yt')
spark.sql("""
    SELECT *
    FROM view_yt
    WHERE url LIKE '%channel%'
    ORDER BY occurrences_count DESC
""").show(truncate = False)



+-----------------------------------------------------------------+-----------------+
|url                                                              |occurrences_count|
+-----------------------------------------------------------------+-----------------+
|https://www.youtube.com/channel/UCgla8DhgElOOjWnjdmKSDRA         |69356            |
|https://www.youtube.com/channel/UC5l18oylJ8o7ihugk4F-3nw/join    |2603             |
|https://www.youtube.com/channel/UC5l18oylJ8o7ihugk4F-3nw         |2302             |
|https://m.youtube.com/channel/UCUqNGw9Rc0si0WSPoJQdOQg           |2256             |
|https://www.youtube.com/channel/UCi0TjvTDys40EG2Qj9lXpnw         |2164             |
|https://www.youtube.com/channel/UCR0yfkqti22gsf06VsjD4FQ         |1710             |
|https://www.youtube.com/channel/UC5-FYgXJmuU36P2wIxL7doA         |1669             |
|https://www.youtube.com/channel/UCVs56bIj6dlk-OD1LZZoqyA         |1380             |
|https://www.youtube.com/channel/UCy3hm3yyJ_K6wqu44kZb

                                                                                

In [33]:
youtube_links_count = spark.sql("""
    SELECT COUNT(*) as count_links
    FROM view_links
    WHERE url LIKE '%.youtube.%'
""").show()



+--------+
|count(1)|
+--------+
|  564393|
+--------+



                                                                                

### Facebook

In [14]:
facebook_links = spark.sql("""
    SELECT *
    FROM view_links
    WHERE url LIKE '%.facebook.%'
""")

In [15]:
show_more_occurrences(facebook_links)

As 20 URLs com maior número de ocorrências:




+------------------------------------------------------------------------+-----------------+
|url                                                                     |occurrences_count|
+------------------------------------------------------------------------+-----------------+
|https://www.facebook.com/share/bKvVhdfEndsKtx6c/?mibextid=A7sQZp        |2078453          |
|https://www.facebook.com/share/g/7unKemtXNRMnKvoE/?mibextid=A7sQZp      |1877365          |
|https://www.facebook.com/groups/free4alll/?ref=share&mibextid=NSMWBT    |310686           |
|https://www.facebook.com/groups/796913931881126/?ref=share              |235478           |
|https://www.facebook.com/share/T2YbdadPZXnuihUD/?mibextid=A7sQZp        |123257           |
|https://www.facebook.com/share/g/DjmYFzB1RYUjm9Md/?mibextid=K35XfP      |120639           |
|https://www.facebook.com/groups/545361580615128                         |69356            |
|https://www.facebook.com/share/g/sGy4WuDzzqUfLupK/                   

                                                                                

In [16]:
facebook_links_count = spark.sql("""
    SELECT COUNT(*) as count_links
    FROM view_links
    WHERE url LIKE '%.facebook.%'
""").show()



+-----------+
|count_links|
+-----------+
|     102421|
+-----------+



                                                                                

### Tiktok

In [28]:
tiktok_links = spark.sql("""
    SELECT *
    FROM view_links
    WHERE url LIKE '%.tiktok.%'
""")

In [29]:
show_more_occurrences(tiktok_links)

As 20 URLs com maior número de ocorrências:




+--------------------------------------------------------+-----------------+
|url                                                     |occurrences_count|
+--------------------------------------------------------+-----------------+
|https://www.tiktok.com/t/ZPRovQbbG/                     |17080            |
|https://vm.tiktok.com/ZMMsANEGX/                        |7399             |
|https://vm.tiktok.com/ZMMsARWnX/                        |7399             |
|https://vm.tiktok.com/ZMMsS6XK8/                        |7399             |
|https://shop.tiktok.com/view/product/1729513948866774547|1032             |
|https://vt.tiktok.com/ZSNPHxyga/                        |578              |
|https://vt.tiktok.com/ZSNPHj1cL/                        |578              |
|https://vt.tiktok.com/ZSNPHUcY8/                        |577              |
|https://vm.tiktok.com/ZMref4edG/                        |372              |
|https://vt.tiktok.com/ZS29aCbpX/                        |294              |

                                                                                

In [35]:
tiktok_links = spark.sql("""
    SELECT COUNT(*) AS count_links
    FROM view_links
    WHERE url LIKE '%.tiktok.%'
""").show()



+-----------+
|count_links|
+-----------+
|     142988|
+-----------+



                                                                                