In [None]:
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
import nltk
import wordcloud
import emoji
import re

In [None]:

def _extract_reactions(message):
    """Return the reactions of one message as a ';'-separated string of 'emoji,owner' pairs."""
    pairs = []
    for reaction in message.find_all('div', class_='reactions'):
        # NOTE(review): only the first emoji/initials element inside each
        # 'reactions' div is read — confirm against the export markup whether
        # one container can hold several reactions.
        emoji_div = reaction.find('div', class_='emoji')
        if emoji_div is None:
            continue  # nothing to record without an emoji element
        initials_div = reaction.find('div', class_='initials')
        emoji_owner = initials_div.get('title', '') if initials_div else ''
        pairs.append(f"{emoji_div.text.strip()},{emoji_owner}")
    # An explicit separator keeps several reactions on one message apart.
    return ";".join(pairs)


def _extract_photos(message):
    """Return a list of [owner, href] pairs for every photo link found in one message."""
    photos = []
    for media in message.find_all('div', class_="media_wrap clearfix"):
        a_tag = media.find('a', class_="photo_wrap clearfix pull_left")
        if a_tag is None:
            continue  # media block without a photo link: not a photo
        body_div = a_tag.find_parent('div', class_='body')
        owner_div = body_div.find('div', class_='from_name') if body_div else None
        # Stripped so the owner is comparable with the 'sender' column.
        photo_owner = owner_div.text.strip() if owner_div else None
        photos.append([photo_owner, a_tag.get('href')])
    return photos


def load_messages_to_dataframe(html_file, year=None):
    """
    Parse messages from the HTML file and load them into a Pandas DataFrame.

    Parameters:
        html_file (str): Path to the exported Telegram chat HTML file.
        year (int): Optional. Keep only messages sent in this year.

    Returns:
        pd.DataFrame: One row per parsed message with the columns
            'timestamp' (datetime parsed from the date div's title),
            'sender'    (stripped name, or "(Unknown)" if no 'from_name' div),
            'message'   (stripped text, or "(No text)" if no 'text' div),
            'reactions' (';'-separated "emoji,owner" pairs, "" if none),
            'photo'     (list of [owner, href] pairs, [] if none).
            The columns are present even when no message was parsed.

    Messages without a date div, or whose date does not match the expected
    format, are skipped.
    """
    columns = ['timestamp', 'sender', 'message', 'reactions', 'photo']

    with open(html_file, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    data = []

    # NOTE(review): a multi-word class_ string matches only elements whose class
    # attribute is exactly this string; divs carrying additional classes are
    # not visited — confirm this is intended.
    for message in soup.find_all('div', class_='message default clearfix'):
        date_div = message.find('div', class_='pull_right date details')
        if not date_div or 'title' not in date_div.attrs:
            continue

        date_str = date_div['title']  # Example: "20.01.2024 20:02:09 UTC+01:00"
        try:
            message_date = datetime.strptime(date_str, "%d.%m.%Y %H:%M:%S UTC%z")
        except ValueError:
            continue  # Skip if the date format is unexpected

        # Filter by year (if specified)
        if year and message_date.year != year:
            continue

        from_name_div = message.find('div', class_='from_name')
        sender = from_name_div.text.strip() if from_name_div else "(Unknown)"

        text_div = message.find('div', class_='text')
        message_text = text_div.text.strip() if text_div else "(No text)"

        data.append({
            'timestamp': message_date,
            'sender': sender,
            'message': message_text,
            'reactions': _extract_reactions(message),
            'photo': _extract_photos(message),
        })

    # Explicit columns so an empty result still has the expected schema.
    return pd.DataFrame(data, columns=columns)




# Placeholder: replace with the path to the exported chat HTML file before running.
html_file = "path/to/dir"
# NOTE(review): `author` is not read by the sender filter further down, which
# hard-codes "Jav" — confirm whether the two are meant to be linked.
author = "Jav"
# year=None loads the messages of every year; pass e.g. year=2024 to restrict.
df = load_messages_to_dataframe(html_file, year=None)

In [None]:
# Display the parsed messages (one row per message).
df

In [None]:
# Show the 'photo' column only for rows whose 'photo' list is non-empty.
df.loc[df['photo'].apply(lambda x: len(x) > 0), 'photo']


In [None]:
def get_date(timestamp):
    """Return the calendar-date part of ``timestamp`` (the result of its ``.date()``)."""
    return timestamp.date()

def get_time(timestamp):
    """Return the time-of-day part of ``timestamp`` (the result of its ``.time()``)."""
    return timestamp.time()


def date_to_integer(date_obj):
    """
    Return the 1-based day of the year for a date or datetime.

    January 1st maps to 1 and December 31st to 365 (366 in leap years).

    Args:
    - date_obj (datetime.date or datetime.datetime): The value to convert.
      Anything exposing ``timetuple()`` works, so plain dates are accepted
      as well as datetimes.

    Returns:
    - int: Day of the year, counted from 1.
    """
    # tm_yday is derived from the object's own year/month/day fields, so the
    # result matches the calendar date shown by the object itself.
    return date_obj.timetuple().tm_yday


# German month abbreviations, position 0 = January.
months = pd.Series(['Jan', 'Feb', 'Mär', 'Apr', 'Mai', 'Jun',
                    'Jul', 'Aug', 'Sep', 'Okt', 'Nov', 'Dez'])

# Calendar columns derived from each message's timestamp.
# NOTE(review): element-wise apply is kept on purpose — the parsed timestamps
# can carry different UTC offsets, in which case the column may not support
# the vectorised `.dt` accessor. Confirm before vectorising.
df["date"] = df["timestamp"].apply(get_date)
df["time"] = df["timestamp"].apply(get_time)
df["month"] = df["timestamp"].apply(lambda x: x.month).astype('category')
df["day"] = df["timestamp"].apply(lambda x: x.day)
df["date_int"] = df["timestamp"].apply(date_to_integer)  # day of the year

In [None]:
import nltk
import re
import os

language = "german"

# All messages of the sender "Jav", lower-cased and joined into one string.
# Messages are joined with a newline so that the last word of one message is
# not glued to the first word of the next one.
df_jav = df.loc[df["sender"] == "Jav"]
text_jav = df_jav['message'].str.cat(sep='\n').lower()

# Show the size and a short preview only; the full text is the whole chat.
print(type(text_jav), len(text_jav))
print(text_jav[:500])

sentences = nltk.sent_tokenize(text_jav, language=language)

print(len(sentences), sentences[:10])
type(sentences)

In [None]:
# Get the most common words from the tokens

In [None]:
import spacy
import de_core_news_lg
import de_core_news_md
from collections import Counter

from spacy import displacy

word_counter = Counter()

nlp = spacy.load("de_core_news_lg")

# Words that carry no information here: the filler "mal", plus "no"/"text",
# which come from the lower-cased "(No text)" placeholder of text-less messages.
excluded_words = {"mal", "no", "text"}

# nlp.pipe processes the sentences as a stream and yields one Doc per sentence, in order.
sentence_tokens = []
for doc in nlp.pipe(sentences):
    tokens = [
        token.text.lower()
        for token in doc
        if not (token.is_punct or token.is_space or token.is_stop)
        and token.text not in excluded_words
    ]
    if tokens:
        sentence_tokens.append(" ".join(tokens))

    # Update the word frequencies
    word_counter.update(tokens)

# Sentences are joined with a space so words from neighbouring sentences stay separate.
all_tokens = " ".join(sentence_tokens)

# The 50 most frequent words as (word, count) pairs.
most_common_words = word_counter.most_common(50)
print("Most common words:", most_common_words)
    
    


In [None]:
# Choose which user's concatenated message text the following cells analyse.
text = text_jav

In [None]:
# `nlp` is the de_core_news_lg pipeline loaded in the word-frequency cell above;
# loading the large model a second time is unnecessary.

# Put spaces around sentence punctuation and parentheses, then collapse runs of
# whitespace. Raw strings are required: '\s' in a plain literal is an invalid
# escape sequence.
text = re.sub(r'([.,!?()])', r' \1 ', text)
text = re.sub(r'\s{2,}', ' ', text)

doc = nlp(text)

# Input for the general word cloud: every token that is not punctuation,
# whitespace, a stop word or one of the excluded filler words.
words = ' '.join(
    token.text
    for token in doc
    if not (token.is_punct or token.is_space or token.is_stop)
    and token.text not in {"mal", "no", "text"}
)
# Preview only; the full string covers the whole chat.
print(words[:500])

# general wordcloud

In [None]:

# Build the word cloud from the space-separated `words` string.
wc = wordcloud.WordCloud()
wc.generate(words)

# Render the cloud on a large, high-resolution canvas without axes.
fig, ax = plt.subplots(figsize=(15, 15), dpi=500)
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
plt.show()

# adjectives

In [None]:
# Normalised forms of all adjectives, skipping punctuation, whitespace, stop
# words, the filler words "mal"/"no"/"text" and the name "janosch".
words = ' '.join(
    token.norm_
    for token in doc
    if token.pos_ == 'ADJ'
    and not (token.is_punct or token.is_space or token.is_stop)
    and token.text not in ("mal", "no", "text")
    and token.text.lower() != "janosch"
)

# Build the word cloud from the adjectives.
wc = wordcloud.WordCloud()
wc.generate(words)

# Render the cloud without axes.
fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
plt.show()



In [None]:
# Verbs

In [None]:
# Lemmas of all verbs, skipping punctuation, whitespace, stop words and the
# placeholder words "no"/"text".
words = ' '.join(
    token.lemma_
    for token in doc
    if token.pos_ == 'VERB'
    and token.text not in ("no", "text")
    and not (token.is_punct or token.is_space or token.is_stop)
)

# Build the word cloud from the verb lemmas.
wc = wordcloud.WordCloud()
wc.generate(words)

# Render the cloud without axes.
fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
plt.show()

# nouns

In [None]:
# Normalised forms of all nouns, skipping punctuation, whitespace, stop words
# and the placeholder words "no"/"text".
words = ' '.join(
    token.norm_
    for token in doc
    if token.pos_ == 'NOUN'
    and token.text not in ("no", "text")
    and not (token.is_punct or token.is_space or token.is_stop)
)

# Build the word cloud from the nouns.
wc = wordcloud.WordCloud()
wc.generate(words)

# Render the cloud without axes.
fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# named entities

In [None]:
# Tokens tagged as proper nouns (POS tag PROPN), without punctuation,
# whitespace and the placeholder words "no"/"text".
propn_tokens = [
    token
    for token in doc
    if token.pos_ == 'PROPN'
    and token.text not in ("no", "text")
    and not (token.is_punct or token.is_space)
]

# The cloud input keeps stop words; the frequency list below drops them.
words = ' '.join(token.norm_ for token in propn_tokens)
tokens = [token.norm_ for token in propn_tokens if not token.is_stop]

# The 50 most frequent proper nouns as (word, count) pairs.
word_counter = Counter(tokens)
most_common_words = word_counter.most_common(50)

# Build the word cloud from the proper nouns.
wc = wordcloud.WordCloud()
wc.generate(words)

# Render the cloud without axes.
fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
plt.show()

# emoji analysis

In [None]:
# Top-10 emojis used in the messages of each sender.

def count_emojis(text):
    """
    Count the emojis contained in ``text``.

    Returns a DataFrame with the columns 'Smiley' (the emoji), 'Count' (number
    of occurrences) and 'Type' (the demojized name), most frequent first.
    The frame is empty, but has the 'Smiley' and 'Count' columns, when ``text``
    contains no emoji.
    """
    # Fixing the column up front keeps value_counts('emoji') from raising a
    # KeyError when emoji_list() finds nothing.
    found = pd.DataFrame(emoji.emoji_list(text), columns=['emoji'])
    return (found.value_counts('emoji')
                 .rename_axis('Smiley').rename('Count').reset_index()
                 .assign(Type=lambda x: x['Smiley'].apply(emoji.demojize)))


# `user_names` is not defined in any earlier cell; derive it from the data.
user_names = sorted(df["sender"].unique())

emoji_user_dict = {}
for user in user_names:
    user_text = df.loc[df["sender"] == user, 'message'].str.cat(sep='\n')
    # Rows of [emoji, count, demojized name] for the ten most used emojis.
    emoji_user_dict[user] = count_emojis(user_text).iloc[:10].to_numpy()

print(emoji_user_dict)

In [None]:
# Plot the most used emojis of each user.

# Number of emojis kept per user. The rows in emoji_user_dict are already cut
# to ten per user where the dictionary is built, so larger values add nothing.
top_num = 10

# Per user: rows of [emoji, count, demojized name], highest count first.
top_emojis_per_user = {
    user: sorted(smileys, key=lambda row: int(row[1]), reverse=True)[:top_num]
    for user, smileys in emoji_user_dict.items()
}

# Union of all users' top emojis (users may share some). Sorted so the x-axis
# order is the same on every run; plain set iteration order of strings changes
# between kernel sessions.
unique_smileys = sorted({
    row[2]
    for rows in top_emojis_per_user.values()
    for row in rows
})

# Per user: the count for every emoji on the x-axis, 0 if it is not among that
# user's top emojis.
users = list(emoji_user_dict.keys())
counts_matrix = {
    user: [next((int(smiley[1]) for smiley in top_emojis_per_user[user] if smiley[2] == demojized), 0)
           for demojized in unique_smileys]
    for user in users
}


# Plotting with specified colors
fig, ax = plt.subplots(figsize=(12, 6))

# Color sequence: red, baby blue, green, pink, yellow
colors = ['red', '#89CFF0', 'green', 'pink', 'yellow']

for i, user in enumerate(users):
    ax.plot(
        unique_smileys,
        counts_matrix[user],
        marker='o',  # Markers for better visibility
        label=user,
        color=colors[i % len(colors)]  # Cycle through the color list
    )


ax.set_xlabel('Smileys (Demojinized)', fontsize=12)
ax.set_ylabel('Anzahl Gesendet', fontsize=12)
ax.set_title(f'Top {top_num} Smileys der Gruppe', fontsize=14)
ax.legend(title='User', fontsize=10)
ax.grid(True, linestyle='--', alpha=0.6)
plt.xticks(rotation=90, fontsize=12)
plt.tight_layout()

plt.show()

# Reactions Emojis

In [None]:
# Show the 20 most frequent emojis found in the 'reactions' column.

text = df['reactions'].str.cat(sep='\n')
# Fixing the column up front keeps value_counts('emoji') from raising a
# KeyError when no emoji is found (e.g. a chat without any reactions).
out = (pd.DataFrame(emoji.emoji_list(text), columns=['emoji']).value_counts('emoji')
         .rename_axis('Smiley').rename('Count').reset_index()
         .assign(Type=lambda x: x['Smiley'].apply(emoji.demojize)))
out.iloc[:20]

# Messages Emojis

In [None]:
# Show the 20 most frequent emojis sent in message texts.

text = df['message'].str.cat(sep='\n')
# Fixing the column up front keeps value_counts('emoji') from raising a
# KeyError when the messages contain no emoji at all.
out = (pd.DataFrame(emoji.emoji_list(text), columns=['emoji']).value_counts('emoji')
         .rename_axis('Smiley').rename('Count').reset_index()
         .assign(Type=lambda x: x['Smiley'].apply(emoji.demojize)))
out.iloc[:20]