In [91]:
import pandas as pd
import re
import pandas as pd
import re
import emoji
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize, sent_tokenize
import string
from nltk import FreqDist
from nltk import bigrams, trigrams, pos_tag
from nltk.corpus import stopwords
from textstat.textstat import textstatistics 
from collections import Counter
from chat_analysis import *
import os
import requests
from io import BytesIO
import zipfile
import configparser

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bojansimoski/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bojansimoski/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [97]:
config = configparser.ConfigParser()
# Read the configuration file
config.read('config.ini')
url = config.get('credentials', 'surfdrive_url')

#### This code preprocesses the whatsapp files, creates a separate csv per participant, with a datetime, username, message format

In [87]:
output_chat_data_folder_path = 'output_chat_data/'

def tokenize_messages(message):
    tokens = word_tokenize(message)
    tokens = [word.lower() for word in tokens if word.isalpha()]
    return tokens

# Word Count
def word_count(message):
    return len(tokenize_messages(message))

# Use of Punctuation
def punctuation_count(message):
    return sum(1 for char in message if char in string.punctuation)

# TTR is a measure of lexical richness that compares the number of unique words (types) to 
# the total number of words (tokens) in the text.
# Unlike vocabulary diversity, which is calculated per message,
# TTR can be calculated over varying lengths of text and is normalized for text length.
# def type_token_ratio(message):
#     words = nltk.word_tokenize(message)
#     return len(set(words)) / len(words) if words else 0

# Compute readability indices like Flesch Reading Ease or Gunning Fog Index to assess the complexity of the text.
# These scores can indicate how accessible or challenging the text is for readers.
def readability_score(message):
    return round(textstatistics().flesch_reading_ease(' '.join(message)), 2)

# Lexical density is the proportion of content words (nouns, verbs, adjectives, adverbs) in the text
# compared to the total number of words. A higher lexical density might indicate a more content-focused or formal style
def lexical_density(message):
    content_pos = {'NN', 'VB', 'JJ', 'RB'}  # Nouns, Verbs, Adjectives, Adverbs
    words = nltk.word_tokenize(message)
    tags = pos_tag(words)
    content_words = sum(1 for word, tag in tags if tag in content_pos)
    return content_words / len(words) if words else 0


# Applied on the whole message corpus
# Vocabulary Diversity: This function calculates the ratio of unique words to total words in the aggregated text.
# It first tokenizes the text into words, converts them to lowercase for standardization,
# and then calculates the ratio.
def vocabulary_diversity(corpus):
    words = word_tokenize(corpus)
    words = [word.lower() for word in words if word.isalpha()]
    return len(set(words)) / len(words) if words else 0
    
# Average Sentence Length: This function calculates the average number of words per sentence in
# the aggregated text. It tokenizes the text into sentences and then counts the words in each sentence.
def average_sentence_length(corpus):
    sentences = sent_tokenize(corpus)
    if len(sentences) == 0:
        return 0
    return sum(len(word_tokenize(sentence)) for sentence in sentences) / len(sentences)

# Analyze the frequency of bigrams (pairs of words) or trigrams (triplets of words). 
# This can reveal common phrases or topics in the chat data.
# N-grams can be particularly insightful for identifying colloquial expressions or recurring themes.
def ngram_frequency(corpus, n=2):
    # Tokenize the corpus and generate n-grams
    words = nltk.word_tokenize(corpus)
    if n == 2:
        ngrams = list(bigrams(words))
    elif n == 3:
        ngrams = list(trigrams(words))
    else:
        raise ValueError("n should be 2 for bigrams or 3 for trigrams")

    # Calculate frequency distribution
    ngram_freq = Counter(ngrams)
    return ngram_freq
    
# Function to calculate bigram frequency for each message
def calculate_bigram_frequency(message):
    return corpus_bigram_freq

# Function to calculate trigram frequency for each message
def calculate_trigram_frequency(message):
    return corpus_trigram_freq
    
def pos_distribution(corpus):
    # Tokenize the corpus and get POS tags
    words = nltk.word_tokenize(corpus)
    pos_tags = nltk.pos_tag(words)

    # Calculate frequency distribution of POS tags
    pos_freq = FreqDist(tag for (word, tag) in pos_tags)
    return pos_freq

def word_length_distribution(corpus):
    # Tokenize the corpus and get word lengths
    words = nltk.word_tokenize(corpus)
    word_lengths = [len(word) for word in words if word.isalpha()]

    # Calculate frequency distribution of word lengths
    length_freq = FreqDist(word_lengths)
    return length_freq
    
# The term "stop word frequency" refers to the distribution of stop words in a text or corpu
def stop_word_frequency(corpus):
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(corpus)
    total_words = len(words)
    stop_words_count = sum(1 for word in words if word.lower() in stop_words)

    return stop_words_count / total_words if total_words > 0 else 0
    
def get_top_ngrams(corpus, n=2, min_frequency=3):
    # Tokenize the corpus into words
    words = word_tokenize(corpus)
    
    # Exclude specific punctuation (e.g., ".")
    filtered_words = [word.lower() for word in words if word.isalpha() and word != "."]

    # Generate n-grams
    ngrams_list = list(bigrams(words)) if n == 2 else []

    # Calculate frequency distribution of n-grams
    ngram_freq = Counter(ngrams_list)

    # Filter and print n-grams with frequency greater than min_frequency
    top_ngrams = [(ngram, count) for ngram, count in ngram_freq.items() if count > min_frequency]

    for ngram, count in top_ngrams:
        print(f"{n}-gram: {ngram}, Frequency: {count}")

def get_top_words_by_pos(corpus):
    # Tokenize the corpus into words
    words = word_tokenize(corpus)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]
    print(filtered_words)
    # Perform POS tagging
    pos_tags = pos_tag(filtered_words)

    # Separate words by POS category
    verbs = [word for word, pos in pos_tags if pos.startswith('VB')]
    adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
    nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
    pronouns = [word for word, pos in pos_tags if pos.startswith('PRP')]

    # Count occurrences of each category
    verb_counts = Counter(verbs)
    adjective_counts = Counter(adjectives)
    noun_counts = Counter(nouns)
    pronoun_counts = Counter(pronouns)

    # Get the top 10 used words for each category along with frequency
    top_verbs = [(word, count) for word, count in verb_counts.most_common(10)]
    top_adjectives = [(word, count) for word, count in adjective_counts.most_common(10)]
    top_nouns = [(word, count) for word, count in noun_counts.most_common(10)]
    top_pronouns = [(word, count) for word, count in pronoun_counts.most_common(10)]

    return top_verbs, top_adjectives, top_nouns, top_pronouns
        
def extract_emojis(s):
    # Extract emojis using the emoji package
    all_emojis = ''.join(c for c in s if c in emoji.EMOJI_DATA)
    
    # Regex pattern for keyboard emoticons
    emoticon_pattern = re.compile(r'(:\)|;\)|:\(|:\D|:P|:O|:\||>:O|:\/|:\[|:\]|:\{|:\}|<3)')

    # Find all emoticons in the string
    found_emoticons = emoticon_pattern.findall(s)
    all_emoticons = ''.join(found_emoticons)

    # Combine emojis and emoticons
    all_emojis_emoticons = all_emojis + all_emoticons

    # Remove emojis and emoticons from the original message
    # Create a pattern that matches all found emojis and emoticons
    combined_pattern = re.compile('|'.join(re.escape(c) for c in all_emojis) + '|' + emoticon_pattern.pattern)
    
    cleaned_message = combined_pattern.sub(r'', s)

    # Return a tuple of cleaned message and emojis/emoticons
    return cleaned_message, all_emojis_emoticons or 'N/A'  # Returns 'N/A' if none found


# Regular expression pattern for parsing each txt file line
pattern = re.compile(rb'\[((\d{2}/\d{2}/\d{4})|(\d{2}\.\d{2}\.\d{2})), (\d{2}:\d{2}:\d{2})\] (.*?): (.*)\r\n')

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Create a BytesIO object to hold the downloaded ZIP file content
    zip_content = BytesIO(response.content)

    # Use the zipfile module to extract the contents
    with zipfile.ZipFile(zip_content, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            # Lists to store the extracted data from file
            datetimes = []
            usernames = []
            messages = []
            emojis = []
            # Check if the file has a .txt extension
            if file_info.filename.endswith('.txt'):
                # Extract the content of the text file
                with zip_ref.open(file_info.filename) as txt_file:
                    # Read and print the content of the text file line by line
                    for line in txt_file:
                        match = pattern.match(line)
                        if match:
                            group = match.groups()
                            # Combine date and time into a single string
                            datetime_str = group[0].decode('utf-8') + ' ' + group[3].decode('utf-8')
                            datetimes.append(datetime_str)
                            usernames.append(group[4].decode('utf-8'))
                            messages.append(group[5].decode('utf-8'))
                            emojis.append('')
                            
                
                    # Creating a DataFrame
                    df_chats = pd.DataFrame({
                        'datetime': datetimes,
                        'username': usernames,
                        'message': messages,
                        'emojis': emojis
                    })
                                
                    df_chats['datetime'] = pd.to_datetime(df_chats['datetime'], errors='coerce')
                    # Filter rows where datetime is greater than January 25, 2024 - start of the data donation experiment
                    df_chats = df_chats[df_chats['datetime'] > '2024-01-25']
                    # Applied lexical functions per message
                    df_chats[['message', 'emojis']] = df_chats['message'].apply(extract_emojis).tolist()
                    df_chats['word_count'] = df_chats['message'].apply(word_count)
                    df_chats['punctuation_count'] = df_chats['message'].apply(punctuation_count)
                    df_chats['readability_score'] = df_chats['message'].apply(readability_score)
                    df_chats['lexical_density'] = df_chats['message'].apply(lexical_density)
                    
                    # Extracting unique usernames
                    unique_usernames = df_chats['username'].unique()
                    
                    # Creating a dictionary to hold the DataFrames for each unique username
                    df_dict = {username: df_chats[df_chats['username'] == username] for username in unique_usernames}
                    
                    for df in df_dict.values():
                        # Extracting the username from the first row of the DataFrame
                        username = df['username'].iloc[0]
                        # Sanitize the username to ensure it's safe for use as a file name
                        sanitized_username = "".join([c for c in username if c.isalpha() or c.isdigit() or c==' ']).rstrip()
                        
                        # Lexical features applied on the whole corpus
                        all_messages = '. '.join(df['message'].astype(str))
                        words = word_tokenize(all_messages)
                        unique_words = set(words)
                        ttr = len(unique_words) / len(words) if words else 0
                        df['type_token_ratio'] = ttr
                        df['vocabulary_diversity'] = vocabulary_diversity(all_messages)
                        df['average_sentence_length'] = average_sentence_length(all_messages)    
                        df['stop_word_freq'] = stop_word_frequency(all_messages)
                    
                        # print('USERNAME: ', username,' \n')
                        # print('2-GRAM FREQUENCY \n')
                        # print(get_top_ngrams(all_messages, n=2))
                        # print('\nPOS DISTRIBUTION \n')
                        # fdist = pos_distribution(all_messages)
                        # for f in fdist:
                        #     print (f, fdist[f])
                        # print('\nPOS TOP 10 \n')
                        # print(get_top_words_by_pos(all_messages))
                        # print('\n WORD LENGTH DISTRIBUTION \n')
                        # word_length_d = word_length_distribution(all_messages)
                        # for k in word_length_d:
                        #     print (k, word_length_d[k])
                
                        
                        # Constructing the filename
                        filename = output_chat_data_folder_path + f'{sanitized_username}_chat_llm.csv'
                    
                        filename_whole_corpus = output_chat_data_folder_path +  f'{sanitized_username}_all.txt'
                    
                        f = open(filename_whole_corpus,'w')
                        f.write(all_messages) #Give your csv text here.
                        ## Python will convert \n to os.linesep
                        f.close()
                        # Saving the DataFrame to a CSV file
                        df.to_csv(filename, index=False)

else:
    print(f"Failed to download file. Status code: {response.status_code}")

  df_chats['datetime'] = pd.to_datetime(df_chats['datetime'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'type_token_ratio'] = ttr
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'vocabulary_diversity'] = vocabulary_diversity(all_messages)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'average_sen

In [48]:
# # Convert DataFrame to CSV string
# csv_content = df.to_csv(index=False)

# # Set the target URL where you want to upload the CSV file
# upload_url = 'https://surfdrive.surf.nl/files/index.php/s/yauByOwvpZmc5EE'

# # Step 1: Download the existing ZIP folder
# response = requests.get(upload_url)

# if response.status_code == 200:
#     # Create a BytesIO object to hold the downloaded ZIP file content
#     zip_content = BytesIO(response.content)

#     # Step 2: Add the new CSV file to the downloaded ZIP folder
#     with zipfile.ZipFile(zip_content, 'a') as zip_file:
#         zip_file.writestr('example.csv', csv_content)

#     # Set the target URL for uploading the updated ZIP folder (same as the original)
#     updated_upload_url = upload_url

#     # Update the existing ZIP folder directly using PUT
#     put_response = requests.put(updated_upload_url, data=zip_content.getvalue())

#     # Check if the update was successful
#     if put_response.status_code == 200:
#         print(f"ZIP folder successfully updated at {updated_upload_url}")
#     else:
#         print(f"Failed to update ZIP folder. Status code: {put_response.status_code}")
#         print(put_response.text)
# else:
#     print(f"Failed to download existing ZIP folder. Status code: {response.status_code}")