In [1]:
import pandas as pd
import re
import pandas as pd
import re
import emoji
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize, sent_tokenize
import string
from nltk import FreqDist
from nltk import bigrams, trigrams, pos_tag
from nltk.corpus import stopwords
from textstat.textstat import textstatistics 
from collections import Counter
from chat_analysis import *
import os
import requests
from io import BytesIO
import zipfile
import configparser
import numpy as np
import inspect
import sys
# access parent directory from notebooks directory
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bojansimoski/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bojansimoski/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bojansimoski/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bojansimoski/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Notebook for preprocessing of participants chat files

In [2]:
config = configparser.ConfigParser()
# Read the configuration file
config.read('config.ini')
url = config.get('credentials', 'surfdrive_url_movez_chat')

#### This code preprocesses the whatsapp files, creates a separate csv per participant, with a datetime, username, message format

In [3]:
output_chat_data_folder_path = 'output_chat_data/'
# Regular expression pattern for parsing each txt file line
pattern_rest = re.compile(rb'\[((\d{2}/\d{2}/\d{4})|(\d{2}\.\d{2}\.\d{2})), (\d{2}:\d{2}:\d{2})\] (.*?): (.*)\r\n')
pattern_te = re.compile(rb'(\d{1,2}/\d{1,2}/\d{2,4},? \d{2}:\d{2}|\d{2}-\d{2}-\d{4} \d{2}:\d{2}) - (.+?): (.*)\n')
# Send an HTTP GET request to the URL (\d{2}-\d{2}-\d{4} \d{2}:\d{2}) 
response = requests.get(url)
# dictionary with all the individuals dataframes
df_dict = {}
# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Create a BytesIO object to hold the downloaded ZIP file content
    zip_content = BytesIO(response.content)
    csv_file_to_read = 'movez_chats/movez_participants.csv'
    # Use the zipfile module to extract the contents
    with zipfile.ZipFile(zip_content, 'r') as zip_ref:
        zip_ref.extract(csv_file_to_read)
        #extract the participants anonymization file
        df_users = pd.read_csv(csv_file_to_read)

        for file_info in zip_ref.infolist():
            # Lists to store the extracted data from file
            datetimes = []
            usernames = []
            messages = []
            emojis = []
            # Check if the file has a .txt extension
            if file_info.filename.endswith('.txt'):
                # Extract the content of the text file
                with zip_ref.open(file_info.filename) as txt_file:
                    # Read and print the content of the text file line by line
                    if file_info.filename.endswith('Thabo.txt') or file_info.filename.endswith('Ying.txt'):
                        for line in txt_file:
                                match = pattern_te.match(line)
                                if match:
                                    group = match.groups()
                                    datetimes.append(group[0].decode('utf-8'))
                                    usernames.append(group[1].decode('utf-8'))
                                    messages.append(group[2].decode('utf-8'))
                                    emojis.append('')
                    else:
                        for line in txt_file:
                            match = pattern_rest.match(line)
                            if match:
                                group = match.groups()
                                # Combine date and time into a single string
                                datetime_str = group[0].decode('utf-8') + ' ' + group[3].decode('utf-8')
                                datetimes.append(datetime_str)
                                usernames.append(group[4].decode('utf-8'))
                                messages.append(group[5].decode('utf-8'))
                                emojis.append('')                            
                            
                
                    # Creating a DataFrame
                    df_chats = pd.DataFrame({
                        'datetime': datetimes,
                        'username': usernames,
                        'message': messages,
                        'emojis': emojis
                    })
                                
                    df_chats['datetime'] = pd.to_datetime(df_chats['datetime'], errors='coerce')
                    # Filter rows where datetime is greater than January 25, 2024 - start of the data donation experiment
                    df_chats = df_chats[df_chats['datetime'] > '2024-01-25']
                    # Applied lexical functions per message
                    df_chats[['message', 'emojis']] = df_chats['message'].apply(extract_emojis).tolist()
                    df_chats['word_count'] = df_chats['message'].apply(word_count)
                    df_chats['punctuation_count'] = df_chats['message'].apply(punctuation_count)
                    df_chats['readability_score'] = df_chats['message'].apply(readability_score)
                    df_chats['lexical_density'] = df_chats['message'].apply(lexical_density)
                    
                    # Extracting unique usernames
                    unique_usernames = df_chats['username'].unique()
                    
                    # Creating a dictionary to hold the DataFrames for each unique username
                    df_dict.update({username: df_chats[df_chats['username'] == username] for username in unique_usernames})
    
    df_summary = pd.DataFrame(columns=['username','word_count_median','punctuation_count_avg','vocabulary_diversity','emoji_avg','lexical_density','readability_score'])
    
    #creating corpus level features here
    for df in df_dict.values():
        # Extracting the username from the first row of the DataFrame
        df['username'] = df['username'].map(df_users.set_index('username')['index'])
        username = df['username'].iloc[0]
        # Sanitize the username to ensure it's safe for use as a file name
        sanitized_username = "".join([c for c in username if c.isalpha() or c.isdigit() or c==' ']).rstrip()
        
        # Lexical features applied on the whole corpus
        all_messages = '. '.join(df['message'].astype(str))
        # words = word_tokenize(all_messages)
        # unique_words = set(words)
        # ttr = len(unique_words) / len(words) if words else 0

        df_summary = pd.concat([df_summary, pd.DataFrame([{'username': username, 'word_count_median': df['word_count'].median(),
                                        'punctuation_count_avg': round(df['punctuation_count'].mean(),2), 
                                        'vocabulary_diversity': round(vocabulary_diversity(all_messages),2),
                                        'emoji_avg': df['emojis'].count()/ df.shape[0], 'lexical_density' : round(df['lexical_density'].mean(),2),
                                        'readability_score' : round(df['readability_score'].mean(),2) }])], ignore_index=True)

        # saved to separate csv files
        df_grams = get_top_ngrams(all_messages, n=2)
        df_grams.to_csv(output_chat_data_folder_path + username + '_ngram.csv',index=False)
        
        df_pos_distribution = pos_distribution(all_messages)
        df_pos_distribution.to_csv(output_chat_data_folder_path + username + '_pos_distribution.csv',index=False)

        df_top_pos = get_top_words_by_pos(all_messages)
        df_top_pos.to_csv(output_chat_data_folder_path + username + '_top_10_pos.csv',index=False)
        
        df_word_length_distribution = word_length_distribution(all_messages)
        df_word_length_distribution.to_csv(output_chat_data_folder_path + username + '_word_distribution.csv',index=False)

        
        # Constructing the filename
        filename = output_chat_data_folder_path + f'{sanitized_username}_chat_llm.csv'
        filename_whole_corpus = output_chat_data_folder_path +  f'{sanitized_username}_all.txt'
        f = open(filename_whole_corpus,'w')
        f.write(all_messages) #Give your csv text here.
        ## Python will convert \n to os.linesep
        f.close()
        # Saving the DataFrame to a CSV file
        df.to_csv(filename, index=False)

        df_summary.to_csv(output_chat_data_folder_path+'0_chat_preprocess_summary.csv',index=False)

else:
    print(f"Failed to download file. Status code: {response.status_code}")

  df_chats['datetime'] = pd.to_datetime(df_chats['datetime'], errors='coerce')
  df_chats['datetime'] = pd.to_datetime(df_chats['datetime'], errors='coerce')
  df_chats['datetime'] = pd.to_datetime(df_chats['datetime'], errors='coerce')
  df_chats['datetime'] = pd.to_datetime(df_chats['datetime'], errors='coerce')
  df_chats['datetime'] = pd.to_datetime(df_chats['datetime'], errors='coerce')
  df_summary = pd.concat([df_summary, pd.DataFrame([{'username': username, 'word_count_median': df['word_count'].median(),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['username'] = df['username'].map(df_users.set_index('username')['index'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats 