In [2]:
import pandas as pd
import json
import numpy as np
import fr_core_news_sm
import nltk
from nltk.stem.snowball import SnowballStemmer
from tqdm.notebook import tqdm
from nltk.corpus import stopwords


#nltk.download()
#nltk.download('stopwords')

In [3]:
# Extra tokens to exclude from the word-frequency statistics, on top of the
# NLTK stop-word lists: punctuation, emoji, whitespace-only tokens, mojibake
# residue, frequent verbs/adverbs, interjections and a few names.
# NOTE(review): the empty string '' appears twice; harmless for the `isin`
# filter this list feeds, but it can be de-duplicated.
non_noun=['©',',','?','.','!','ã',' ','"','','ã§a','\n','(','etre','avoir','cela','caest','faire','aller',
          'tout','bon','plus','bien','ahah','voir','être','si','pouvoir','haha','non',';)','🤣','quand','aussi','quoi',
         'bah','c’','trop','dire','venir','oui',')','comme','savoir','^^','petit','falloir','vouloir','@antoine','ah','chez','-',
         'prendre','liberg','..','...','hahaha',':)','ha',':','','🐟','moins','pense','passer','encore','n\'','  ','   ','aa',
         'da','de','e','ra','thomas','taire']
# Full exclusion list: NLTK English + French stop words followed by the custom tokens above.
my_list = stopwords.words('english') + stopwords.words('french') + non_noun
# Language pipeline loaded from the fr_core_news_sm package (presumably the small
# spaCy French model); token_lemma calls it on every message to get lemmas.
nlp = fr_core_news_sm.load()

In [4]:
# Encoding problem on the Facebook Messenger side: the exported text comes out
# as mojibake (UTF-8 bytes read as Latin-1), so strings are repaired while the
# JSON is decoded.
def _fix_mojibake(text):
    """Re-interpret ``text`` as UTF-8 bytes that were wrongly decoded as Latin-1.

    Returns the repaired string, or ``text`` unchanged when it cannot be
    round-tripped (it contains non-Latin-1 characters, or its Latin-1 bytes are
    not valid UTF-8), which means it was not mojibake in the first place.
    """
    try:
        return text.encode('latin_1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return text


def parse_obj(obj):
    """``json.load`` object hook that repairs mis-decoded strings in ``obj``.

    Parameters
    ----------
    obj : dict
        A JSON object as decoded by the ``json`` module.

    Returns
    -------
    dict
        The same dict, modified in place: every ``str`` value, and every
        ``str`` item of a ``list`` value, is passed through the Latin-1 ->
        UTF-8 repair. Other values are left untouched.
    """
    for key, value in obj.items():
        if isinstance(value, str):
            obj[key] = _fix_mojibake(value)
        elif isinstance(value, list):
            obj[key] = [_fix_mojibake(item) if isinstance(item, str) else item
                        for item in value]
    return obj



def load_all_messages(path):
    """Load every ``message_<i>.json`` export file into one DataFrame.

    Files are read in order starting from ``message_1.json`` and the function
    stops at the first missing number, so any number of files is supported.

    Parameters
    ----------
    path : str
        Prefix prepended to ``'message_<i>.json'`` by plain string
        concatenation, so it must end with a path separator.

    Returns
    -------
    pandas.DataFrame
        The normalized ``'messages'`` entries of all files, stacked in file
        order. Each file keeps its own row labels, so the index contains
        duplicates; callers that need unique labels should reset the index.

    Raises
    ------
    FileNotFoundError
        If ``message_1.json`` does not exist.
    """
    def _read_messages(file_path):
        # The context manager guarantees the handle is closed; every file is read as
        # UTF-8 so the first file no longer depends on the platform default.
        with open(file_path, encoding='utf8') as file:
            data = json.load(file, object_hook=parse_obj)
        return pd.json_normalize(data['messages'])

    # The first file is mandatory: let a missing file propagate to the caller.
    frames = [_read_messages(path + 'message_1.json')]

    # Then read message_2.json, message_3.json, ... until one is missing.
    i = 2
    while True:
        try:
            frames.append(_read_messages(path + 'message_' + str(i) + '.json'))
        except FileNotFoundError:
            break
        i += 1

    # Single concat instead of repeated DataFrame.append (removed in pandas 2.0).
    return pd.concat(frames)



In [5]:
def clean_data(df):
    """Return a cleaned, text-only copy of the raw message frame.

    The input frame is left untouched; all work is done on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw messages with at least the columns ``timestamp_ms``, ``content``
        and ``sender_name``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with:
        * ``date_time`` built from ``timestamp_ms`` (milliseconds), plus the
          derived ``year``, ``hour`` and ``weekday`` columns;
        * ``content`` lower-cased, missing values replaced by ``''``;
        * the non-text columns dropped (those that are present);
        * rows whose ``sender_name`` is the empty string removed.
    """
    # Columns not needed for the text analysis. Not every export contains all
    # of them, hence errors='ignore' in the drop below.
    unused_columns = ['timestamp_ms','gifs','is_unsent','photos','type','videos','audio_files','sticker.uri',
                      'call_duration','share.link','share.share_text','users','files']

    df = df.copy()

    # We want a usable time stamp
    df['date_time'] = pd.to_datetime(df['timestamp_ms'], unit='ms')

    # Way easier to work with lower case for text
    df['content'] = df['content'].str.lower()

    # Let's not work with every kind of data at first --> only text
    df = df.drop(columns=unused_columns, errors='ignore')

    df['year'] = df['date_time'].dt.year
    df['hour'] = df['date_time'].dt.hour
    df['weekday'] = df['date_time'].dt.weekday

    # We can exclude some non-participating people.
    # .copy() so the assignment below does not write into a slice of the frame.
    df = df[~df['sender_name'].isin([''])].copy()

    df['content'] = df['content'].fillna('')

    return df

In [6]:
def token_lemma(df):
    """Lemmatize every message and return one row per lemma.

    Side effect: a ``parsed_content`` column holding the list of lemmas of each
    message is added to ``df`` itself.

    Returns a frame with the columns ``sender_name``, ``parsed_content`` (one
    lemma per row, reduced to its ASCII characters after NFKD normalization)
    and ``nb_use`` (always 1).
    """
    def _lemmas(text):
        # Run the module-level `nlp` pipeline and keep each token's lemma.
        return [token.lemma_ for token in nlp(text)]

    df['parsed_content'] = df['content'].apply(_lemmas)

    # One row per lemma, keeping track of who wrote it.
    temmenized = df.explode('parsed_content')[['sender_name', 'parsed_content']]

    # Strip accents and any other non-ASCII character from the lemmas.
    decomposed = temmenized['parsed_content'].str.normalize('NFKD')
    ascii_bytes = decomposed.str.encode('ascii', errors='ignore')
    temmenized['parsed_content'] = ascii_bytes.str.decode('utf-8')

    temmenized['nb_use'] = 1
    return temmenized

def clean_token(df, extra_stopwords=()):
    """Drop the rows of ``df`` whose ``parsed_content`` is a stop word.

    Kept separate from the lemmatization because that step is slow: the
    stop-word list can be adjusted and re-applied without lemmatizing again.

    Parameters
    ----------
    df : pandas.DataFrame
        Token frame with a ``parsed_content`` column.
    extra_stopwords : iterable of str, optional
        Additional tokens to exclude for this call, on top of ``my_list``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose token is in neither list.
    """
    # `my_list` already holds the NLTK English and French stop words plus the
    # custom tokens; the previous reference to `final_stopwords_list_1` pointed
    # at a name that is not defined anywhere in the notebook.
    stop_words = set(my_list).union(extra_stopwords)
    return df[~df['parsed_content'].isin(stop_words)]
    

In [7]:
def add_information(df):
    """Add per-message metrics to ``df`` (modified in place) and return it.

    Columns written:
    * ``word_number``       - length of the ``parsed_content`` list;
    * ``answered_himself``  - True when the sender equals the sender of the
                              previous row;
    * ``reactions``         - same column with missing values replaced by ``''``;
    * ``reaction_received`` - length of the ``reactions`` entry.
    """
    senders = df['sender_name']

    # number of tokens per message
    df['word_number'] = df['parsed_content'].apply(len)

    # same sender as on the previous row?
    df['answered_himself'] = senders.eq(senders.shift())

    # number of reactions (an empty string counts as zero)
    df['reactions'] = df['reactions'].fillna('')
    df['reaction_received'] = df['reactions'].apply(len)

    return df

In [8]:
def groupby_sender(df):
    """Aggregate the message metrics per sender.

    Parameters
    ----------
    df : pandas.DataFrame
        Messages with the columns ``sender_name``, ``word_number``,
        ``answered_himself``, ``reaction_received`` and any number of
        ``reac_from_<sender>`` columns. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per sender. The aggregated metrics are labelled with
        ``(column, statistic)`` tuples (e.g. ``('word_number', 'sum')``); they
        are followed by one ``reac_from_<sender>`` column holding the per-sender
        sum of that column.
    """
    # Helper columns live on a copy so the caller's frame is left untouched.
    df = df.assign(is_message=1, has_reac=df['reaction_received'] > 0)
    filter_col = [col for col in df if col.startswith('reac_from_')]

    df_grouped = df.groupby('sender_name').agg({'word_number': ['sum', 'max', 'mean', 'median'],
                                                'answered_himself': ['sum', 'mean'],
                                                'reaction_received': ['sum', 'mean'],
                                                'has_reac': ['sum', 'mean'],
                                                'is_message': ['count']})

    # agg() yields two-level columns while the reaction sums below have a single
    # level; recent pandas refuses to merge frames with different numbers of
    # column levels, so flatten to one level of (column, statistic) tuples first.
    df_grouped.columns = df_grouped.columns.to_flat_index()

    df_grouped_has_reacfrom = df[filter_col + ['sender_name']].groupby('sender_name').sum()
    return df_grouped.merge(df_grouped_has_reacfrom, left_index=True, right_index=True)

In [9]:
def reactions_from(df):
    """Add one boolean ``reac_from_<sender>`` column per distinct sender.

    ``df`` is modified in place and returned. For each value of
    ``df.sender_name.unique()``, the new column is True on the rows where any
    extracted reaction actor equals that sender.

    NOTE(review): assumes every entry of ``df['reactions']`` is a list of
    dicts carrying an ``'actor'`` key (or an empty value) - confirm against
    the export format.
    """
    sender_names=df.sender_name.unique()
    
    # Expand the reactions column into a frame; the loop below addresses its
    # columns by integer position, so this presumably yields columns 0..n-1
    # with one reaction per column - TODO confirm for the pandas version in use.
    df_react=pd.json_normalize(df['reactions'])
    nb_col=df_react.shape[1]
    
    # Replace each reaction column by the 'actor' field extracted from it.
    for col_i in np.arange(nb_col):
        df_react[col_i]=pd.json_normalize(df_react[col_i])['actor']
        
    # The assignment aligns df_react with df on the index; presumably df must
    # carry a default 0..n-1 index for the rows to line up - verify at the call site.
    for sender in sender_names:
        df['reac_from_'+sender]=(df_react==sender).any(axis=1)
        
    return(df)

In [10]:
def by_sender(df,temmenized,sender_name):
    """Compute the headline statistics for one sender, or for everybody.

    Parameters
    ----------
    df : message frame with ``sender_name``, ``date_time``, ``content``,
        ``hour`` and ``weekday`` columns.
    temmenized : token frame with ``sender_name`` and ``parsed_content``.
    sender_name : name to filter both frames on, or ``'all'`` for no filter.

    Returns
    -------
    tuple ``(day_max_message, hours, day, word_max_freq)``:
    * the two dates with the most messages and their counts;
    * message counts per hour, transposed and divided by 360;
    * message counts per weekday, transposed and divided by 50;
    * the two most frequent tokens and their counts.
    """
    if sender_name != 'all':
        sender_rows = df['sender_name'] == sender_name
        df = df[sender_rows]
        sender_tokens = temmenized['sender_name'] == sender_name
        # Rows with missing values are only discarded in the per-sender case.
        temmenized = temmenized[sender_tokens].dropna()

    messages_per_date = df['date_time'].dt.date.value_counts()
    day_max_message = messages_per_date.head(2)

    # NOTE(review): 360 and 50 are fixed scaling constants whose meaning is
    # not visible here; they are kept exactly as they were.
    per_hour = df[['content', 'hour']].groupby(by='hour').count()
    hours = per_hour.T / 360

    per_weekday = df[['content', 'weekday']].groupby(by='weekday').count()
    day = per_weekday.T / 50

    token_counts = temmenized['parsed_content'].value_counts()
    word_max_freq = token_counts.head(2)

    return day_max_message, hours, day, word_max_freq

# NOTE(review): `df_2021` and `temmenized` are only created by cells further
# down, so on a fresh kernel this call fails with the NameError recorded in
# this cell's output. Run it after the pipeline cells, or move it below them.
by_sender(df_2021,temmenized,'Thomas Liberge')

NameError: name 'df_2021' is not defined

In [None]:
# Slow cell: load the export, clean it and lemmatize every message.
# `path` is used as a plain string prefix, hence the trailing slash.
path = "../0. Data/"
df=load_all_messages(path)
df=clean_data(df)
# token_lemma also adds the 'parsed_content' column to `df` itself, which
# add_information needs in the next cell.
temmenized= token_lemma ( df)

In [None]:
# Stop-word filtering is kept apart from the (slow) lemmatization cell so the
# stop-word list can be tuned and re-applied cheaply.
temmenized= clean_token ( temmenized)
# Each export file contributed its own row labels; give the messages a unique
# 0..n-1 index before the per-row reaction columns are attached.
df=df.reset_index(drop=True)

# Per-message metrics, then one boolean reac_from_<sender> column per sender.
df= add_information( df)
df= reactions_from(df)

# Yearly subsets, re-indexed from 0.
df_2021=df[df['year']==2021].reset_index(drop=True)
df_2020=df[df['year']==2020].reset_index(drop=True)

# Per-sender aggregates for each year.
df_grouped_2021=groupby_sender(df_2021)
df_grouped_2020=groupby_sender(df_2020)

In [None]:


# Write the workbook: one 'grouped' sheet with both yearly summaries, then one
# sheet per sender (plus 'all') holding the four by_sender tables.
with pd.ExcelWriter("../2. Output/Data.xlsx", engine='openpyxl', mode='w') as writer:

    # Yearly aggregates stacked on the same sheet at fixed row offsets.
    df_grouped_2021.to_excel(writer, sheet_name='grouped', startrow=1)
    df_grouped_2020.to_excel(writer, sheet_name='grouped', startrow=15)

    sender_list = np.concatenate((df.sender_name.unique(), ['all']))
    for sender in sender_list:
        print(sender)
        day_max_message, hours, day, word_max_freq = by_sender(df, temmenized, sender)

        # The four tables go on the sender's sheet, five rows apart.
        sender_tables = (day_max_message, hours, day, word_max_freq)
        for table, first_row in zip(sender_tables, (0, 5, 10, 15)):
            table.to_excel(writer, sheet_name=sender, startrow=first_row)
            
        

In [None]:
# Distinct sender names. Series.unique() already returns the array of values;
# calling .values() on that result raised AttributeError.
df.sender_name.unique()

In [None]:
# Token frequencies once the two-space token is left out.
v2 = temmenized.loc[temmenized['parsed_content'].ne('  ')]
v2['parsed_content'].value_counts()