In [9]:
import pandas as pd
import json
import numpy as np
import fr_core_news_sm
import nltk
from nltk.stem.snowball import SnowballStemmer
from tqdm.notebook import tqdm
from nltk.corpus import stopwords


#nltk.download()
#nltk.download('stopwords')

In [142]:
my_list=['©',',','?','.','!','ã',' ','"','','ã§a','\n','(','etre','avoir','cela','caest','faire']
final_stopwords_list_1 = stopwords.words('english') + stopwords.words('french') + my_list
nlp = fr_core_news_sm.load()

In [204]:
#Coding problem from Facebook menssenger side
def parse_obj(obj):
    for key in obj:
        if isinstance(obj[key], str):
            obj[key] = obj[key].encode('latin_1').decode('utf-8')
        elif isinstance(obj[key], list):
            obj[key] = list(map(lambda x: x if type(x) != str else x.encode('latin_1').decode('utf-8'), obj[key]))
        pass
    return obj



def load_all_messages(path):
    # Open first the first message
    file = open(path + 'message_1.json')
    
    #Here we have the decoder from messnenger
    data = json.load(file, object_hook=parse_obj)
    
    #
    df = pd.json_normalize(data['messages'])
    
    #Then open the other ones and append them
    #Would need to change that to apply to every number of files needed
    for i in np.arange(2,6) : 
        file = open(path + 'message_'+str(i)+'.json', encoding='utf8')
        data = json.load(file)
        df_temp = pd.json_normalize(data['messages'])
        df=df.append(df_temp)
    return (df)



In [210]:
def clean_data(df):
    #We want a usable time stamp
    df['date_time']=pd.to_datetime(df['timestamp_ms'], unit='ms') 
    
    #Way easier to work with lower cases for text
    df['content']=df['content'].str.lower()
    
    #Let's not work first with every data --> Only text
    df.drop(columns=['timestamp_ms','gifs','is_unsent','photos','type','videos','audio_files','sticker.uri',
                     'call_duration','share.link','share.share_text','users','files'],inplace=True)

    df['year']=df['date_time'].dt.year
    #df=df[df['year']==2021]
    
    #We can exclude some non participing people
    df=df[~df['sender_name'].isin(['ThoJean Delavega','Paul Foulonneau','Harry Hrr','Maximilien Waeters'])]
    
    df['content']=df.content.fillna('')
    #df=df[~df['content'].isna()].reset_index()
    
    
    return (df)

In [211]:
def token_lemma ( df): 
    df['parsed_content'] = df['content'].apply(lambda x: [y.lemma_ for y in  nlp(x)])
    temmenized=df.explode('parsed_content')[['sender_name','parsed_content']]
    temmenized['parsed_content']=temmenized['parsed_content'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    
    temmenized=temmenized[~(temmenized['parsed_content'].isin(final_stopwords_list_1+my_list))]
    temmenized['nb_use']=1
    return(temmenized)

In [212]:
def add_information( df): 
    #number of word per message
    df['word_number'] = df['parsed_content'].apply(lambda x: len(x))
    
    #if the next message same sender =1
    df['answered_himself']=df.sender_name.eq(df.sender_name.shift())
    
    #number of reaction
    df['reactions']=df.reactions.fillna('')
    df['reaction_number'] = df['reactions'].apply(lambda x: len(x))
    
    return(df)

In [241]:
def groupby_sender(df):
    df['is_message']=1
    df['has_reac']=df['reaction_number'] >0
    df_grouped=df.groupby('sender_name').agg({'word_number': ['sum', 'max','mean','median'], 
                                   'answered_himself':['sum','mean'],
                                   'reaction_number':['sum','mean'],
                                    'has_reac':['sum','mean'],
                                   'is_message':['count']})
    return(df_grouped)

In [214]:
path = "../0. Data/"
df=load_all_messages(path)
df=clean_data(df)
temmenized= token_lemma ( df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [242]:
df_2021=df[df['year']==2021]
df_2021= add_information( df_2021)
df_grouped=groupby_sender(df_2021)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [243]:
df_grouped

Unnamed: 0_level_0,word_number,word_number,word_number,word_number,answered_himself,answered_himself,reaction_number,reaction_number,has_reac,has_reac,is_message
Unnamed: 0_level_1,sum,max,mean,median,sum,mean,sum,mean,sum,mean,count
sender_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
Adrien Goutard,1902,48,7.231939,6.0,32,0.121673,155,0.589354,116,0.441065,263
Alex Dns,14638,423,9.419562,7.0,625,0.402188,1172,0.754183,790,0.508366,1554
Alexandre Durand,465,35,7.622951,7.0,13,0.213115,31,0.508197,20,0.327869,61
Antoine Gilles,953,80,5.919255,5.0,35,0.217391,73,0.453416,50,0.310559,161
Antoine Hamon,11307,148,4.963565,4.0,938,0.411765,639,0.280509,519,0.227831,2278
Arthur de Saint-Pierre,6074,167,9.344615,8.0,81,0.124615,393,0.604615,274,0.421538,650
Louis Jss,20242,103,8.01981,5.0,1016,0.402536,1295,0.513074,937,0.371236,2524
Robin Goutard,23688,159,7.043711,5.0,1285,0.382099,931,0.276836,758,0.225394,3363
Thomas Liberge,9424,119,6.044901,5.0,540,0.346376,825,0.529185,553,0.354715,1559


In [240]:
df_2021[df_2021['word_number']==159]

Unnamed: 0,sender_name,reactions,content,date_time,year,parsed_content,word_number,answered_himself,reaction_number,is_message
1215,Robin Goutard,,1/ on pense faire les courses jeudi histoire d...,2021-12-27 20:53:57.183,2021,"[1/, on, pense, faire, le, course, jeudi, hist...",159,True,0,1
