In [None]:
import json
import pandas as pd
import datetime
import re
import sys

In [None]:
# Run the Config notebook before anything else in this notebook.
# NOTE(review): presumably this is where ownChatId is set manually (a later
# cell's error message refers to "the Config File") — confirm against Config.ipynb.
%run Config.ipynb

In [None]:
# Load the Telegram export from result.json (UTF-8). The context manager
# closes the file handle as soon as the JSON has been parsed.
with open('result.json', encoding='utf8') as file:
    telegram = json.load(file)

# Top-level sections of the export that are kept for later cells.
about = telegram['about']
chats = telegram['chats']['list']

In [None]:
# Try to derive ownChatId from the export: the chat of type 'saved_messages'
# carries the id that is used as the account's own id. When such a chat exists,
# its id replaces whatever value ownChatId held before this cell.
saved_messages = None
for chat in chats:
    if chat['type'] == 'saved_messages':
        # A missing/None message list is counted as zero messages.
        savedCount = len(chat.get('messages') or [])
        saved_messages = {'id': str(chat['id']), 'name': 'Saved Messages',
                          'sent': savedCount,
                          'received': 0,
                          'total': savedCount
                         }
        ownChatId = chat['id']
        break

if ownChatId == 0:
    sys.exit("Your Chat ID couldn't be retrieved automatically. Please set it in the Config File.")

if saved_messages is None:
    # No "Saved Messages" chat in the export: fall back to an empty row so
    # that cells which use saved_messages do not fail with a NameError.
    saved_messages = {'id': str(ownChatId), 'name': 'Saved Messages',
                      'sent': 0,
                      'received': 0,
                      'total': 0
                     }

# Show the id that the rest of the notebook will use (previously unreachable
# because it was placed after sys.exit inside the if-branch).
print(ownChatId)

In [None]:
# Make duplicate chat names distinguishable: the n-th repetition of a name
# (n counted from zero) gets n trailing spaces appended.
names = []
for chat in chats:
    if chat['type'] not in ['personal_chat', 'private_group', 'private_supergroup']:
        continue
    if str(chat.get('name')) == 'None':
        continue
    originalName = chat['name']
    # Number of earlier chats that carried exactly this name.
    chat['name'] = originalName + ' ' * names.count(originalName)
    names.append(originalName)

In [None]:
# Flatten all non-action messages of personal chats and private (super)groups
# into records that remember the chat name and chat type they came from.
messages = [
    {'chat': chat['name'], 'type': chat['type'], 'message': msg}
    for chat in chats
    for msg in chat['messages']
    if "action" not in msg
    and chat['type'] in ['personal_chat', 'private_group', 'private_supergroup']
]

# Split by sender: a message counts as "sent" when its from_id equals ownChatId.
sentMessages = [entry['message'] for entry in messages
                if entry['message'].get('from_id') == ownChatId]
receivedMessages = [entry['message'] for entry in messages
                    if entry['message'].get('from_id') != ownChatId]

# The same split, restricted to one-to-one chats.
sentPrivateMessages = [entry['message'] for entry in messages
                       if entry['type'] == 'personal_chat'
                       and entry['message'].get('from_id') == ownChatId]
receivedPrivateMessages = [entry['message'] for entry in messages
                           if entry['type'] == 'personal_chat'
                           and entry['message'].get('from_id') != ownChatId]

In [None]:
def _firstPlainMessage(chatMessages, fromOwner):
    """Return the first entry of type 'message' without a 'file' value.

    With fromOwner=True only messages whose from_id equals ownChatId are
    considered, with fromOwner=False only those whose from_id differs.
    Returns None when the chat contains no such message.
    """
    for candidate in chatMessages:
        isOwn = candidate.get('from_id') == ownChatId
        if isOwn == fromOwner and candidate.get('type') == 'message' and candidate.get('file') is None:
            return candidate
    return None


# Collect, per named chat, the earliest plain message in each direction.
firstSentMessages = []
firstReceivedMessages = []
for chat in chats:
    if chat['type'] not in ['personal_chat', 'private_group', 'private_supergroup']:
        continue
    if str(chat.get('name')) == 'None':
        continue

    ownFirst = _firstPlainMessage(chat['messages'], True)
    if ownFirst is not None:
        firstSentMessages.append({'text': ownFirst.get('text'),
                                  'chat': str(chat['name']),
                                  'datetime': ownFirst.get('date')
                                 })

    otherFirst = _firstPlainMessage(chat['messages'], False)
    if otherFirst is not None:
        if chat['type'] in ['private_group', 'private_supergroup']:
            # In groups the label names the sender and the group.
            label = str(otherFirst['from']) + ' (in Group \"' + chat['name'] + '\")'
        else:
            label = str(chat['name'])
        firstReceivedMessages.append({'text': otherFirst.get('text'),
                                      'chat': label,
                                      'datetime': otherFirst.get('date')
                                     })

# Order both lists chronologically; the overall first message is then at index 0.
firstSentMessages.sort(key=lambda entry: entry['datetime'])
firstReceivedMessages.sort(key=lambda entry: entry['datetime'])

firstSentMessage = firstSentMessages[0]
firstReceivedMessage = firstReceivedMessages[0]

In [None]:
# Tally every sent message under a raw content key:
#   - messages with a file: media_type, else mime_type, else 'others'
#   - messages with a photo: 'image'
#   - everything else: 'text'
contentTypes = {}
for sentMsg in sentMessages:
    if sentMsg.get('file'):
        rawKind = sentMsg.get('media_type') or sentMsg.get('mime_type') or 'others'
    elif sentMsg.get('photo'):
        rawKind = 'image'
    else:
        rawKind = 'text'
    contentTypes[rawKind] = contentTypes.get(rawKind, 0) + 1


def _drain(*rawKinds):
    """Remove the given raw keys from contentTypes and return their combined count."""
    return sum(contentTypes.pop(rawKind, 0) for rawKind in rawKinds)


# Fold the raw keys into display categories. Every key consumed here is
# removed from contentTypes, so what remains afterwards is "Other Files".
groupedCounts = [
    ('Text', _drain('text')),
    ('Image', _drain('image', 'image/jpeg', 'image/png', 'image/gif')),
    ('Video', _drain('video_file', 'video/mp4', 'video/mpeg')),
    ('Audio', _drain('audio_file', 'audio/x-wav')),
    ('Sticker', _drain('sticker')),
    ('Voice/Video Message', _drain('voice_message', 'video_message')),
]
gifCount = _drain('animation')
groupedCounts.append(('Other Files', sum(contentTypes.values())))
# 'GIF' comes after 'Other Files' in the resulting dict.
groupedCounts.append(('GIF', gifCount))

# Keep only the categories that actually occur.
cleanedContentTypes = {label: count for label, count in groupedCounts if count != 0}

In [None]:
# One row per message with its calendar date (first 10 characters of the
# 'date' string); the empty 'Count' column only exists to be counted.
messageDates = pd.DataFrame(
    [[datetime.date.fromisoformat(msg['message']['date'][0:10]), ''] for msg in messages],
    columns=['Date', 'Count'])
messageDateFrequencies = messageDates.groupby('Date', as_index=False).count()
messageDateFrequencies['Weekday'] = messageDateFrequencies['Date'].apply(lambda x: x.weekday())

# groupby sorts by date, so the first/last rows are the earliest/latest day.
base = messageDateFrequencies['Date'].iloc[-1]
daysSinceStart = (base - messageDateFrequencies['Date'].iloc[0]).days
# Every calendar day from the latest back to AND INCLUDING the earliest one.
# (range(daysSinceStart) dropped the first day, so its messages were counted
# without the day itself appearing in the denominators below.)
generalDates = [base - datetime.timedelta(days=x) for x in range(daysSinceStart + 1)]

# Sum only the 'Count' column (the 'Date' column holds date objects that
# cannot be summed) and make sure all seven weekdays are present, so the
# weekday names below always line up with the right numbers.
weekdayCounts = (messageDateFrequencies.groupby('Weekday')['Count'].sum()
                 .reindex(range(7), fill_value=0))
# Number of calendar days per weekday (0 = Monday) within the covered period.
daysPerWeekday = [sum(1 for date in generalDates if date.weekday() == weekday)
                  for weekday in range(7)]

messagesPerWeekday = pd.DataFrame({
    'Weekday': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'Count': weekdayCounts.values,
    # max(..., 1) avoids a division by zero for weekdays outside the period.
    'perDay': [count / max(days, 1) for count, days in zip(weekdayCounts.values, daysPerWeekday)],
})

In [None]:
def _messagesPerMinute(privateMessages, countColumn):
    """Count messages per minute of the day.

    The minute label 'HH:MM' is taken from each message's 'date' string
    (characters 11 up to, but excluding, the last three).
    Returns a frame with the columns 'Time' and countColumn.
    """
    minuteRows = [[datetime.time.fromisoformat(msg['date'][11:-3]).strftime('%H:%M'), '']
                  for msg in privateMessages]
    return (pd.DataFrame(minuteRows, columns=['Time', countColumn])
            .groupby('Time', as_index=False).count())


sentMessageTimes = _messagesPerMinute(sentPrivateMessages, 'Sent')
receivedMessageTimes = _messagesPerMinute(receivedPrivateMessages, 'Received')

# One row for each of the 1440 minutes of a day, joined with both counts.
totalMessagesPerHour = pd.DataFrame(
    [datetime.time(h, m).strftime('%H:%M') for h in range(24) for m in range(60)],
    columns=['Time'])
totalMessagesPerHour = (
    totalMessagesPerHour
    .join(sentMessageTimes.set_index('Time'), on='Time')
    .join(receivedMessageTimes.set_index('Time'), on='Time')
    # Minutes without any message are 0, not NaN; otherwise 'Total' below
    # would be NaN whenever only one direction has messages in that minute.
    .fillna({'Sent': 0, 'Received': 0})
)

# Convert per-minute counts into an average hourly rate per day. Guard the
# divisor so an export covering a single day does not divide by zero.
observedDays = max(daysSinceStart, 1)
totalMessagesPerHour['Sent'] = totalMessagesPerHour['Sent'] / observedDays * 60
totalMessagesPerHour['Received'] = totalMessagesPerHour['Received'] / observedDays * 60
totalMessagesPerHour['Total'] = totalMessagesPerHour['Sent'] + totalMessagesPerHour['Received']

totalMessagesPerHour

In [None]:
# Per named chat: how many non-action messages were sent by the owner,
# received from others, and exchanged in total.
chatStats = []
for chat in chats:
    if chat['type'] not in ['personal_chat', 'private_group', 'private_supergroup']:
        continue
    if str(chat.get('name')) == 'None':
        continue
    regularMessages = [msg for msg in chat['messages'] if "action" not in msg]
    sentCount = sum(1 for msg in regularMessages if msg.get('from_id') == ownChatId)
    receivedCount = len(regularMessages) - sentCount
    totalCount = sentCount + receivedCount
    chatStats.append({'id': str(chat['id']), 'name': chat['name'],
                      'sent': sentCount,
                      'received': receivedCount,
                      'total': totalCount
                     })
# The "Saved Messages" row is added last, before sorting.
chatStats.append(saved_messages)

# Busiest chats first.
frequentChats = pd.DataFrame(sorted(chatStats, key=lambda row: row['total'], reverse=True))

In [None]:
# Voice messages split by direction; a missing 'duration_seconds' counts as 0.
sentVoiceMessages = [{'chat': msg['chat'], 'duration': msg['message'].get('duration_seconds', 0), 'type': 'sent'}
                     for msg in messages if msg['message'].get('media_type') == 'voice_message'
                     and msg['message'].get('from_id') == ownChatId]
receivedVoiceMessages = [{'chat': msg['chat'], 'duration': msg['message'].get('duration_seconds', 0), 'type': 'received'}
                         for msg in messages if msg['message'].get('media_type') == 'voice_message'
                         and msg['message'].get('from_id') != ownChatId]


def _durationPerChat(voiceRecords):
    """Sum the 'duration' of the given voice message records per chat.

    The columns are fixed up front so an empty record list still yields a
    frame that can be grouped, and only 'duration' is summed so the string
    column 'type' never takes part in the aggregation.
    Returns a frame indexed by 'chat' with the single column 'duration'.
    """
    return (pd.DataFrame(voiceRecords, columns=['chat', 'duration', 'type'])
            .groupby('chat')[['duration']].sum())


voiceMessageTotal = _durationPerChat(sentVoiceMessages + receivedVoiceMessages)
voiceMessageSent = _durationPerChat(sentVoiceMessages)
voiceMessageReceived = _durationPerChat(receivedVoiceMessages)

# Outer merge keeps chats that have voice messages in only one direction;
# the overlapping 'duration' columns get the suffixes _x (sent) / _y (received).
voiceMessages = pd.merge(voiceMessageSent, voiceMessageReceived, on='chat', how='outer')
voiceMessages = pd.merge(voiceMessages, voiceMessageTotal, on='chat')

voiceMessages = (
    voiceMessages
    .rename(columns={'duration_x': 'Sent', 'duration_y': 'Received', 'duration': 'Total'})
    # A chat without voice messages in one direction has duration 0 there.
    .fillna({'Sent': 0, 'Received': 0})
    .sort_values(by=['Total'], ascending=False)
)

In [None]:
# Lower-cased string form of every sent message whose 'text' is not the empty string.
texts = []
for sentMsg in sentMessages:
    rawText = sentMsg['text']
    if rawText != '':
        texts.append(str(rawText).lower())

# Occurrences per distinct text, most frequent first. The empty 'Count'
# column only exists so that groupby(...).count() has something to count.
frequentMessages = (
    pd.DataFrame(texts, columns=['Text'])
    .assign(Count='')
    .groupby('Text', as_index=False)
    .count()
    .sort_values(by=['Count'], ascending=False)
)

In [None]:
# Matches runs of characters that are NOT inside one of the listed emoji
# code point ranges (note the leading ^ in the character class).
regex_pattern = re.compile(
    "[^" + "".join((
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F3FA",  # symbols & pictographs
        "\U0001F400-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
    )) + "]+",
    flags=re.UNICODE,
)


def emojify(text):
    """Return text with every character outside the emoji ranges removed.

    Only characters covered by regex_pattern's ranges survive; a text
    without any of them yields the empty string.
    """
    return regex_pattern.sub('', text)

# One row per emoji character found in the sent texts (texts without any
# emoji contribute nothing), then occurrences per emoji, most used first.
emojiRows = [[character, ''] for text in texts for character in emojify(text)]
emojis = pd.DataFrame(emojiRows, columns=['Emoji', 'Count'])
frequentEmojis = (
    emojis
    .groupby('Emoji', as_index=False)
    .count()
    .sort_values(by=['Count'], ascending=False)
)

In [None]:
# Sent messages whose media_type marks them as stickers.
stickerMessages = list(filter(lambda sent: sent.get('media_type') == 'sticker', sentMessages))

# One row per sticker that carries a 'sticker_emoji' entry.
stickerEmojis = pd.DataFrame(
    [[sticker['sticker_emoji'], ''] for sticker in stickerMessages if 'sticker_emoji' in sticker],
    columns=['Emoji', 'Count'],
)

# Occurrences per sticker emoji, most used first.
frequentStickerEmojis = (
    stickerEmojis
    .groupby('Emoji', as_index=False)
    .count()
    .sort_values(by=['Count'], ascending=False)
)

In [None]:
# Map every message to the first day of its month: the leading 'YYYY-MM-'
# part of the date string is kept and the day is replaced by '01'.
monthRows = []
for entry in messages:
    monthStart = datetime.date.fromisoformat(entry['message']['date'][0:8] + '01')
    monthRows.append([monthStart, ''])
messageMonths = pd.DataFrame(monthRows, columns=['Month', 'Count'])

# Number of messages per calendar month.
messageMonthFrequencies = messageMonths.groupby('Month', as_index=False).count()

In [None]:
# Messages per month of the year, keyed by the two-digit month taken from
# characters 5-6 of each message's date string.
monthKeys = ['%02d' % monthNumber for monthNumber in range(1, 13)]
monthCounts = (
    pd.DataFrame([[msg['message']['date'][5:7], ''] for msg in messages], columns=['Month', 'Count'])
    .groupby('Month')['Count'].count()
    # Always keep all twelve months, so the month names assigned below line
    # up with the right counts even when some months have no messages.
    .reindex(monthKeys, fill_value=0)
)

# Number of calendar days per month (1 = January) within generalDates.
daysPerMonth = [sum(1 for date in generalDates if date.month == monthNumber)
                for monthNumber in range(1, 13)]

messagesPerMonth = pd.DataFrame({
    'Month': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    'Count': monthCounts.values,
    # max(..., 1) avoids a division by zero for months without any day in generalDates.
    'perDay': [count / max(days, 1) for count, days in zip(monthCounts.values, daysPerMonth)],
})

In [None]:
# Parse the timestamp of every message that belongs to a named chat and
# put the messages into chronological order.
chronological = sorted(
    ({'datetime': datetime.datetime.strptime(entry['message']['date'], '%Y-%m-%dT%H:%M:%S'),
      'chat': entry['chat']}
     for entry in messages if str(entry['chat']) != 'None'),
    key=lambda parsed: parsed['datetime'],
)

# Walk through the timeline once, attaching to each message its zero-based
# overall position and its one-based running count within its chat.
chatCounter = {}
messageDatetimes = []
for position, parsed in enumerate(chronological):
    chatName = parsed['chat']
    chatCounter[chatName] = chatCounter.get(chatName, 0) + 1
    msg = {'time': parsed['datetime'],
           'totalCount': position,
           'chat': chatName,
           'chatCount': chatCounter[chatName]}
    messageDatetimes.append(msg)