In [115]:
import pandas as pd
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop words from the NLTK corpus; getwordscount() excludes these tokens from its counts.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally — confirm.
stop_words = list(stopwords.words('english')) 

def parse_file(text_file):
    """Parse an exported chat text file into a DataFrame of messages.

    A record starts on a line that begins with a ``dd/mm/yy`` date. Lines that
    follow and do not start with such a date are treated as continuation lines
    of the same record and are joined to it with single spaces.

    Parameters
    ----------
    text_file : str or path-like
        Path of the chat export; it is read as UTF-8.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp`` (parsed with ``pd.to_datetime``), ``sender`` and
        ``message``. Records for which no sender could be extracted are
        dropped and the index is reset.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_datetime`` when a record header does not match
        the ``'%d/%m/%y, %I:%M %p'`` format.
    """
    # One record = a date-led line plus everything up to the next date-led line (or end of text).
    record_pat = re.compile(r'^(\d\d\/\d\d\/\d\d.*?)(?=^\d\d\/\d\d\/\d\d|\Z)', re.S | re.M)
    # The sender sits between the "am - " / "pm - " that ends the timestamp and the next colon.
    sender_pat = re.compile(r'm - (.*?):')

    with open(text_file, encoding='utf8') as f:
        rows = [m.group(1).strip().replace('\n', ' ') for m in record_pat.finditer(f.read())]

    records = []
    for row in rows:
        stamp = row.split(' - ')[0]
        sender_match = sender_pat.search(row)
        # Records without a "name:" part get an empty sender and are filtered out below.
        sender_name = sender_match.group(1) if sender_match else ''
        # Everything after the first ": " is the message body; empty when there is no ": ".
        _, _, body = row.partition(': ')
        records.append((stamp, sender_name, body))

    df = pd.DataFrame(records, columns=['timestamp', 'sender', 'message'])
    df['timestamp'] = pd.to_datetime(df.timestamp, format='%d/%m/%y, %I:%M %p')
    df = df[df.sender != ''].reset_index(drop=True)
    return df

def dftostring(df):
    """Collapse a pandas object of message strings into one normalised string.

    The values are joined with single spaces, lower-cased and stripped of
    leading/trailing whitespace; the '<media omitted>' placeholder is then
    rewritten as the single token 'media_omitted'.
    """
    joined = ' '.join(df.values)
    normalised = joined.lower().strip()
    return normalised.replace('<media omitted>', 'media_omitted')
    
def getwordscount(messages, top=None):
    """Count the tokens of ``messages`` that are not stop words.

    Parameters
    ----------
    messages : str
        Text to tokenise with ``word_tokenize``.
    top : int, optional
        When truthy, only the ``top`` most common tokens are returned.

    Returns
    -------
    list of (token, count) tuples
        When ``top`` is truthy (the result of ``Counter.most_common(top)``).
    collections.Counter
        Otherwise, the full token counts.
    """
    # Set membership is O(1); testing against the module-level list rescans it for every token.
    stop_set = set(stop_words)
    finalCount = Counter(w for w in word_tokenize(messages) if w not in stop_set)
    if top:
        return finalCount.most_common(top)
    return finalCount
    
# def count_emoji(countall):
#     emoji_lst = []
#     topemoji = [for i in countall if i in emojilst]
    
    

In [111]:
# Parse the chat export into a DataFrame with timestamp / sender / message columns.
df = parse_file('chat.txt')
# df.head()

In [116]:
def generate_analysis(name, df_s, top=50):
    """Print the message count and the most frequent words for a slice of the chat.

    Parameters
    ----------
    name : str
        Label printed in the report (for example a sender name, or 'All').
    df_s : pandas.DataFrame
        Rows to analyse; its ``message`` column is read.
    top : int, optional
        Number of most common words to print (default 50).

    Returns
    -------
    None
        The report is written to stdout.
    """
    print(f'Message send by {name} is {df_s.shape[0]}')
    allmessages_text = dftostring(df_s['message'])
    # Tokenise and count once; the earlier unused full count repeated this work.
    print(f'Top {top} words:', getwordscount(allmessages_text, top))
    print('')
    
# Report on the whole chat first, then once per participant in order of first appearance.
generate_analysis('All', df)
sender = df.sender.unique().tolist()
for s in sender:
    generate_analysis(s, df.loc[df.sender == s])

Message send by All is 1935
Top 50 words: [('media_omitted', 233), ('?', 188), ('h', 171), ('hai', 159), (':', 152), ('https', 124), ('ky', 90), ('ka', 84), ('.', 80), ('nhi', 77), ('ha', 73), ('kya', 64), ('bhi', 63), ('ko', 56), ('mai', 55), (',', 55), ('nai', 51), ('main', 49), ('...', 48), ('fir', 46), ('tu', 44), ('na', 42), ('ek', 39), ('raha', 38), ('kuch', 38), ('hu', 37), ('tha', 34), ('hi', 34), ('ho', 33), ('kr', 33), ('p', 33), ('n', 32), ('ke', 32), ('-', 32), ('abe', 31), ('kar', 30), ('k', 29), ('wo', 28), ('yeh', 28), ('acha', 28), ('toh', 26), ('bol', 25), ('pe', 25), ('voh', 25), ('app', 24), ('hua', 24), ('se', 24), ('atharv', 24), ('sahi', 22), (')', 22)]

Message send by Kanak Dahake Jr. is 825
Top 50 words: [('?', 156), ('hai', 133), ('media_omitted', 92), ('kya', 61), (':', 56), ('ka', 52), ('nai', 51), ('.', 43), ('https', 43), ('mai', 41), ('ha', 39), ('na', 29), ('yeh', 28), ('voh', 25), ('abe', 24), ('ko', 24), ('toh', 23), ('pe', 23), ('kar', 22), ('ke', 22)