In [79]:
import pandas as pd
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english')) 

def parse_file(text_file):    
    pat = re.compile(r'^(\d\d\/\d\d\/\d\d.*?)(?=^^\d\d\/\d\d\/\d\d|\Z)', re.S | re.M)
    with open(text_file,encoding='utf8') as f:
        data = [m.group(1).strip().replace('\n', ' ') for m in pat.finditer(f.read())]
    sender = []; message = []; datetime = []
    for row in data:
        datetime.append(row.split(' - ')[0])
        try:
            s = re.search('m - (.*?):', row).group(1)
            sender.append(s)
        except:
            sender.append('')
        try:
            message.append(row.split(': ', 1)[1])
        except:
            message.append('')
    df = pd.DataFrame(zip(datetime, sender, message), columns=['timestamp', 'sender', 'message'])
    df['timestamp'] = pd.to_datetime(df.timestamp, format='%d/%m/%y, %I:%M %p')
    df = df[df.sender != ''].reset_index(drop=True)    
    return df

def dftostring(df):
    messages = ' '.join(df.values)
    messages = messages.lower()
    messages = messages.strip()
    messages = messages.replace('<media omitted>','<media_omitted>')
    return(messages)
    
def getwordscount(messages,top=None):
    finalCount = Counter()
    words = [w for w in messages.split(" ") if w not in stop_words]
    finalCount.update(words)  # update final count using the words list
    if top:
        return finalCount.most_common(top)
    else:
        return finalCount 

In [80]:
df = parse_file('chat.txt')
df.head()

Unnamed: 0,timestamp,sender,message
0,2018-11-15 20:32:00,Kanak Dahake Jr.,<Media omitted>
1,2018-11-15 20:47:00,P1,😆😆😆
2,2018-11-15 20:47:00,P1,kama nimitya
3,2018-11-15 20:47:00,P1,aaji kde
4,2018-11-15 20:50:00,Kanak Dahake Jr.,Sala 😂😂


In [81]:
def generate_analysis(name,df_s):
    print(f'Message send by {name} is {df_s.shape[0]}')
    allmessages_text = dftostring(df['message'])
    print('Top 50 words:',getwordscount(allmessages_text,50))
    print('')

In [82]:
sender = list(df.sender.unique())
generate_analysis('All',df)
for s in sender:
    generate_analysis(s,df[df.sender==s])

Message send by All is 1935
Top 50 words: [('<media_omitted>', 233), ('h', 168), ('hai', 146), ('', 124), ('ky', 89), ('nhi', 77), ('ka', 73), ('ha', 70), ('bhi', 63), ('ko', 55), ('mai', 54), ('main', 49), ('nai', 46), ('tu', 42), ('fir', 42), ('kya', 40), ('ek', 39), ('raha', 38), ('kuch', 37), ('hu', 36), ('na', 36), ('kr', 33), ('p', 33), ('tha', 33), ('hi', 33), ('ke', 32), ('-', 32), ('ho', 31), ('abe', 31), ('n', 31), ('kar', 29), ('k', 29), ('acha', 28), ('wo', 27), ('toh', 26), ('yeh', 26), ('bol', 25), ('voh', 25), ('pe', 24), ('se', 24), ('atharv', 24), ('hua', 23), ('sahi', 22), ('koi', 22), ('app', 21), ('kam', 20), ('😂', 20), ('aur', 20), ('ab', 20), ('😅', 20)]

Message send by Kanak Dahake Jr. is 825
Top 50 words: [('<media_omitted>', 233), ('h', 168), ('hai', 146), ('', 124), ('ky', 89), ('nhi', 77), ('ka', 73), ('ha', 70), ('bhi', 63), ('ko', 55), ('mai', 54), ('main', 49), ('nai', 46), ('tu', 42), ('fir', 42), ('kya', 40), ('ek', 39), ('raha', 38), ('kuch', 37), ('hu'