In [None]:
#Importing Libraries
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# #read the text file
# path = "data2.txt"
# f = open(path, 'r', encoding='utf-8')
# data = f.read()
# print(data)

In [None]:
# Read the whole exported chat into memory as one string.
# The context manager closes the file handle as soon as the read finishes
# (the name `d` stays bound, but to a closed file).
with open('data3.txt', 'r', encoding='utf-8') as d:
    data = d.read()
data

In [None]:

# Regular expression pattern for one message line:
#   <d/m/yy>[,] <h:mm>[ am|pm] - <username>: <message>
pattern = r'(\d{1,2}\/\d{1,2}\/\d{2}),?\s(\d{1,2}:\d{2})\s?(am|pm)?\s-\s(.+?):\s(.+)'
# The timestamp prefix on its own. A line that carries it but has no
# "<username>: " part does not match `pattern` and is not stored as a row.
header_pattern = r'\d{1,2}\/\d{1,2}\/\d{2},?\s\d{1,2}:\d{2}\s?(?:am|pm)?\s-\s'

# Compile both expressions once, outside the loop
message_re = re.compile(pattern)
header_re = re.compile(header_pattern)

# Column names, in the same order as the capture groups of `pattern`
row_columns = ['only_date', 't', 'ampm', 'username', 'message']

# Create an empty list to store the rows of the DataFrame
rows = []
current = None  # the row that following continuation lines belong to, if any

for line in data.split('\n'):
    match = message_re.search(line)
    if match is not None:
        # New message: date, time, am/pm, username, message text
        current = dict(zip(row_columns, match.groups()))
        rows.append(current)
    elif header_re.search(line) is not None:
        # Timestamped line without "<username>: " -> skipped, exactly as before;
        # it also ends the message that was being accumulated.
        current = None
    elif current is not None and line.strip():
        # No timestamp at all: this line continues a multi-line message.
        # Previously such lines were dropped, truncating the message to its first line.
        current['message'] += '\n' + line

# Create a Pandas DataFrame from the list of rows.
# Passing the columns explicitly keeps the expected schema even when nothing matched,
# so later cells do not fail with a KeyError on an empty chat.
df = pd.DataFrame(rows, columns=row_columns)

In [None]:
# Parse the day/month/two-digit-year strings, then store them back as
# 'YYYY-MM-DD' text (the column ends up holding strings, not datetimes).
parsed_dates = pd.to_datetime(df['only_date'], format='%d/%m/%y')
df['only_date'] = parsed_dates.dt.strftime('%Y-%m-%d')

In [None]:
# Join the clock reading and its am/pm marker into a single "<h:mm> <marker>" string.
# A missing marker is formatted literally into the string by the f-string.
df['time'] = [f"{clock} {marker}" for clock, marker in zip(df['t'], df['ampm'])]

# The two source columns are no longer needed once they are combined
df = df.drop(columns=['t', 'ampm'])

In [None]:
def convert_time(time_str):
    """Convert a chat time string to 24-hour 'HH:MM' text.

    Parameters
    ----------
    time_str : str
        Either a 12-hour reading followed by an am/pm marker (e.g. '2:30 pm'),
        or a reading whose marker is absent or not am/pm (e.g. '14:30' or
        '14:30 None', which is what the notebook produces when the export
        has no am/pm suffix).

    Returns
    -------
    str
        The time formatted as zero-padded 'HH:MM' on a 24-hour clock.

    Raises
    ------
    ValueError
        If the clock part cannot be parsed with the selected format.
    """
    clock, _, marker = time_str.strip().partition(' ')
    marker = marker.strip()
    if marker.lower() in ('am', 'pm'):
        # 12-hour reading with an explicit marker
        parsed = pd.to_datetime(f"{clock} {marker}", format='%I:%M %p')
    else:
        # No usable marker: the reading is already on a 24-hour clock
        parsed = pd.to_datetime(clock, format='%H:%M')
    return parsed.strftime('%H:%M')

# Normalise every combined time string to 24-hour 'HH:MM'
df['only_time'] = df['time'].map(convert_time)

# The intermediate 'time' column has served its purpose
df = df.drop(columns='time')


In [None]:
# Combine the 'YYYY-MM-DD' date and the 'HH:MM' time into one
# space-separated string per message
df['date'] = df['only_date'].str.cat(df['only_time'], sep=' ')

In [None]:
# Break the 'YYYY-MM-DD' string into its three pieces; all three stay strings
date_parts = df['only_date'].str.split('-', expand=True)
df[['year', 'month_num', 'day']] = date_parts


In [None]:
# Month number ('01'..'12') -> full month name
month_stamps = pd.to_datetime(df['month_num'], format='%m')
df['month'] = month_stamps.dt.strftime('%B')

# Calendar date -> full weekday name
date_stamps = pd.to_datetime(df['only_date'])
df['day_name'] = date_stamps.dt.strftime('%A')

In [None]:
# Separate 'HH:MM' into hour and minute columns.
# Both pieces come from a string split, so they are strings, not integers.
time_parts = df['only_time'].str.split(':', expand=True)
df[['hour', 'minute']] = time_parts

In [None]:
# Display the DataFrame after the date and time columns have been derived
df

In [None]:
#add period column that shows which one-hour window (24 hour format) each message falls in
def _hour_window(hour):
    """Return the '<start>-<end>' label of the one-hour window starting at `hour`.

    `hour` may be a zero-padded string such as '09' (what the 'hour' column holds)
    or an integer. It is converted to int before the comparisons; comparing the
    raw string with 23 / 0 was never true, so 23:xx used to be labelled '23-24'.
    The start of the label keeps the text of `hour`, the end is not zero-padded,
    and the last window of the day wraps around to '00'.
    """
    start = int(hour)
    begin = '00' if start == 0 else str(hour)
    end = '00' if start == 23 else str(start + 1)
    return begin + "-" + end


period = [_hour_window(hour) for hour in df['hour']]
df['period'] = period

In [None]:
# Display the DataFrame again, now including the 'period' column
df

In [None]:
# Total messages = number of rows in the parsed frame
len(df)

In [None]:
# Total number of whitespace-separated words across all messages
words = [word for message in df['message'] for word in message.split()]

print(len(words))

In [None]:
# Number of media files shared: rows whose text is exactly the media placeholder
is_media = df['message'] == '<Media omitted>'
len(df[is_media])

In [None]:
# Number of links shared: collect every URL that URLExtract finds in any message
from urlextract import URLExtract

extract = URLExtract()
links = [url for message in df['message'] for url in extract.find_urls(message)]

print(len(links))

In [None]:
import matplotlib.pyplot as plt

# Messages per user, as returned by value_counts (most frequent first)
x = df['username'].value_counts()
user_names, msg_count = x.index, x.values

# One bar per user; vertical tick labels keep the names legible
plt.bar(user_names, msg_count)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Share of all messages written by each user, as a percentage rounded to 2 decimals.
# The columns are named through rename_axis / reset_index(name=...) rather than by
# renaming whatever labels reset_index happens to produce: the old mapping
# ('index' -> 'name', 'user' -> 'percent') did not match the real column labels,
# so the 'percent' column was never created.
share = ((df['username'].value_counts() / df.shape[0]) * 100).round(2)
new_df = share.rename_axis('name').reset_index(name='percent')

new_df

In [None]:
import string

from functools import lru_cache


@lru_cache(maxsize=None)
def _load_stop_words(path='stop_hinglish.txt'):
    """Read the stop-word file once and return its lower-cased words as a frozenset."""
    with open(path, 'r', encoding='utf-8') as f:
        return frozenset(f.read().lower().split())


def remove_stop_words(message):
    """Lower-case `message` and drop every word listed in 'stop_hinglish.txt'.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    str
        The remaining words joined by single spaces.

    Notes
    -----
    Words are compared against a set of whole stop words. The previous version
    tested `word not in <file contents as one string>`, which is a substring
    test and therefore also removed any word that merely occurs inside a stop
    word. The file is read once and cached instead of being re-opened (and
    never closed) for every message.
    """
    stop_words = _load_stop_words()
    kept = [word for word in message.lower().split() if word not in stop_words]
    return " ".join(kept)

def remove_punctuation(message):
    """Return `message` with every character of `string.punctuation` deleted."""
    strip_table = str.maketrans('', '', string.punctuation)
    return message.translate(strip_table)

#Data Cleaning
# Keep rows that are neither group notifications nor media placeholders.
# .copy() makes `temp` an independent frame, so the column assignment below
# does not write into a slice of `df` (which triggers SettingWithCopyWarning).
keep = (df['username'] != 'group_notification') & (df['message'] != '<Media omitted>')
temp = df[keep].copy()

# Stop words are removed first, then punctuation
temp['message'] = temp['message'].apply(remove_stop_words).apply(remove_punctuation)

#Draw the wordCloud
from wordcloud import WordCloud

plt.figure(figsize=(20, 10))

# All cleaned messages joined into one space-separated text
corpus = temp['message'].str.cat(sep=" ")

wc = WordCloud(width=1000, height=750, min_font_size=10, background_color='white')
cloud = wc.generate(corpus)
plt.imshow(cloud)

In [None]:
#Find the Top 20 Most Common Words
from collections import Counter

# Keep rows that are neither group notifications nor media placeholders.
# The placeholder must be spelled exactly '<Media omitted>': the lower-case
# '<media omitted>' used before never matched, so media rows were counted.
# .copy() avoids assigning into a slice of `df` (SettingWithCopyWarning).
keep = (df['username'] != 'group_notification') & (df['message'] != '<Media omitted>')
temp = df[keep].copy()

# Stop words are removed first, then punctuation
temp['message'] = temp['message'].apply(remove_stop_words).apply(remove_punctuation)

# Flatten the cleaned messages into one list of words
words = [word for message in temp['message'] for word in message.split()]

# The 20 most frequent words with their counts (column 0 = word, column 1 = count)
most_common_df = pd.DataFrame(Counter(words).most_common(20))
most_common_df

In [None]:
import emoji

# Collect every single character that is a key of emoji.EMOJI_DATA.
# NOTE(review): the check is per character; multi-character emoji would be
# counted piecewise, so confirm that this is acceptable.
emojis = [char for message in df['message'] for char in message if char in emoji.EMOJI_DATA]

# Every distinct emoji with its count, most frequent first
emoji_counts = Counter(emojis)
pd.DataFrame(emoji_counts.most_common(len(emoji_counts)))

In [None]:
#Time-based Analysis

#Monthly Chats Timeline

# Message count per (year, month number, month name)
timeline = df.groupby(['year', 'month_num', 'month'])['message'].count().reset_index()

# Axis label for each row, in the form "<month name>-<year>"
month_timeline = [f"{month}-{year}" for month, year in zip(timeline['month'], timeline['year'])]
timeline['time'] = month_timeline

#draw plot
plt.figure(figsize=(12, 6))
plt.plot(timeline['time'], timeline['message'])
plt.xticks(rotation='vertical')
plt.show()

In [None]:
#Daily Timeline

# Message count per calendar date ('only_date' holds 'YYYY-MM-DD' strings)
daily_timeline = df.groupby('only_date')['message'].count().reset_index()

plt.figure(figsize=(12, 6))
dates, counts = daily_timeline['only_date'], daily_timeline['message']
plt.plot(dates, counts)
plt.show()


In [None]:
#Day-based Activity Map

# Messages per weekday name, most active day first
busy_day = df['day_name'].value_counts()
day_labels, day_counts = busy_day.index, busy_day.values

plt.figure(figsize=(12, 6))
plt.bar(day_labels, day_counts, color='purple')
plt.title("Busy Day")
plt.xticks(rotation='vertical')
plt.show()

In [None]:
#Monthly Activity Map

# Messages per month name, most active month first
busy_month = df['month'].value_counts()
month_labels, month_counts = busy_month.index, busy_month.values

plt.figure(figsize=(12, 6))
plt.bar(month_labels, month_counts, color='red')
plt.title("Busy Month")
plt.xticks(rotation='vertical')
plt.show()

In [None]:
#Which Time User Remains Active

import seaborn as sns

# Message counts per weekday (rows) and one-hour period (columns);
# combinations with no messages become 0
activity = df.pivot_table(index='day_name', columns='period', values='message',
                          aggfunc='count')
activity = activity.fillna(0)

plt.figure(figsize=(18, 9))
sns.heatmap(activity)
plt.yticks(rotation='vertical')
plt.show()