In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re
import emoji
from collections import Counter
import matplotlib.dates as mdates

In [None]:
# Default figure size (width, height) used by every plot in this notebook.
sns.set(rc={"figure.figsize": (11, 4)})

In [None]:
# Path of the chat CSV to analyse (placeholder value).
chat_file_path = r"PATH"

# Read the file into a DataFrame and preview its first rows.
gf_df = pd.read_csv(filepath_or_buffer=chat_file_path)
gf_df.head(n=5)

In [None]:
# Give the columns short, uniform names, then lower-case every header.
column_renames = {
    'Date2': 'Date',
    'UserName': 'sender',
    'MessageBody': 'Message',
}
gf_df.rename(columns=column_renames, inplace=True)
gf_df.columns = [name.lower() for name in gf_df.columns]

# Join the separate date and time strings into one timestamp and index by it.
gf_df['datetime'] = pd.to_datetime(gf_df['date'] + ' ' + gf_df['time'])
gf_df.set_index('datetime', inplace=True)

# Drop the link/quote columns and the date/time strings that were just
# merged into the index.
unused_columns = ['medialink', 'quotedmessage', 'quotedmessagedate',
                  'quotedmessagetime', 'date', 'time']
gf_df.drop(columns=unused_columns, inplace=True)

# Relabel the raw media types with display names; the column is held as a
# categorical, and values absent from the mapping become NaN.
media_dict = {
    "image": "Image",
    "GIF": "GIF",
    "document": "Document",
    "video": "Video",
    "recorded audio": "Audio",
    "sticker": "Sticker",
}
gf_df['mediatype'] = gf_df['mediatype'].astype('category').map(media_dict)

In [None]:
# Summary statistics, schema and a preview of the cleaned chat frame.
# A notebook renders only the final expression of a cell, so the describe()
# table must be printed explicitly; as a bare statement it was discarded.
print(gf_df.describe())
gf_df.info()
gf_df.head()

In [None]:
# bf_deleted = gf_df['message'].value_counts()['You deleted this message']
# gf_deleted = gf_df['message'].value_counts()['This message was deleted']
# print(f'bf deleted {bf_deleted} messages, while gf deleted {gf_deleted}')
# Non-null messages per sender, as a table and as a count plot.
message_counts = gf_df.groupby('sender')[['message']].count()
print(message_counts)

msg_count_plot = sns.catplot(data=gf_df, x='sender', kind='count')
msg_count_plot.fig.suptitle('Messages')

In [None]:

# Rows: media type, columns: sender; ordered by how many of each type "bf" sent.
media_counts = gf_df.groupby(['mediatype', 'sender']).size()
mediatype_by_from = media_counts.unstack().sort_values("bf", ascending=False)
print(mediatype_by_from)
media_order = list(mediatype_by_from.index)

# One panel per sender, sharing the y axis so the media types line up.
fig, (bf_ax, gf_ax) = plt.subplots(ncols=2, sharey=True)
fig.tight_layout(pad=5.0)
fig.suptitle("Total Media Sent")

# estimator=len makes each bar the number of rows of that media type for
# the sender being drawn.
for sender_name, ax in (('bf', bf_ax), ('gf', gf_ax)):
    sns.barplot(
        y='mediatype',
        x='mediatype',
        ax=ax,
        data=gf_df[(gf_df.sender == sender_name)],
        orient='h',
        estimator=len,
        order=media_order,
    )
    ax.set_title(sender_name)
    ax.set(xlabel="", ylabel="")

# Mirror the left-hand panel so the two sets of bars grow away from the centre.
bf_ax.invert_xaxis()
bf_ax.yaxis.tick_right()


In [None]:
# Per-message length in characters and in words.
# The vectorised .str accessor propagates missing messages as <NA> instead of
# raising (len(NaN) is a TypeError), so rows without text are simply left out
# of the sums and means below. Splitting on any whitespace run (split() with
# no argument) avoids counting the empty strings that split(' ') produces
# between consecutive spaces, and counts words separated by newlines.
message_text = gf_df['message']
gf_df['letter_count'] = message_text.str.len().astype('Int64')
gf_df['word_count'] = message_text.str.split().str.len().astype('Int64')

# Totals per sender.
sum_letters = gf_df.groupby('sender')['letter_count'].sum()
sum_words = gf_df.groupby('sender')['word_count'].sum()
print(sum_words)
print(sum_letters)

# Averages per sender.
mean_length = gf_df.groupby('sender')['letter_count'].mean()
mean_words = gf_df.groupby('sender')['word_count'].mean()

print(mean_length)
print(mean_words)

In [None]:
# How often each words-per-message value occurs; plot the 20 most frequent.
words_in_message = gf_df['word_count'].value_counts()
top_20_word_counts = words_in_message.iloc[:20]

word_count_ax = top_20_word_counts.plot.bar()
word_count_ax.set_xlabel('Word Count')
word_count_ax.set_ylabel('Frequency')
word_count_ax.set_title('Number of Words per Message')

In [None]:
# Calendar date of every message, taken from the datetime index.
gf_df['date'] = gf_df.index.date

# The ten dates with the most messages, as horizontal bars.
top_days = gf_df['date'].value_counts().head(10)
top_days_ax = top_days.plot.barh()
top_days_ax.set_xlabel('Message Count')
top_days_ax.set_ylabel('Dates')
top_days_ax.set_title('Top 10 Messaging Days')

In [None]:
# Hour of day (0-23) of every message, taken from the datetime index.
gf_df['hour'] = gf_df.index.hour

# Messages per hour, ordered by hour rather than by frequency.
busy_hours = gf_df['hour'].value_counts()
busy_hours.sort_index(inplace=True)

# Vertical bars: the hour runs along x and the count along y, so the axis
# labels must be assigned that way round (they were previously swapped).
busy_hours.plot.bar()
plt.xlabel('Hour')
plt.ylabel('Message Count')
plt.title('Messages by Hour of Day')

In [None]:
# NOTE(review): this preview is not the last expression of its cell, so a
# notebook will not render it; move it to its own cell if the output is wanted.
gf_df.head(10)
def _emoji_table():
    """Return a container that supports ``char in table`` for emoji characters.

    The ``emoji`` package changed its public lookup table between releases:
    newer versions expose ``EMOJI_DATA``; older ones expose ``UNICODE_EMOJI``,
    which is either a flat ``{emoji: name}`` dict or a dict keyed by language
    code (``{'en': {emoji: name}, ...}``). Testing a character against the
    language-keyed form never matches, so the English table is unwrapped.
    """
    table = getattr(emoji, 'EMOJI_DATA', None)
    if table is not None:
        return table
    legacy = emoji.UNICODE_EMOJI
    # Language-keyed layout has an 'en' entry; the flat layout does not.
    return legacy.get('en', legacy)


def extract_emojis(series):
    """Collect every emoji character found in an iterable of messages.

    Parameters
    ----------
    series : iterable
        Messages to scan. Each item is converted with ``str()``, so
        non-string entries such as NaN are tolerated and contribute nothing.

    Returns
    -------
    list of str
        One entry per emoji occurrence, in the order encountered.

    Notes
    -----
    Messages are scanned one code point at a time, so an emoji built from
    several code points is only reported through whichever of its individual
    code points are themselves in the lookup table.
    """
    known_emojis = _emoji_table()
    return [
        char
        for message in series
        for char in str(message)
        if char in known_emojis
    ]
# Split the chat by sender and pull the emojis out of each side's messages.
is_gf = gf_df['sender'] == "gf"
is_bf = gf_df['sender'] == "bf"
gf_messages = gf_df[is_gf]
bf_messages = gf_df[is_bf]
bfmojis = extract_emojis(bf_messages['message'])
gfmojis = extract_emojis(gf_messages['message'])

# One row per emoji with each sender's usage count. Naming the columns up
# front keeps the frames well-formed even when a sender used no emojis.
bf_emoji_df = pd.DataFrame(list(Counter(bfmojis).items()), columns=['Emoji', 'bf'])
gf_emoji_df = pd.DataFrame(list(Counter(gfmojis).items()), columns=['Emoji', 'gf'])

# Outer join: an emoji used by only one sender must still count towards that
# sender's total (an inner join silently dropped it); the missing side is 0.
emoji_df = pd.merge(bf_emoji_df, gf_emoji_df, on='Emoji', how='outer')
emoji_df[['bf', 'gf']] = emoji_df[['bf', 'gf']].fillna(0).astype(int)
emoji_df.sort_values(by='gf', inplace=True, ascending=False)
print(emoji_df.head(10))

# Overall emoji usage and bf's usage relative to gf's.
bf_sum = emoji_df['bf'].sum()
gf_sum = emoji_df['gf'].sum()
# The ratio is undefined when gf sent no emojis at all.
perc_Diff = (bf_sum-gf_sum)/(gf_sum) if gf_sum else float('nan')
print(bf_sum,gf_sum,perc_Diff)


In [None]:
# Path of the call log to load (placeholder value).
call_log_file = r"PATH"

# The context manager closes the handle even if reading fails; the file was
# previously opened and never closed.
with open(call_log_file, "r") as log_file:
    call_log_file_string = log_file.read()
#call_log_file_string = call_log_file_string.replace(r"PATH","")

# SECURITY(review): eval() executes whatever code the file contains. Only run
# this on a file you produced yourself; if the file holds nothing but a dict
# literal, ast.literal_eval is the safe replacement.
call_dict = eval(call_log_file_string)

In [None]:
# Dump every entry of the call dictionary, each followed by a two-line rule.
separator = "#################################\n#################################"
for call_key, call_text in call_dict.items():
    print(call_key, call_text, separator, sep="\n")

In [None]:
 for call,call_string in call_dict.items():
    
    # Remove random characters that start call lines
    call_string = re.sub(r'.+(O|o)utgoing','Outgoing',call_string)
    call_string = re.sub(r'.+(I|i)ncoming','Incoming',call_string)
    call_string = re.sub(r'.+(M|m)issed','Missed',call_string)
    # Remove the random characters that begin some calls before "Call info"
    call_string = re.sub(r'(.|\n)+\nCall info','Call Info',call_string)
    # Fix Odd names
    name_dict = {FRIEND:FULL FRIEND NAME}
    for name,replacement in name_dict.items():
        call_string = re.sub(f'(\n|.+){name}.+\n',f'\n{replacement}\n',call_string)
        call_dict[call] = call_string
  
    # Correct Multiline Incomin error
    call_string = re.sub(r'\n(I|i)ncomin\n.+\n','\nIncoming ',call_string)
    
    # Replace single character line errors
    call_string = re.sub(r'\n(.|om)\n','\n',call_string)
    
    # Remove \. me line errors
    call_string = re.sub(r'\n\\. me\n','\n',call_string)
    
    # Remove all extra new lines from each scraped call
    call_string = re.sub(r'\n\s*\n','\n',call_string)
    
    print(call)
    print(call_string)
    print("#################################\n#################################")

In [None]:
# Output path for the call table (placeholder value). It has to be assigned
# here: to_csv below needs it, and the only other assignment visible in the
# notebook is in a later cell, so a fresh top-to-bottom run raised NameError.
call_csv = r"PATH"

# Collapse blank lines once more, then turn each call's text into a list of
# its lines so every line becomes one column of the table.
for call, call_string in call_dict.items():
    call_string = re.sub(r'\n\s*\n', '\n', call_string)
    call_string = call_string.splitlines()
    call_dict[call] = call_string

# One row per call, indexed by the call's key.
call_df = pd.DataFrame.from_dict(call_dict, orient='index')
call_df.to_csv(call_csv)


In [None]:
# Path of the cleaned call CSV (placeholder value).
clean_calls_file = call_csv = r"PATH"

clean_calls_df = pd.read_csv(clean_calls_file)

# Parse the timestamps and index by them. set_index moves 'DateTime' out of
# the columns, so it must be applied exactly once; a second call raises
# KeyError because the column no longer exists.
clean_calls_df['DateTime'] = pd.to_datetime(clean_calls_df['DateTime'])
clean_calls_df = clean_calls_df.set_index('DateTime')

# Calendar parts derived from the index.
clean_calls_df['Year'] = clean_calls_df.index.year
clean_calls_df['Month'] = clean_calls_df.index.month
clean_calls_df['Time'] = clean_calls_df.index.time

clean_calls_df.head(2)

In [None]:
# Time of day of each call, taken from the datetime index.
clean_calls_df['Time'] = clean_calls_df.index.time

# Reproducible five-row sample, followed by the column dtypes
# (only the dtypes are rendered, being the last expression).
clean_calls_df.sample(n=5, random_state=0)
clean_calls_df.dtypes


In [None]:
# Treat every 0 in the frame as missing so it is ignored by later aggregates.
clean_calls_df.replace(to_replace=0, value=np.nan, inplace=True)
# clean_calls_df['Hours'==0.0] = np.nan
# clean_calls_df['Minutes'==0.0] = np.nan
# clean_calls_df['Seconds'==0.0] = np.nan
clean_calls_df.head(n=5)

In [None]:
# Keep only the rows whose Caller is "gf".
is_gf = clean_calls_df['Caller'] == "gf"
gfs_calls = clean_calls_df.loc[is_gf]

# Numeric columns to aggregate over time.
data_columns = ['Hours', 'Minutes', 'Seconds', 'CallBytes']
gf_call_metrics = gfs_calls[data_columns]

# Totals per day, week and month.
gf_daily_sum = gf_call_metrics.resample('D').sum()
gf_weekly_sum = gf_call_metrics.resample('W').sum()
gf_monthly_sum = gf_call_metrics.resample('M').sum()

# Averages per week and month.
gf_weekly_mean = gf_call_metrics.resample('W').mean()
gf_monthly_mean = gf_call_metrics.resample('M').mean()

In [None]:
# Mean of the Minutes column across gf's calls.
print(gfs_calls['Minutes'].mean())

# Column totals, the Hours total expressed in days, and the mean of the
# daily Minutes sums.
total_hours, total_minutes, total_seconds = (
    gfs_calls[unit].sum() for unit in ('Hours', 'Minutes', 'Seconds')
)
total_days = total_hours / 24
daily_average = gf_daily_sum['Minutes'].mean()

# One figure per line.
for figure in (total_days, total_hours, total_minutes, total_seconds, daily_average):
    print(figure)

In [None]:
# Monthly Hours totals between the start and end years (label-based slice).
start, end = '2018', '2020'
monthly_hours = gf_monthly_sum.loc[start:end, 'Hours']

fig, ax = plt.subplots()
ax.plot(
    monthly_hours,
    marker='.',
    linestyle='-',
    linewidth=0.5,
    label='Total Monthly Call Length',
)
ax.set_ylabel('Call Duration in Hours')
ax.legend();

In [None]:
# Monthly mean of Minutes between the start and end years (label-based slice).
start, end = '2018', '2020'
monthly_mean_minutes = gf_monthly_mean.loc[start:end, 'Minutes']

fig, ax = plt.subplots()
ax.plot(
    monthly_mean_minutes,
    marker='.',
    markersize=8,
    linestyle='-',
    label='Average Call Length in Minutes',
)
ax.set_ylabel('Call Duration in Minutes')
ax.legend();

In [None]:
# Hour of day (0-23) of every call, taken from the datetime index.
clean_calls_df['hour'] = clean_calls_df.index.hour

# gfs_calls was sliced from clean_calls_df before the 'hour' column was added
# above, so gfs_calls['hour'] raises KeyError. Count the hours straight from
# gfs_calls' own datetime index instead.
busy_hours = gfs_calls.index.hour.value_counts()
busy_hours.sort_index(inplace=True)

# Vertical bars: the hour runs along x and the count along y, so the axis
# labels must be assigned that way round (they were previously swapped).
busy_hours.plot.bar()
plt.xlabel('Hour')
plt.ylabel('# of Calls')
plt.title('# of Calls by Hour of Day')

In [None]:
# Parse the Date column of the full call table.
clean_calls_df['Date'] = pd.to_datetime(clean_calls_df['Date'])

# Bucket gf's calls by month on the datetime index and count the non-null
# Date values in each bucket.
month_group = gfs_calls.groupby(pd.Grouper(freq='M'))
busy_months = month_group['Date'].count()

#month_group.sort_index(inplace=True)

# Draw the monthly counts on a freshly created axes.
fig, ax = plt.subplots()
busy_months.plot.bar(ax=ax)
ax.set_xlabel('Month')
ax.set_ylabel('# of Calls')
ax.set_title('# of Calls by Month')


# fig,ax1 = plt.subplots()
# plt.barh(month_group)
# monthyearFmt = mdates.DateFormatter('%Y %B')
# ax1.xaxis.set_major_formatter(monthyearFmt)