In [1]:
import pandas as pd
import os
import re
from datetime import datetime, timedelta
from dateutil.relativedelta import *

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [9]:
current_wd  = "C:\\Users\\joann\\OneDrive\\Desktop\\My Files\\Github\\NUSWhispers Scrapper\\nuswhispers-explorer"

In [10]:
df = pd.read_csv(current_wd + "\\data\\final.csv")
df_categories = pd.read_csv(current_wd + "\\data\\categories.csv")

In [11]:
df["Reference"] = df["Reference"].apply(lambda x: "#" + str(x))

In [12]:
df_categories.head()

Unnamed: 0,Reference,Category,Date
0,#101093,['Advice'],8 hours ago
1,#101090,['Rant'],8 hours ago
2,#101089,['Advice'],8 hours ago
3,#101088,"['Romance', 'Rant']",8 hours ago
4,#101086,['Rant'],8 hours ago


In [13]:
df.shape

(240, 5)

In [14]:
df_full = pd.merge(df, df_categories, left_on="Reference", right_on="Reference", how="left")

# Reaction Clean Up

In [15]:
df_full["Reaction"] = df_full["Reaction"].apply(lambda x: x.replace("[", ""))
df_full["Reaction"] = df_full["Reaction"].apply(lambda x: x.replace("]", ""))
df_full["Reaction"] = df_full["Reaction"].apply(lambda x: x.replace('"', ''))
df_full["Reaction"] = df_full["Reaction"].apply(lambda x: x.replace("'", ""))

In [16]:
df_full["Reaction_Category"] = df_full['Reaction']

In [17]:
df_full["Reaction_Category"] = df_full["Reaction_Category"].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))
df_full["Reaction_Category"] = df_full["Reaction_Category"].apply(lambda x: x.replace("people", ""))
df_full["Reaction_Category"] = df_full["Reaction_Category"].apply(lambda x: x.replace("person", ""))
df_full["Reaction_Category"] = df_full["Reaction_Category"].apply(lambda x: x.replace(":", ""))
df_full["Reaction_Category"] = df_full["Reaction_Category"].apply(lambda x: x.replace(" ", ""))
df_full["Reaction_Category"] = df_full["Reaction_Category"].apply(lambda x: x.replace(".K", ""))

In [18]:
reactions = df_full['Reaction_Category'].str.split(",", expand=True)

In [19]:
df_full_clean = df_full.copy()
df_full_clean = df_full_clean.loc[:,["Reference"]]

In [20]:
df_full_clean = df_full_clean.join(reactions)

In [21]:
df_full_clean.head()

Unnamed: 0,Reference,0,1,2
0,#101093,Sad,Like,
1,#101090,Haha,Like,Care
2,#101089,Haha,Like,
3,#101088,Like,Sad,
4,#101086,Haha,Like,


In [22]:
stacked = df_full_clean.melt('Reference', value_name='Count').drop("variable", axis=1)

In [23]:
stacked.head()

Unnamed: 0,Reference,Count
0,#101093,Sad
1,#101090,Haha
2,#101089,Haha
3,#101088,Like
4,#101086,Haha


In [24]:
counts = stacked.groupby('Reference')['Count'].value_counts()

In [25]:
result = counts.unstack(level=-1).fillna(0).astype(int)

In [26]:
result = result.reset_index()

In [27]:
df_full_reaction = pd.merge(df_full, result, left_on="Reference", right_on="Reference", how="left")

In [28]:
df_full_reaction = df_full_reaction.rename(columns={"NoReactions": "No Reactions"})

In [29]:
def get_reactions(data, category):
    status = []
    for i in range(0, len(data)):
        if (data[category][i] == 1):
            if (category != "No Reactions"):
                name = '{}: (\d+)'.format(category)
                status.append(re.search(name, data["Reaction"][i]).group(1))
            else:
                status.append("0")
        else:
            status.append("0")
            
    return status

In [30]:
df_full_reaction["Angry_Reactions"] = get_reactions(df_full_reaction, "Angry")
df_full_reaction["Care_Reactions"] = get_reactions(df_full_reaction, "Care")
df_full_reaction["Haha_Reactions"] = get_reactions(df_full_reaction, "Haha")
df_full_reaction["Like_Reactions"] = get_reactions(df_full_reaction, "Like")
df_full_reaction["Love_Reactions"] = get_reactions(df_full_reaction, "Love")
df_full_reaction["Sad_Reactions"] = get_reactions(df_full_reaction, "Sad")
df_full_reaction["Wow_Reactions"] = get_reactions(df_full_reaction, "Wow")
df_full_reaction["No_Reactions"] = get_reactions(df_full_reaction, "No Reactions")

In [31]:
df_full_reaction_clean = df_full_reaction.loc[:,['Reference', 'Content', 'Reaction','Comment', 'Share', 'Category',
       'Date', 'Angry_Reactions', 'Care_Reactions',
       'Haha_Reactions', 'Like_Reactions', 'Love_Reactions', 'Sad_Reactions',
       'Wow_Reactions', 'No Reactions']]

# Category Clean Up

In [32]:
def removing_leading_whitespaces(text):
     return re.sub(r"^\s+","",text)

In [33]:
df_full_reaction_clean["Category"] = df_full_reaction_clean["Category"].apply(lambda x: x.replace("[", ""))
df_full_reaction_clean["Category"] = df_full_reaction_clean["Category"].apply(lambda x: x.replace("]", ""))
df_full_reaction_clean["Category"] = df_full_reaction_clean["Category"].apply(lambda x: x.replace("'", ""))
df_full_reaction_clean["Category"] = df_full_reaction_clean["Category"].apply(lambda x: x.replace(" ", ""))
df_full_reaction_clean["Category"] = df_full_reaction_clean["Category"].apply(lambda x: x.replace("AskProfBen", "Ask Prof Ben"))
df_full_reaction_clean["Category"] = df_full_reaction_clean["Category"].apply(lambda x: x.replace("LostandFound", "Lost and Found"))

In [34]:
df_full_reaction_clean["Category_Clean"] = df_full_reaction_clean['Category']

In [35]:
category = df_full_reaction_clean['Category_Clean'].str.split(",", expand=True)

In [36]:
df_full_reaction_clean_category = df_full_reaction_clean.copy()
df_full_reaction_clean_category = df_full_reaction_clean_category.loc[:,["Reference"]]

In [37]:
df_full_reaction_clean_category = df_full_reaction_clean_category.join(category)

In [38]:
df_full_reaction_clean_category.head()

Unnamed: 0,Reference,0,1,2,3,4,5
0,#101093,Advice,,,,,
1,#101090,Rant,,,,,
2,#101089,Advice,,,,,
3,#101088,Romance,Rant,,,,
4,#101086,Rant,,,,,


In [39]:
stacked = df_full_reaction_clean_category.melt('Reference', value_name='Count').drop("variable", axis=1)

In [40]:
stacked["Count"].value_counts()

Advice            130
Rant               95
Romance            50
                   31
Ask Prof Ben       21
Nostalgia          13
Funny               7
Lost and Found      6
Name: Count, dtype: int64

In [41]:
counts = stacked.groupby('Reference')['Count'].value_counts()

In [42]:
counts

Reference  Count         
#100643    Rant              1
#100646    Ask Prof Ben      1
#100649    Advice            1
           Romance           1
#100650    Advice            1
           Rant              1
           Romance           1
#100653    Rant              1
           Romance           1
#100654    Advice            1
#100658    Advice            1
           Rant              1
#100659    Advice            1
           Romance           1
#100662    Rant              1
#100663    Ask Prof Ben      1
#100664    Advice            1
#100665    Advice            1
           Rant              1
           Romance           1
#100668    Advice            1
#100669    Advice            1
           Rant              1
#100670                      1
#100671    Rant              1
           Romance           1
#100672    Nostalgia         1
           Rant              1
           Romance           1
#100673                      1
#100675    Advice            1
#100677    Ra

In [43]:
result = counts.unstack(level=-1).fillna(0).astype(int)

In [44]:
result = result.reset_index()

In [45]:
df_full_reaction_clean.shape

(240, 16)

In [46]:
df_full_reaction_category = pd.merge(df_full_reaction_clean, result, left_on="Reference", right_on="Reference", how="left")

In [47]:
df_full_reaction_category.head()

Unnamed: 0,Reference,Content,Reaction,Comment,Share,Category,Date,Angry_Reactions,Care_Reactions,Haha_Reactions,Like_Reactions,Love_Reactions,Sad_Reactions,Wow_Reactions,No Reactions,Category_Clean,Unnamed: 17,Advice,Ask Prof Ben,Funny,Lost and Found,Nostalgia,Rant,Romance
0,#101093,tldr: what to do with my father? basically the...,"Sad: 3 people, Like: 1 person",0 Comments,4 comments,Advice,8 hours ago,0,0,0,1,0,3,0,0,Advice,0,1,0,0,0,0,0,0
1,#101090,Just a rant. Is it true that guys discuss thei...,"Haha: 28 people, Like: 21 people, Care: 8 people",56 comments,2 shares,Rant,8 hours ago,0,8,28,21,0,0,0,0,Rant,0,0,0,0,0,0,1,0
2,#101089,I recently matched with a guy who lives at Pas...,"Haha: 214 people, Like: 50 people",175 comments,56 shares,Advice,8 hours ago,0,0,214,50,0,0,0,0,Advice,0,1,0,0,0,0,0,0
3,#101088,I have been feeling uneasy for the past few da...,"Like: 5 people, Sad: 2 people",12 comments,2 shares,"Romance,Rant",8 hours ago,0,0,0,5,0,2,0,0,"Romance,Rant",0,0,0,0,0,0,1,1
4,#101086,"After reading many posts here, it's come to my...","Haha: 20 people, Like: 16 people",53 comments,5 shares,Rant,8 hours ago,0,0,20,16,0,0,0,0,Rant,0,0,0,0,0,0,1,0


In [49]:
df_full_reaction_category.to_csv(current_wd + "\\data\\full_data.csv", index=0)

# Dates Clean up

In [50]:
df_full_reaction_category["Date"].value_counts()

a month ago    28
4 days ago     18
6 days ago     17
a day ago      12
2 days ago     12
19 days ago    12
3 days ago     10
8 hours ago     9
7 days ago      9
8 days ago      8
9 days ago      8
10 days ago     8
25 days ago     7
21 days ago     7
5 days ago      7
18 days ago     7
17 days ago     7
14 days ago     7
11 days ago     7
16 days ago     6
20 days ago     6
23 days ago     6
24 days ago     6
13 days ago     5
15 days ago     4
12 days ago     4
22 days ago     3
Name: Date, dtype: int64

In [51]:
date = datetime.now()
date

datetime.datetime(2022, 1, 5, 16, 19, 14, 844070)

In [52]:
date - relativedelta(days =+ 3)

datetime.datetime(2022, 1, 2, 16, 19, 14, 844070)

In [53]:
df_full_reaction_category["Date"][0].split(" ")

['8', 'hours', 'ago']

In [54]:
df_full_reaction_category["Date_Length"] = df_full_reaction_category["Date"].apply(lambda x: x.split(" ")[1])
df_full_reaction_category["Date_Count"] = df_full_reaction_category["Date"].apply(lambda x: x.split(" ")[0])

In [55]:
df_full_reaction_category.tail(20)

Unnamed: 0,Reference,Content,Reaction,Comment,Share,Category,Date,Angry_Reactions,Care_Reactions,Haha_Reactions,Like_Reactions,Love_Reactions,Sad_Reactions,Wow_Reactions,No Reactions,Category_Clean,Unnamed: 17,Advice,Ask Prof Ben,Funny,Lost and Found,Nostalgia,Rant,Romance,Date_Length,Date_Count
220,#100681,"Hi, I need advice. Is it anything alarming not...","Like: 38 people, Haha: 13 people, Care: 7 people",88 comments,15 shares,Advice,a month ago,0,7,13,38,0,0,0,0,Advice,0,1,0,0,0,0,0,0,month,a
221,#100677,Even though wfh allows some to bond with their...,Like: 35 people,8 comments,1 share,Rant,a month ago,0,0,0,35,0,0,0,0,Rant,0,0,0,0,0,0,1,0,month,a
222,#100675,"Hi parents, I want to know how much allowance ...","Like: 35 people, Haha: 8 people, Care: 3 people",77 comments,13 shares,Advice,a month ago,0,3,8,35,0,0,0,0,Advice,0,1,0,0,0,0,0,0,month,a
223,#100672,I just really needed to get this off my chest ...,"Like: 22 people, Care: 15 people, Sad: 5 people",21 comments,3 shares,"Romance,Rant,Nostalgia",a month ago,0,15,0,22,0,5,0,0,"Romance,Rant,Nostalgia",0,0,0,0,0,1,1,1,month,a
224,#100669,I posted this somewhere else but I need more s...,"Like: 11 people, Haha: 7 people, Care: 2 people",27 comments,5 shares,"Rant,Advice",a month ago,0,2,7,11,0,0,0,0,"Rant,Advice",0,1,0,0,0,0,1,0,month,a
225,#100671,"I am starting to find dating boring, tiring an...","Like: 100 people, Care: 27 people, Haha: 23 pe...",80 comments,13 shares,"Romance,Rant",a month ago,0,27,23,100,0,0,0,0,"Romance,Rant",0,0,0,0,0,0,1,1,month,a
226,#100670,My boyfriend is bi and his exes were all guys....,"Haha: 142 people, Like: 99 people",156 comments,108 shares,,a month ago,0,0,142,99,0,0,0,0,,1,0,0,0,0,0,0,0,month,a
227,#100668,Got offered a job as a contracted therapist at...,"Like: 11 people, Haha: 5 people, Care: 2 people",27 comments,4 shares,Advice,a month ago,0,2,5,11,0,0,0,0,Advice,0,1,0,0,0,0,0,0,month,a
228,#100665,My bf and I just broken up 2 months ago and I ...,"Like: 154 people, Care: 67 people, Sad: 49 people",110 comments,67 shares,"Romance,Rant,Advice",a month ago,0,67,0,154,0,49,0,0,"Romance,Rant,Advice",0,1,0,0,0,0,1,1,month,a
229,#100664,"Hello everyone here, first time posting and I ...","Haha: 59 people, Like: 34 people",76 comments,6 shares,Advice,a month ago,0,0,59,34,0,0,0,0,Advice,0,1,0,0,0,0,0,0,month,a


In [56]:
df_full_reaction_category["Date_Count"] = df_full_reaction_category["Date_Count"].apply(lambda x: x.replace("a", "1"))

In [57]:
def get_date(x):
    date = datetime(2021, 7, 17)
    count = int(x["Date_Count"])
    
    if ("days" in x["Date_Length"]):
        original_date = date - relativedelta(days =+ count)
    else:
        original_date = date - relativedelta(months =+ count)
    return original_date

In [58]:
df_full_reaction_category["Date_Clean"] = df_full_reaction_category.apply(get_date, axis=1)

# Comment and Share clean

In [59]:
df_full_reaction_category["Comment"] = df_full_reaction_category["Comment"].apply(lambda x: x.replace("Comments", ""))
df_full_reaction_category["Comment"] = df_full_reaction_category["Comment"].apply(lambda x: x.replace("Comment", ""))
df_full_reaction_category["Share"] = df_full_reaction_category["Share"].apply(lambda x: x.replace("Shares", ""))
df_full_reaction_category["Share"] = df_full_reaction_category["Share"].apply(lambda x: x.replace("Share", ""))

In [60]:
df_full_reaction_category["Share"] = df_full_reaction_category["Share"].apply(lambda x: x.replace("Comment", ""))

In [63]:
def clean_share(x):
    if ("K" in x["Share"]):
        number = x["Share"].replace("K", "")
        number = ' '.join(number.split())
        number = float(number)
        status = number*1000
    else:
        status = x["Share"]
    return status
        

In [64]:
df_full_reaction_category["Share_Clean"] = df_full_reaction_category.apply(clean_share, axis=1)

In [65]:
df_full_reaction_category

Unnamed: 0,Reference,Content,Reaction,Comment,Share,Category,Date,Angry_Reactions,Care_Reactions,Haha_Reactions,Like_Reactions,Love_Reactions,Sad_Reactions,Wow_Reactions,No Reactions,Category_Clean,Unnamed: 17,Advice,Ask Prof Ben,Funny,Lost and Found,Nostalgia,Rant,Romance,Date_Length,Date_Count,Date_Clean,Share_Clean
0,#101093,tldr: what to do with my father? basically the...,"Sad: 3 people, Like: 1 person",0,4 comments,Advice,8 hours ago,0,0,0,1,0,3,0,0,Advice,0,1,0,0,0,0,0,0,hours,8,2020-11-17,4 comments
1,#101090,Just a rant. Is it true that guys discuss thei...,"Haha: 28 people, Like: 21 people, Care: 8 people",56 comments,2 shares,Rant,8 hours ago,0,8,28,21,0,0,0,0,Rant,0,0,0,0,0,0,1,0,hours,8,2020-11-17,2 shares
2,#101089,I recently matched with a guy who lives at Pas...,"Haha: 214 people, Like: 50 people",175 comments,56 shares,Advice,8 hours ago,0,0,214,50,0,0,0,0,Advice,0,1,0,0,0,0,0,0,hours,8,2020-11-17,56 shares
3,#101088,I have been feeling uneasy for the past few da...,"Like: 5 people, Sad: 2 people",12 comments,2 shares,"Romance,Rant",8 hours ago,0,0,0,5,0,2,0,0,"Romance,Rant",0,0,0,0,0,0,1,1,hours,8,2020-11-17,2 shares
4,#101086,"After reading many posts here, it's come to my...","Haha: 20 people, Like: 16 people",53 comments,5 shares,Rant,8 hours ago,0,0,20,16,0,0,0,0,Rant,0,0,0,0,0,0,1,0,hours,8,2020-11-17,5 shares
5,#101084,I have been in my current job for about 1.5 ye...,Like: 16 people,0,5 comments,"Rant,Advice",8 hours ago,0,0,0,16,0,0,0,0,"Rant,Advice",0,1,0,0,0,0,1,0,hours,8,2020-11-17,5 comments
6,#101073,Hello everyone I’m just so sad and I need a pl...,"Like: 8 people, Care: 6 people, Sad: 6 people",15 comments,2 shares,"Rant,Advice",8 hours ago,0,6,0,8,0,6,0,0,"Rant,Advice",0,1,0,0,0,0,1,0,hours,8,2020-11-17,2 shares
7,#101071,I only feel romantic but not family love. How?...,"Like: 4 people, Care: 1 person",0,10 comments,Rant,8 hours ago,0,1,0,4,0,0,0,0,Rant,0,0,0,0,0,0,1,0,hours,8,2020-11-17,10 comments
8,#101068,TLDR Story: Invested 2 years my life in the gi...,"Like: 11 people, Haha: 1 person, Sad: 1 person",16 comments,2 shares,Advice,8 hours ago,0,0,1,11,0,1,0,0,Advice,0,1,0,0,0,0,0,0,hours,8,2020-11-17,2 shares
9,#101076,I hope to find a girl (70-75 kg+) to build rel...,"Haha: 42 people, Like: 18 people",40 comments,10 shares,Romance,a day ago,0,0,42,18,0,0,0,0,Romance,0,0,0,0,0,0,0,1,day,1,2021-06-17,10 shares


In [66]:
df_full_reaction_category.columns

Index(['Reference', 'Content', 'Reaction', 'Comment', 'Share', 'Category',
       'Date', 'Angry_Reactions', 'Care_Reactions', 'Haha_Reactions',
       'Like_Reactions', 'Love_Reactions', 'Sad_Reactions', 'Wow_Reactions',
       'No Reactions', 'Category_Clean', '', 'Advice', 'Ask Prof Ben', 'Funny',
       'Lost and Found', 'Nostalgia', 'Rant', 'Romance', 'Date_Length',
       'Date_Count', 'Date_Clean', 'Share_Clean'],
      dtype='object')

In [67]:
final = df_full_reaction_category.loc[:,['Reference', 'Date_Clean','Content', 'Comment', 'Share_Clean',
                                         'Angry_Reactions', 'Care_Reactions', 'Haha_Reactions',
                                       'Like_Reactions', 'Love_Reactions', 'Sad_Reactions', 'Wow_Reactions',
                                       'No Reactions', '', 'Advice', 'Ask Prof Ben', 'Funny',
       'Lost and Found', 'Nostalgia', 'Rant', 'Romance']]

In [68]:
final = final.rename(columns={'': 'No Category',
                              'Angry_Reactions':'Angry',
                              'Care_Reactions':'Care',
                              'Haha_Reactions':'Haha',
                              'Like_Reactions':'Like',
                              'Love_Reactions':'Love',
                              'Sad_Reactions':'Sad',
                              'Wow_Reactions':'Wow',
                              'Share_Clean': 'Share'})

In [69]:
final.to_csv(current_wd + "\\data\\full_data.csv", index=0)