# Data Preparation and Selection for comments

In [163]:
import collections
import numpy as np
import pandas as pd
import re
from datetime import datetime, timedelta
import random


from argparse import Namespace

In [164]:
args = Namespace(                                 
    reddit_comment_dataset_csv="reddit_comment_DB.csv", #this is the raw dataset awaiting to be pre-processed
    hardwarezone_comment_dataset_csv = "hwz_all_data.csv",
    output_filtered_csv="filtered_comment_lgbtq.csv",
)

In [165]:
reddit_df = pd.read_csv(args.reddit_comment_dataset_csv,  encoding='latin-1')

reddit_df = reddit_df[['body', 'created_utc']].rename(columns={'body': 'comment', 'created_utc': 'create_utc'})
reddit_df = reddit_df.dropna()
reddit_df

Unnamed: 0,comment,create_utc
0,>While this definitely doesnât amount to for...,1.694425e+09
1,Ohh thanks for the shout out!! ðð,1.694423e+09
2,i think itâs rather ironic that anti-LGBTQ g...,1.694430e+09
3,"As a Christian myself, I find it hilarious how...",1.694434e+09
4,I just took 1 look at the slides and groaned. ...,1.694475e+09
...,...,...
49130,Firing squad,1.585388e+09
49131,They ought to when citizens cannot keep themse...,1.585381e+09
49132,Exactly lol. Age isn't this magical number tha...,1.585378e+09
49133,Unless it becomes law that anyone who is sick ...,1.585394e+09


In [166]:
hwz_df = pd.read_csv(args.hardwarezone_comment_dataset_csv,  encoding='latin-1')
hwz_df = hwz_df[['content', 'time_stamp']].rename(columns={'content': 'comment', 'time_stamp': 'create_utc'})
print(hwz_df.shape)
hwz_df = hwz_df.dropna()
print(hwz_df.shape)

(62094, 2)
(61930, 2)


In [167]:
# Function to convert timestamp to Unix time
def convert_to_unix(timestamp_str):
    try:
        # Try parsing the timestamp with the first format
        timestamp = datetime.strptime(timestamp_str, "%b %d, %Y")
    except:
        # If timestamp_str is None, return None
        start_date = datetime(2024, 2, 4)
        end_date = datetime(2024, 2, 8)

        # Generate a random timedelta within the range
        random_timedelta = random.randint(0, (end_date - start_date).days)

        # Add the random timedelta to the start date
        timestamp = start_date + timedelta(days=random_timedelta)

    # Convert datetime object to Unix timestamp and return
    return int(timestamp.timestamp())

def conv(timestamp):
    return "{:.6e}".format(timestamp)

# Apply the function to the 'create_utc' column
hwz_df['create_utc'] = hwz_df['create_utc'].apply(convert_to_unix)
#hwz_df['create_utc'] = hwz_df['create_utc'].apply(conv)


hwz_df



Unnamed: 0,comment,create_utc
1,"A bizarre spat recently broke out in Shanghai,...",1655827200
2,Will Lee Ang direct this movie?,1655827200
3,I wonder the father turn gay before or after h...,1655827200
4,How the father so power can sex with him if he...,1655827200
5,Need to test all 3 for monkeypox.,1655827200
...,...,...
62089,The irony is that some women are actually usin...,1680537600
62090,"gravity_infinity said:\n\n\nTransgender ""woman...",1680537600
62091,blurredprint said:\nSong boh see how that coac...,1680537600
62092,"MegatonNev said:\nbiologically, we are designe...",1680537600


In [168]:
reddit_df['community'] = 'Reddit'
hwz_df['community'] = 'Hardware Zone'
raw_comments = pd.concat([reddit_df, hwz_df], axis=0)
raw_comments.info

<bound method DataFrame.info of                                                  comment    create_utc  \
0      >While this definitely doesnât amount to for...  1.694425e+09   
1                Ohh thanks for the shout out!! ðð  1.694423e+09   
2      i think itâs rather ironic that anti-LGBTQ g...  1.694430e+09   
3      As a Christian myself, I find it hilarious how...  1.694434e+09   
4      I just took 1 look at the slides and groaned. ...  1.694475e+09   
...                                                  ...           ...   
62089  The irony is that some women are actually usin...  1.680538e+09   
62090  gravity_infinity said:\n\n\nTransgender "woman...  1.680538e+09   
62091  blurredprint said:\nSong boh see how that coac...  1.680538e+09   
62092  MegatonNev said:\nbiologically, we are designe...  1.680538e+09   
62093  LostandFound87 said:\nwhen u cant beat them, j...  1.680538e+09   

           community  
0             Reddit  
1             Reddit  
2         

In [169]:
raw_comments.comment.value_counts()  # full data

comment
[deleted]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              2225
[removed]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               637
F       

In [170]:
# to remove noise from raw data sets [deleted] [removed] F Yes
raw_comments = raw_comments.drop(raw_comments[raw_comments['comment']=='[deleted]'].index)
raw_comments = raw_comments.drop(raw_comments[raw_comments['comment']=='[removed]'].index)
raw_comments = raw_comments.drop(raw_comments[raw_comments['comment']=='F'].index)
raw_comments = raw_comments.drop(raw_comments[raw_comments['comment']=='Yes'].index)

In [171]:
raw_comments.comment.value_counts()

comment
Facebook links are not allowed on this subreddit.\n\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/singapore) if you have any questions or concerns.*                                                                                                                                                                                                                    17
#NAME?                                                                                                                                                                                                                                                                                                                                                                                                                                                         12
Yes.                                                                                        

In [172]:
# Preprocess the tweets
def preprocess_text(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r"@\w+", r"", text)             # remove @user_name
    text = re.sub(r"([.,!?])", r" \1 ", text)     # E.g., convert "end." to "end . "
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)  # replace special character strings with empty string
    return text
    
raw_comments.comment = raw_comments.comment.apply(preprocess_text)

In [173]:
raw_comments.head()

Unnamed: 0,comment,create_utc,community
0,while this definitely doesn t amount to forei...,1694425000.0,Reddit
1,ohh thanks for the shout out ! !,1694423000.0,Reddit
2,i think it s rather ironic that anti lgbtq gro...,1694430000.0,Reddit
3,"as a christian myself , i find it hilarious ho...",1694434000.0,Reddit
4,i just took look at the slides and groaned . w...,1694475000.0,Reddit


In [174]:
from datetime import datetime

def preprocess_timestamp(timestamp):
    # Convert string to integer
    timestamp = int(timestamp)
    # Convert Unix timestamp to datetime object
    return datetime.fromtimestamp(timestamp)

# Assuming 'create_utc' is a column in the 'raw_comments' DataFrame
raw_comments['create_utc'] = raw_comments['create_utc'].apply(preprocess_timestamp)


In [175]:
raw_comments.head()

Unnamed: 0,comment,create_utc,community
0,while this definitely doesn t amount to forei...,2023-09-11 17:38:49,Reddit
1,ohh thanks for the shout out ! !,2023-09-11 17:00:14,Reddit
2,i think it s rather ironic that anti lgbtq gro...,2023-09-11 18:56:33,Reddit
3,"as a christian myself , i find it hilarious ho...",2023-09-11 20:04:08,Reddit
4,i just took look at the slides and groaned . w...,2023-09-12 07:33:28,Reddit


In [176]:
# df to csv file with header
raw_comments.to_csv(args.output_filtered_csv, index=False, header=True) 

In [177]:
# to read the filtered csv file
df = pd.read_csv(args.output_filtered_csv, parse_dates=['create_utc'])

# Ensure the column is in datetime format
df['create_utc'] = pd.to_datetime(df['create_utc'], errors='coerce')

# Find the minimum and maximum dates in the dataframe
min_date = df['create_utc'].min()
max_date = df['create_utc'].max()

# Extract years from the dates
min_year = min_date.year
max_year = max_date.year

print(f"The data range is from {min_year} to {max_year} in terms of years.")


The data range is from 2012 to 2024 in terms of years.
