# Reddit post pre-processing for depression detection

In [104]:
import numpy as np
import pandas as pd 
import os
import glob

In [105]:
from preprocess_tweets import normalizeTweet

In [106]:
# Mode A: read in a single csv file
# The data folder defaults to the original local path; set the
# DEPRESSION_DETECTOR_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = os.environ.get(
    "DEPRESSION_DETECTOR_DATA_DIR",
    "C:/Users/Gary/Desktop/Year 1 Sem 2/CS5246 Text Mining/Group Project/depression-detector/data/",
)
# The working directory is switched so that the relative file names used for
# reading here and for the CSV export later both resolve inside the data folder.
os.chdir(DATA_DIR)
file = "reddit_all.csv"
df_orig = pd.read_csv(file)

In [107]:
# # Mode B: read in a folder of csv files
# os.chdir("C:/Users/Gary/Desktop/Year 1 Sem 2/CS5246 Text Mining/Group Project/depression-detector/data/CSV Files Per Label")

# extension = 'csv'
# all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

# # combine all files in the list
# df_orig = pd.concat([pd.read_csv(f) for f in all_filenames ])

In [108]:
# Show the first 5 rows of the raw data as a quick sanity check on the load.
df_orig.head()

Unnamed: 0,title,selftext,subreddit,date,num_comments,score,label
0,Some questions regarding anxiety a...,I am not diagnosed as having ...,Anxiety,2023-02-21 18:45:35,0,1,0
1,i feel so angry and jealous ...,"so, i’m 17, i’ve had anxiety ...",Anxiety,2023-02-21 18:45:23,0,1,0
2,do you guys think it's ok th...,Because of agoraphobia I haven't ...,Anxiety,2023-02-21 18:37:51,0,1,0
3,How to get through performance ...,I (29M) just got dumped by m...,Anxiety,2023-02-21 18:35:17,0,1,0
4,My friend just got jumped.,He's alr now but I don't kno...,Anxiety,2023-02-21 18:31:29,0,1,0


In [109]:
# Summarise the raw data: column names, non-null counts and dtypes.
df_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134181 entries, 0 to 134180
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   title         134181 non-null  object
 1   selftext      127204 non-null  object
 2   subreddit     134181 non-null  object
 3   date          134181 non-null  object
 4   num_comments  134181 non-null  int64 
 5   score         134181 non-null  int64 
 6   label         134181 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 7.2+ MB


In [110]:
# Fall back to the post title when the body carries no usable text: missing
# (NaN), empty / whitespace-only, a '[removed]' or '[deleted]' placeholder, or
# just 'as per title'.
PLACEHOLDER_BODIES = ['[removed]', '[deleted]', '', 'as per title']

# Turn missing bodies into empty strings so every value is a string below.
df_orig['selftext'] = df_orig['selftext'].fillna('')

# Compare on a stripped, lower-cased copy so that variants such as
# ' [Removed] ' or 'As per title' are caught too; the stored text itself is
# not modified by the normalisation.
body_key = df_orig['selftext'].str.strip().str.lower()
df_orig['selftext'] = np.where(body_key.isin(PLACEHOLDER_BODIES), df_orig['title'], df_orig['selftext'])

In [111]:
# Preview the first 5 values of the 'selftext' column.
df_orig['selftext'].head()

0    I    am    not    diagnosed    as    having   ...
1    so,    i’m    17,    i’ve    had    anxiety   ...
2    Because    of    agoraphobia    I    haven't  ...
3    I    (29M)    just    got    dumped    by    m...
4    He's    alr    now    but    I    don't    kno...
Name: selftext, dtype: object

In [112]:
# Extract only the 'selftext' column for use.
# Note: `df` is a pandas Series here, not a DataFrame.
df = df_orig['selftext']

# No rename at this point: Series.rename returns a new object instead of
# renaming in place, and the final column names are assigned when the
# comparison frame is built before export.
df.info()

<class 'pandas.core.series.Series'>
RangeIndex: 134181 entries, 0 to 134180
Series name: selftext
Non-Null Count   Dtype 
--------------   ----- 
134181 non-null  object
dtypes: object(1)
memory usage: 1.0+ MB


In [113]:
# Display `df`; pandas abbreviates the long output to its first and last rows.
display(df)

0         I    am    not    diagnosed    as    having   ...
1         so,    i’m    17,    i’ve    had    anxiety   ...
2         Because    of    agoraphobia    I    haven't  ...
3         I    (29M)    just    got    dumped    by    m...
4         He's    alr    now    but    I    don't    kno...
                                ...                        
134176    i    always    considered    ODing    but    b...
134177    I    don't    know    how    to    celebrate  ...
134178        I    took    some    pills    of    mine  ...
134179    my    psychotic    episodes    are    getting ...
134180    its    almost    everyday    i    feel    like...
Name: selftext, Length: 134181, dtype: object

In [114]:
# #un-labelled data

# #Apply normalizeTweet function to each row
# df = df.apply(normalizeTweet)

# #show first 5 rows of cleaned data
# display(df.head())

# #append original data for comparison
# df = pd.concat([df_orig['selftext'],df], axis=1)

# #rename columns
# df.columns = ['originalreddit', 'cleanedreddit']
# display(df.head())

# #export to csv
# df.to_csv('cleaned_'+file, index=False)

In [115]:
#labelled data

# Apply normalizeTweet to each post. The input is read from
# df_orig['selftext'] rather than from `df`, because `df` is rebound to the
# comparison frame below; this keeps the cell safe to re-run.
cleaned = df_orig['selftext'].apply(normalizeTweet)

#show first 5 rows of cleaned data
display(cleaned.head())

# Put original text, cleaned text and label side by side for comparison
df = pd.concat([df_orig['selftext'], cleaned, df_orig['label']], axis=1)

#rename columns
df.columns = ['originalreddit', 'cleanedreddit', 'label']
display(df.head())

# Export to csv; the relative file name resolves against the current working directory
df.to_csv('cleaned_'+file, index=False)

0    I am not diagnosed as having anxiety but i hav...
1    so im seventeen i have had anxiety from a very...
2    because of agoraphobia I havent worked and don...
3    I 29m just got dumped by my last girlfriend be...
4    hes alr now but I dont know what to do we call...
Name: selftext, dtype: object

Unnamed: 0,originalreddit,cleanedreddit,label
0,I am not diagnosed as having ...,I am not diagnosed as having anxiety but i hav...,0
1,"so, i’m 17, i’ve had anxiety ...",so im seventeen i have had anxiety from a very...,0
2,Because of agoraphobia I haven't ...,because of agoraphobia I havent worked and don...,0
3,I (29M) just got dumped by m...,I 29m just got dumped by my last girlfriend be...,0
4,He's alr now but I don't kno...,hes alr now but I dont know what to do we call...,0


General Pre-Processing Steps:
- Lowercase all text
- Remove newlines
- Remove URLs
- Remove punctuation (for Twitter only)
- Replace 2 or more spaces with 1 space
- Replace any run of 3 or more repeated letters with 2 letters (ex: "soooo" -> "soo")
- Remove all instances of 3 or more consecutive repetitions of the same word (ex: "I love love love love you")
- Remove HTML tags (there are probably none)
- Remove non-ascii characters
- Handling Bullet Points or similar (lists, tables etc.)
- Replace numbers with \<num\> token
- Detect foreign language and remove the entire item


Twitter Specific Pre-Processing Steps:
- Remove RT
- Remove @mentions and usernames for privacy
- Remove hashtags: for hashtags used in querying, remove both the symbol and the word; for hashtags not used in querying, remove only the hashtag symbol and keep the word

Reddit Specific Pre-Processing Steps:
- Only keep English text (Twitter data was already filtered to English, Reddit data wasn't)
- Remove user mentions