### 1. Default Setting

In [1]:
import json
import os 
import pandas as pd 
import nltk
import re
import numpy as np
import matplotlib.pyplot as plt 
import fasttext
from nltk.tokenize import word_tokenize
from pprint import pprint

In [2]:
default_path = '/home/lamda_00/Data-related'
subreddit = 'Depression'
# subreddit_id = 't5_2qqqf'   # depression
data_path = os.path.join(default_path, 'data', 'Reddit_origin')
save_path = os.path.join(default_path, 'data', 'Reddit_processed')
model_path = os.path.join(default_path, 'model')

In [3]:
# rs_file = 'dep_rs_2016.csv'
rc_file = 'dep_rc_2017.csv'
# rs_file2 = 'drs_2016.csv'
rc_file2 = 'rc_2017.csv'

In [None]:
reddit = [file for file in self.file_list if file.startswith(reddit_type) and year in file]
print(reddit)
for month in reddit: 
    reddit_file = open(os.path.join(self.data_path, month), 'r')
    reddit_data = reddit_file.readlines()

    print(f'{month}월, 데이터 개수: {len(reddit_data)}')
    for reddit_content in reddit_data:
        try:
            reddit_json = json.loads(reddit_content)
            # 간혹 json file의 key 값에 reddit, reddit_id가 없는 경우 존재 (외부 게시글)
            if reddit_type.lower() == 'rs':
                assert reddit_json['subreddit'], reddit_json['selftext']
            elif reddit_type.lower() == 'rc':
                assert reddit_json['subreddit'], reddit_json['comment']
        except:
            error_list.append(reddit_content)
            continue 

### 2. Define Class

In [4]:
class RedditData():
    def __init__(self, subreddit, data_path, save_path):
        self.subreddit = subreddit
        self.data_path = data_path
        self.save_path = save_path
        self.file_list = os.listdir(self.data_path)
        self.rs_dtype = {'id': str, 'subreddit': str, 'author': str, 'title': str, 'context': str, 'url': str}
        self.rc_dtype = {'id': str, 'subreddit': str, 'author': str, 'comment': str}
        
    def check_condition(self, json_file, reddit_type):
        '''
        json_file에서 추출하고 싶은 조건 작성하는 함수
        조건 만족 시 True 반환 (reddit_type: rs or rc)
        '''
        assert reddit_type in ['rs','rc','RS','RC']
        if reddit_type.lower() == 'rs':
            if json_file['subreddit'] == self.subreddit and (json_file['selftext'] != '[deleted]' and 'http' not in \
                json_file['selftext'] and json_file['selftext'] != '' and 'cross post' not in json_file['title']):
                return True 
            return False
        
        elif reddit_type.lower() == 'rc':
            if json_file['subreddit'] == self.subreddit and json_file['comment'] != '[deleted]':
                return True 
            return False 
            
    def extract_info(self, year, reddit_type):
        assert reddit_type in ['rs','rc','RS','RC']
        
        reddit_info = []
        error_list = []
        
        reddit = [file for file in self.file_list if file.startswith(reddit_type) and year in file]
        print(reddit)
        for month in reddit: 
            reddit_file = open(os.path.join(self.data_path, month), 'r')
            reddit_data = reddit_file.readlines()
            
            print(f'{month}월, 데이터 개수: {len(reddit_data)}')
            for reddit_content in reddit_data:
                try:
                    reddit_json = json.loads(reddit_content)
                    # 간혹 json file의 key 값에 reddit, reddit_id가 없는 경우 존재 (외부 게시글)
                    if reddit_type.lower() == 'rs':
                        assert reddit_json['subreddit'], reddit_json['selftext']
                    elif reddit_type.lower() == 'rc':
                        assert reddit_json['subreddit'], reddit_json['comment']
                except:
                    error_list.append(reddit_content)
                    continue 
                    
                if self.check_condition(reddit_json, reddit_type):
                    reddit_info.append(reddit_json)
        
        print(reddit_info)
        print(f'{len(error_list)}개의 파일 로드에 실패하였습니다.')
        return reddit_info, error_list
        
    def convert_to_df(self, reddit_info, reddit_type):
        reddit_df = []
        columns = []
        errs = []
        for info in reddit_info:
            if reddit_type == 'RS':
                columns = ['id', 'subreddit', 'author', 'title', 'context', 'comments', 'score', 'url', 'created_utc']
                try:
                    reddit_df.append([str(info['id']), self.subreddit, info['author'], info['title'], info['selftext'], \
                                      info['num_comments'], info['score'], info['url'], info['created_utc']])
                except:
                    errs.append(info)
                    continue
                
            elif reddit_type == 'RC':   # 2016년도에는 comment -> body로 변경됨, downs 없어짐 
                columns = ['id', 'subreddit', 'author', 'body', 'ups', 'downs', 'score', 'created_utc']
                try:
                    reddit_df.append([str(info['link_id']), self.subreddit, info['author'], info['comment'], info['ups'], \
                                      info['downs'], info['score'], info['created_utc']]) 
                except:
                    errs.append(info)
                    continue       
        
        print(f'변환 과정 중 {len(errs)}개의 파일 오류 발생')
        reddit_df = pd.DataFrame(reddit_df, columns=columns)            
        return reddit_df
    
    def load_data(self, file_name, reddit_type):
        assert reddit_type in ['rs', 'rc', 'RS', 'RC']
        reddit_df = pd.read_csv(os.path.join(self.save_path, file_name), \
                                dtype=self.rs_dtype if reddit_type.lower=='rs' else self.rc_dtype, engine='python')
        return reddit_df
    
    def check_df(self, reddit_df):
        print(f'전체 데이터 개수: {len(reddit_df)}\n')
        print(f'결측값 개수: \n{reddit_df.isna().sum()}\n')
        print(f'중복 데이터 개수: {reddit_df.duplicated().sum()}')
        # for column in reddit_df.columns:
        #    print(f'{column} 타입: {pd.api.types.infer_dtype(reddit_df[column])}')
        
    def save_df(self, reddit_df, filename):
        reddit_df.to_csv(os.path.join(self.save_path, filename), index=None)

In [5]:
class RedditProcessor():
    def __init__(self, data_path, save_path):
        self.data_path = data_path
        self.save_path = save_path
    
    def drop_na(self, reddit_df, reddit_type, na_type):
        '''
        na_type: 'na' (None object) or 'nc' ('')
        '''
        assert na_type in ['na','nc']
        assert reddit_type in ['rs', 'rc', 'RS', 'RC']
        
        if na_type == 'na':
            reddit_df.dropna(inplace=True)
        else:
            if reddit_type.lower() == 'rs':
                reddit_df = reddit_df[reddit_df.title != '']
                reddit_df = reddit_df[reddit_df.context != '']
            else:
                reddit_df = reddit_df[reddit_df.body != '']
        reddit_df.reset_index(inplace=True, drop=True)
        return reddit_df 
    
    def drop_duplicates(self, reddit_df):
        reddit_df.drop_duplicates(keep='first', inplace=True)
        reddit_df.reset_index(inplace=True, drop=True)
        return reddit_df

    def cleanse_text(self, text):
        if isinstance(text, float):
            return []
        
        text = text.lower()   # lower case
        text = re.sub(r"http\S*|\S*\.com\S*|\S*www\S*", " ", text)    # delete url 
        text = re.sub(r"\s@\S+", " ", text)   # delete @mentions
        text = re.sub(r"\s+", " ", text)   # replace all whitespace with a single space
        text = text.strip()    # strip off spaces on either end
        
        if len(text) < 6: 
            return ''
        return text
    
    def get_token_list(self, reddit_df, reddit_type):
        assert reddit_type in ['RS', 'RC', 'rs', 'rc']
        if reddit_type.lower() == 'rs':
            return [word_tokenize(context) for context in reddit_df.context.values.tolist()]
        else:   # 2016  -> comment -> body
            return [word_tokenize(context) for context in reddit_df.body.values.tolist()]
    
    def set_max_tok(self, reddit_df, reddit_type, max_len):
        assert reddit_type in ['RS', 'RC', 'rs', 'rc']
        tok_list = self.get_token_list(reddit_df, reddit_type)
        idx_list = [idx for idx in range(len(tok_list)) if len(tok_list[idx]) <= max_len]
        reddit_df = reddit_df.loc[idx_list]
        reddit_df.reset_index(inplace=True, drop=True)
        return reddit_df

    def map_rc_rs(self, reddit_rs, reddit_rc):
        '''
        submission이 존재하지 않는 comment 제거 후, reddit_rc 반환하는 함수 
        '''
        id_list = reddit_rs.id.unique().tolist()
        id_list = ['t3_' + id for id in id_list]
        reddit_rc = reddit_rc[reddit_rc.id.isin(id_list)]
        reddit_rc.reset_index(drop=True, inplace=True)
        return reddit_rc

In [6]:
class RedditVisual():
    pass

In [7]:
subreddit

'Depression'

In [8]:
reddit = RedditData(subreddit, data_path, save_path)

In [9]:
rs, rs_err = reddit.extract_info('2017', 'RS')

['RS_2016-02', 'RS_2016-04', 'RS_2016-05', 'RS_2016-07', 'RS_2016-08', 'RS_2016-10', 'RS_2016-12', 'RS_2016-11', 'RS_2016-01', 'RS_2016-03', 'RS_2016-06', 'RS_2016-09']
RS_2016-02월, 데이터 개수: 7146932
RS_2016-04월, 데이터 개수: 6922510
RS_2016-05월, 데이터 개수: 7277166
RS_2016-07월, 데이터 개수: 7369835
RS_2016-08월, 데이터 개수: 7591689
RS_2016-10월, 데이터 개수: 8286759
RS_2016-12월, 데이터 개수: 8921474
RS_2016-11월, 데이터 개수: 8660144
RS_2016-01월, 데이터 개수: 6877510
RS_2016-03월, 데이터 개수: 6901239
RS_2016-06월, 데이터 개수: 7188762
RS_2016-09월, 데이터 개수: 7437862
78183개의 파일 로드에 실패하였습니다.


In [11]:
len(rs), len(rs_err)

NameError: name 'rs' is not defined

In [9]:
rc, rc_err = reddit.extract_info('2017', 'RC')

['RC_2017-01']
RC_2017-01월, 데이터 개수: 78946585
0개의 파일 로드에 실패하였습니다.


In [17]:
rc

[]

In [14]:
import pickle 

with open(os.path.join(save_path, '2016_rc.pickle'), 'wb') as f:
    pickle.dump(rc, f, pickle.HIGHEST_PROTOCOL)

In [10]:
rs_df = reddit.convert_to_df(rs, 'RS')

변환 과정 중 0개의 파일 오류 발생


In [11]:
rs_df

Unnamed: 0,id,subreddit,author,title,context,comments,score,url,created_utc
0,43ltsn,SuicideWatch,Pontonpersonal,I have no regard for my safety and may end up ...,Recently I have been sneaking out of the house...,4,1,https://www.reddit.com/r/SuicideWatch/comments...,1454286303
1,43luaa,SuicideWatch,throwaway122457,Hoping this helps,"I was going to send this to a friend, but I co...",3,5,https://www.reddit.com/r/SuicideWatch/comments...,1454286476
2,43lue8,SuicideWatch,evangregory1492,The straw that broke...,So i lost my job this week. It was a shit job ...,1,3,https://www.reddit.com/r/SuicideWatch/comments...,1454286520
3,43lvw7,SuicideWatch,TheLastSunriseD,Killing myself tonight,I dropped out of school because of social anxi...,32,39,https://www.reddit.com/r/SuicideWatch/comments...,1454287072
4,43lwgh,SuicideWatch,narcolepsythrowaway1,No Light at the End of the Tunnel,Let me start off by saying that my considerati...,1,1,https://www.reddit.com/r/SuicideWatch/comments...,1454287304
...,...,...,...,...,...,...,...,...,...
22828,55am7j,SuicideWatch,curiousyoungbonobo,Fuck,I've been going through some shit for years an...,1,6,https://www.reddit.com/r/SuicideWatch/comments...,1475276892
22829,55amry,SuicideWatch,love_is_free,I hope to be dead by this time tomorrow.,I'm a 20-year-old girl who's given up on life....,2,1,https://www.reddit.com/r/SuicideWatch/comments...,1475277132
22830,55an48,SuicideWatch,Plsanswerawkwardporn,Why won't anyone help me?,I can't ask for help because I'm bad at everyt...,5,4,https://www.reddit.com/r/SuicideWatch/comments...,1475277276
22831,55aq62,SuicideWatch,CanderousOrdinance,Life after the psych ward,So I spent most of last week in the hospital b...,8,10,https://www.reddit.com/r/SuicideWatch/comments...,1475278520


In [12]:
reddit.save_df(rs_df, rs_file)

In [None]:
rs_df = reddit.convert_to_df(rs, 'RS')
rc_df = reddit.convert_to_df(rc, 'RC')

In [None]:
reddit.save_df(rs_df, rs_file)
reddit.save_df(rc_df, rc_file)

### 3. Data Process

#### 3.1 결측치 & 중복 값 제거

In [21]:
rProcessor = RedditProcessor(data_path, save_path)

In [22]:
rs = reddit.load_data(rs_file2, 'rs')
rc = reddit.load_data(rc_file2, 'rc')

In [23]:
len(rs), len(rc)

(65185, 219375)

In [24]:
reddit.check_df(rc)

전체 데이터 개수: 219375

결측값 개수: 
id             0
subreddit      0
author         0
body           0
ups            0
score          0
created_utc    0
dtype: int64

중복 데이터 개수: 1


In [25]:
rs = rProcessor.drop_na(rs, 'rs', 'na')
rs = rProcessor.drop_duplicates(rs)
rs.tail(3)

Unnamed: 0,id,subreddit,author,title,context,comments,score,url,created_utc
65182,55asdi,depression,legion_2k,music!!! has probably saved me many times.,i am sure it's some ancient instinct that huma...,13,3,https://www.reddit.com/r/depression/comments/5...,1475279438
65183,55asx6,depression,ijusttriedthaifood,i wish i had a best friend,everyone talks about how they have somebody wh...,6,19,https://www.reddit.com/r/depression/comments/5...,1475279654
65184,55atn3,depression,KoNaosuke,someone feels bad when watch other people happ...,i have been with depression for more than 10 y...,0,1,https://www.reddit.com/r/depression/comments/5...,1475279977


In [26]:
rc = rProcessor.drop_na(rc, 'rc', 'na')
rc = rProcessor.drop_duplicates(rc)
rc.tail(3)

Unnamed: 0,id,subreddit,author,body,ups,score,created_utc
219371,t3_4vjdrf,depression,thelonelyfootballfan,"hi, i'd really like to talk to you but i don't...",1.0,1.0,1470009000.0
219372,t3_4vjcb2,depression,LegacyMJT,i understand. one time a week ago i went throu...,7.0,7.0,1470010000.0
219373,t3_4vjdrf,depression,AcidPack,you can click on their username at the top of ...,1.0,1.0,1470010000.0


In [27]:
reddit.save_df(rs, rs_file)
reddit.save_df(rc, rc_file)

In [28]:
len(rs), len(rc)

(65185, 219374)

#### 3.2 Submission이 없는 Comment 제거

In [31]:
rc3 = rProcessor.map_rc_rs(rs2, rc2)

In [32]:
len(rs2), len(rc2), len(rc3)

(65185, 219374, 216344)

In [33]:
rc3

Unnamed: 0,id,subreddit,author,body,ups,score,created_utc
0,t3_48bugn,depression,Catachaos,who's there?,1.0,1.0,1.456790e+09
1,t3_48b67a,depression,[deleted],[removed],4.0,4.0,1.456791e+09
2,t3_48bugn,depression,Catachaos,i like the naming of bobb. my mum had a cat ca...,2.0,2.0,1.456791e+09
3,t3_48ckgp,depression,Silverkid99,"go lay down and cry, or let your emotions out ...",1.0,1.0,1.456791e+09
4,t3_48ddm6,depression,Silverkid99,"you aren't a failure, you've graduated college...",2.0,2.0,1.456791e+09
...,...,...,...,...,...,...,...
216339,t3_4ugo0c,depression,JaazzHandss,thanks for listening to me.,1.0,1.0,1.470009e+09
216340,t3_4vglqn,depression,whytf_not,never apologize for mcr.,2.0,2.0,1.470009e+09
216341,t3_4vjdrf,depression,thelonelyfootballfan,"hi, i'd really like to talk to you but i don't...",1.0,1.0,1.470009e+09
216342,t3_4vjcb2,depression,LegacyMJT,i understand. one time a week ago i went throu...,7.0,7.0,1.470010e+09


#### 3.3 Text Cleaning 

In [35]:
rs2.title = rs2.title.apply(rProcessor.cleanse_text)
rs2.context = rs2.context.apply(rProcessor.cleanse_text)
rc3.body = rc3.body.apply(rProcessor.cleanse_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rc3.body = rc3.body.apply(rProcessor.cleanse_text)


#### 3.5 Drop na

In [36]:
len(rs2), len(rc3)

(65185, 216344)

In [None]:
rs2 = rProcessor.drop_na(rs2, 'rs', 'nc')
rc3 = rProcessor.drop_na(rc3, 'rc', 'nc')

In [None]:
len(rs2), len(rc3)

In [None]:
rc3[rc3.comment.str.len() <=5]

#### 3.6 Save df 

In [31]:
reddit.save_df(rs2, rs_file2)

In [30]:
reddit.save_df(rs2, rs_file2)
reddit.save_df(rc3, rc_file2)

In [32]:
reddit.save_df(rc3, rc_file2)