## Pre-processing steps:
Record the number of rows before and after pre-processing.

1. Delete rows that do not have a (unique) ID.   
	1.1 delete duplicates  
	1.2 delete rows with no ID  
  
2. Delete rows that do not have body text  
	2.1 delete rows with empty values  
	2.2 delete rows with NaN  
	2.3 delete rows with only the text \[deleted\]  
	2.4 delete rows with only the text \[removed\]  
  
3. Remove markdown marks   
	3.1 remove \&gt; and \&gt;/- and \&amp; and \#x200B and \/-; and \&lt;   
	3.2 remove all *  
	3.3 remove all square brackets  
  
4. Remove all links using  a regex
  


\#x200B; is a 'zero-width space': a character that acts like a space, except it's invisible. 
[resource](https://www.reddit.com/r/OutOfTheLoop/comments/9abjhm/what_does_x200b_mean/)

## preprocessing each subreddit
  
1. only selecting the following columns: created_utc, utc_datetime_str, subreddit, id and body.  
2. After that following the preprocessing steps as mentioned above  



In [1]:
import pandas as pd
import collections

# Subreddit handled by this run; interpolated into the file paths used below.
subreddit = "worldnews"

In [None]:
# Load the raw master file of the selected subreddit.
raw_master_path = f'ds_master/{subreddit}_masterfile.csv'
sr1 = pd.read_csv(raw_master_path, low_memory=False)
# Variant kept for reference: the same call with an explicit line terminator.
# sr1 = pd.read_csv(raw_master_path, low_memory=False, lineterminator='\n')

### Calculate length of dataset before pre-processing

In [None]:
# Row count of the raw file: the "before" number for this subreddit.
sr_length = sr1.shape[0]
print(sr_length)

### selecting specific columns

In [None]:
# Placeholder only; df_new is rebound to a DataFrame in the next cell.
df_new = list()

In [None]:
# Columns carried through the rest of the pre-processing.
columns_to_keep = ['created_utc', 'utc_datetime_str', 'subreddit', 'id', 'body']

# Narrow the raw frame to exactly those columns.
df_new = sr1[columns_to_keep]

# Preview the first rows.
df_new.head(5)

### Delete rows that do not have an (unique) ID

In [None]:
# Collect the rows whose 'id' is missing and report how many there are.
df_without_id = df_new.loc[df_new['id'].isnull()]
num_rows_without_id = df_without_id.shape[0]
print(f"Number of rows without an ID: {num_rows_without_id}")

In [None]:
# Keep only the rows that have an 'id', then print the remaining row count.
df_new = df_new.dropna(axis='index', subset=['id'])
print(df_new.shape[0])

In [None]:
# Preview the first five rows after dropping rows without an 'id'.
df_new.head(n=5)

### remove duplicates

In [None]:
# Every row whose 'id' occurs more than once (keep=False flags all occurrences,
# so the printed count includes the first occurrence of each duplicated id).
duplicate_ids = df_new.loc[df_new.duplicated(subset=['id'], keep=False)]
duplicates_df = pd.DataFrame(duplicate_ids)

# Write those rows to a CSV and print how many there are.
duplicates_df.to_csv(path_or_buf=f'ds_test/{subreddit}_duplicates.csv', index=False)
print(duplicates_df.shape[0])

In [None]:
# Keep the first row per 'id'. The result is rebound instead of using
# inplace=True: df_new was itself produced by filtering, and mutating such a
# frame in place can trigger pandas' SettingWithCopyWarning.
df_new = df_new.drop_duplicates(subset=['id'])

In [None]:
# Row count after removing duplicate ids.
print(df_new.shape[0])

### delete rows with empty body text

this includes empty rows or rows with NaN

In [None]:
# Keep only the rows whose 'body' is not missing, then print the row count.
mask = df_new['body'].notna()
df_new = df_new.loc[mask]
print(df_new.shape[0])

delete rows where the body text only contains the word '\[deleted\]' or '\[removed\]'

In [None]:
# Drop rows whose body, after stripping surrounding whitespace, is exactly
# '[removed]', '[deleted]' or empty. An exact comparison is used on purpose:
# a substring test would also discard real comments that merely quote one of
# these markers somewhere in their text.
mask = ~df_new['body'].str.strip().isin(['[removed]', '[deleted]', ''])
df_new = df_new[mask]
print(len(df_new))

### delete markdown marks

such as \&gt; and \&gt;/- and \&amp; and \#x200B and \/-; and \&lt;   

In [None]:
# Work on an independent copy so that the text-cleaning steps below do not
# write into df_new (plain assignment would only create a second name for the
# same frame).
df_pro = df_new.copy()

In [None]:
# Strip the longer '&gt;/-' token before the bare '&gt;' entity: once '&gt;'
# alone has been removed, '&gt;/-' can no longer match and would leave '/-'
# behind. regex=False makes both patterns literal regardless of pandas version.
df_pro['body'] = (
    df_pro['body']
    .str.replace('&gt;/-', '', regex=False)
    .str.replace('&gt;', '', regex=False)
)

In [None]:
# Remove the literal token '&gt;/-' (regex=False: literal match on every pandas version).
# NOTE(review): the preceding cell already strips every '&gt;', so this
# pattern can no longer match at this point in the notebook.
df_pro['body'] = df_pro['body'].str.replace('&gt;/-', '', regex=False)

In [None]:
# Turn the '&amp;' entity back into '&', then drop the zero-width-space entity.
# The pre-processing plan lists '#x200B' among the marks to remove; after the
# first replacement a double-escaped '&amp;#x200B;' has become '&#x200B;', so
# one literal replacement covers both spellings.
df_pro['body'] = (
    df_pro['body']
    .str.replace('&amp;', '&', regex=False)
    .str.replace('&#x200B;', '', regex=False)
)

In [None]:
# Remove the literal token '/-;'. regex=False keeps the match literal on every
# pandas version, consistent with the character-removal cells further down.
df_pro['body'] = df_pro['body'].str.replace('/-;', '', regex=False)

In [None]:
# Remove the '&lt;' entity. regex=False keeps the match literal on every
# pandas version, consistent with the character-removal cells further down.
df_pro['body'] = df_pro['body'].str.replace('&lt;', '', regex=False)

### remove other characters 

such as * and all square brackets

In [None]:
# Delete every '*' character (literal match, not a regex).
df_pro['body'] = df_pro['body'].str.replace(pat='*', repl='', regex=False)

In [None]:
# Delete every '[' character (literal match, not a regex).
df_pro['body'] = df_pro['body'].str.replace(pat='[', repl='', regex=False)

In [None]:
# Delete every ']' character (literal match, not a regex).
df_pro['body'] = df_pro['body'].str.replace(pat=']', repl='', regex=False)

### remove all urls

In [None]:
import re

In [None]:
# A URL here is a run of non-whitespace characters starting with http://,
# https:// or www., optionally wrapped in '(' and ')'.
url_pattern = r'\(?(?:https?:\/\/|www\.)\S+\b\)?'

# Remove every match of the pattern from the comment text.
df_pro['body'] = df_pro['body'].str.replace(pat=url_pattern, repl='', regex=True)

In [None]:
# Delete every literal '/)' sequence (presumably residue left behind by the URL removal; verify).
df_pro['body'] = df_pro['body'].str.replace(pat='/)', repl='', regex=False)

## saving pre-processed data into a new file

In [None]:
# Keep only the rows whose cleaned 'body' is not missing.
mask = df_pro['body'].notna()
df_pro = df_pro.loc[mask]

In [None]:
# Row count after pre-processing: the "after" number for this subreddit.
print(df_pro.shape[0])

In [None]:
# Write the pre-processed frame for this subreddit to disk, without the index column.
df_pro.to_csv(path_or_buf=f'ds_preprocessed/{subreddit}_masterfile_preprocessed.csv', index=False)

## combine all preprocessed files into one master file

In [None]:
# Folder and file-name suffix shared by all pre-processed per-subreddit files.
directory, staticfilename = "ds_preprocessed", "_masterfile_preprocessed"

In [None]:
# Stand-alone check of a problematic file, read with an explicit line terminator.
sr6 = pd.read_csv(
    f'{directory}/environment{staticfilename}.csv',
    low_memory=False,
    lineterminator='\n',
)
print(sr6.shape[0])

In [None]:
# Load the master file of every subreddit from `directory`.
# Two option sets are used: the default one, and one with an explicit '\n'
# line terminator for the climatechange and environment files.
read_opts = {'low_memory': False}
read_opts_newline = {'low_memory': False, 'lineterminator': '\n'}

sr1 = pd.read_csv(f'{directory}/askreddit{staticfilename}.csv', **read_opts)
sr2 = pd.read_csv(f'{directory}/climate{staticfilename}.csv', **read_opts)
sr3 = pd.read_csv(f'{directory}/climatechange{staticfilename}.csv', **read_opts_newline)
sr4 = pd.read_csv(f'{directory}/climateskeptics{staticfilename}.csv', **read_opts)
sr5 = pd.read_csv(f'{directory}/collapse{staticfilename}.csv', **read_opts)
sr6 = pd.read_csv(f'{directory}/environment{staticfilename}.csv', **read_opts_newline)
sr7 = pd.read_csv(f'{directory}/futurology{staticfilename}.csv', **read_opts)
sr8 = pd.read_csv(f'{directory}/news{staticfilename}.csv', **read_opts)
sr9 = pd.read_csv(f'{directory}/politics{staticfilename}.csv', **read_opts)
sr10 = pd.read_csv(f'{directory}/science{staticfilename}.csv', **read_opts)
sr11 = pd.read_csv(f'{directory}/worldnews{staticfilename}.csv', **read_opts)


In [None]:
# Empty list that will hold the per-subreddit frames to be combined.
df_def = list()

In [None]:
# Collect the eleven per-subreddit frames in load order.
# The list is built in a single assignment rather than by repeated append(),
# so re-running this cell cannot add every frame a second time and silently
# double the combined dataset.
df_def = [sr1, sr2, sr3, sr4, sr5, sr6, sr7, sr8, sr9, sr10, sr11]

In [None]:
# Stack all collected frames row-wise and renumber the index from 0.
combined_df = pd.concat(objs=df_def, axis=0, ignore_index=True)

In [None]:
# Write the combined pre-processed dataset to disk, without the index column.
combined_df.to_csv(path_or_buf='ds_master/reddit_climatechange_dataset.csv', index=False)

In [None]:
# Total number of rows in the combined pre-processed dataset.
print(combined_df.shape[0])

## Saving raw files as one master file

In [2]:
# Folder and file-name suffix shared by all raw per-subreddit master files.
directory, staticfilename = "ds_master", "_masterfile"

In [3]:
# Load the master file of every subreddit from `directory`.
# Two option sets are used: the default one, and one with an explicit '\n'
# line terminator for the climatechange and environment files.
read_opts = {'low_memory': False}
read_opts_newline = {'low_memory': False, 'lineterminator': '\n'}

sr1 = pd.read_csv(f'{directory}/askreddit{staticfilename}.csv', **read_opts)
sr2 = pd.read_csv(f'{directory}/climate{staticfilename}.csv', **read_opts)
sr3 = pd.read_csv(f'{directory}/climatechange{staticfilename}.csv', **read_opts_newline)
sr4 = pd.read_csv(f'{directory}/climateskeptics{staticfilename}.csv', **read_opts)
sr5 = pd.read_csv(f'{directory}/collapse{staticfilename}.csv', **read_opts)
sr6 = pd.read_csv(f'{directory}/environment{staticfilename}.csv', **read_opts_newline)
sr7 = pd.read_csv(f'{directory}/futurology{staticfilename}.csv', **read_opts)
sr8 = pd.read_csv(f'{directory}/news{staticfilename}.csv', **read_opts)
sr9 = pd.read_csv(f'{directory}/politics{staticfilename}.csv', **read_opts)
sr10 = pd.read_csv(f'{directory}/science{staticfilename}.csv', **read_opts)
sr11 = pd.read_csv(f'{directory}/worldnews{staticfilename}.csv', **read_opts)


In [4]:
# Empty list that will hold the raw per-subreddit frames to be combined.
df_raw = list()

In [5]:
# Collect the eleven raw per-subreddit frames in load order.
# The list is built in a single assignment rather than by repeated append(),
# so re-running this cell cannot add every frame a second time.
df_raw = [sr1, sr2, sr3, sr4, sr5, sr6, sr7, sr8, sr9, sr10, sr11]

In [6]:
# Stack all raw frames row-wise and renumber the index from 0.
combined_raw_df = pd.concat(objs=df_raw, axis=0, ignore_index=True)

In [7]:
# Count (not yet remove) the rows whose 'id' appears more than once in the
# combined raw data; keep=False flags every occurrence, including the first.
duplicate_raw_ids = combined_raw_df.loc[combined_raw_df.duplicated(subset=['id'], keep=False)]
duplicates_raw_df = pd.DataFrame(duplicate_raw_ids)

print(duplicates_raw_df.shape[0])

2658428


In [8]:
# Remove duplicates: keep the first row per 'id'.
# The result is rebound rather than mutated with inplace=True, so the cell
# does not alter the previously built frame object in place.
combined_raw_df = combined_raw_df.drop_duplicates(subset=['id'])

In [11]:
# Row count of the combined raw dataset after de-duplication.
print(combined_raw_df.shape[0])

2042862


In [12]:
# Write the de-duplicated raw dataset to disk, without the index column.
combined_raw_df.to_csv(path_or_buf='ds_master/reddit_climatechange_raw-dataset.csv', index=False)

In [14]:
# Save the final ten rows of the raw dataset as a small sample file.
last_10_rows = combined_raw_df.iloc[-10:]
last_10_rows.to_csv(path_or_buf='ds_master/reddit_climatechange_raw-dataset-snippet10.csv', index=False)