In [None]:
import re
import pandas as pd
import numpy as np
import nltk
import nltk.data
import warnings
nltk.download('stopwords')
nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from psaw import PushshiftAPI
from tqdm import tqdm_notebook

!pip install google-colab
from google.colab import auth
from google.cloud import bigquery
import pandas as pd
# Authenticate the Colab user, then create the BigQuery client that the
# query cells use (queries run under the 'socialseg' project).
auth.authenticate_user()
print('Authenticated')
client = bigquery.Client(project='socialseg')

# The punkt download and `tokenizer` load that were repeated at this point
# have been dropped: the identical two calls already run earlier in this cell.
# Silence library warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

In [None]:
# Table suffixes to query: yearly entries 2010-2014 (ints), followed by
# monthly 'YYYY_MM' strings for 2015-2018. The monthly entries are ordered
# month-first (2015_01, 2016_01, 2017_01, 2018_01, 2015_02, ...).
dates = list(range(2010, 2015))
dates.extend(
    '201{}_{}'.format(year_digit, '0{}'.format(month_number)[-2:])
    for month_number in range(1, 13)
    for year_digit in range(5, 9)
)

# Scrape raw data from Reddit with BigQuery

In [None]:
# Parallel accumulators: index k of each list describes the same comment.
comments_body = []
comments_author = []
subreddits = []

# For each table suffix, fetch comments whose body contains " live " from
# non-deleted authors, keeping at most 5501 rows per table.
for date in dates:
    # Progress trace: current table and number of comments collected so far.
    print(date,len(comments_body))
    query = """
            SELECT body,author,subreddit 
            FROM `fh-bigquery.reddit_comments.{}` 
            WHERE body LIKE "% live %" and author != '[deleted]'
            """.format(date)
    query_job = client.query(query, job_config=bigquery.QueryJobConfig())
    for taken, row in enumerate(query_job, start=1):
        comments_body.append(row.body)
        comments_author.append(row.author)
        subreddits.append(row.subreddit)
        # Stop reading this table once the per-table cap is exceeded.
        if taken > 5500:
            break

In [None]:
# Combine the scraped columns into one frame, then discard any row that has
# a missing value in any column.
database = pd.DataFrame(
    {
        'body': comments_body,
        'author': comments_author,
        'subreddit': subreddits,
    }
)
database = database.dropna()

# Find similar comments

In [None]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_vectors(*strs):
    """Return bag-of-words count vectors for the given strings.

    A fresh ``CountVectorizer`` is fitted on exactly the strings passed in,
    so the vocabulary (and the number of columns) is specific to each call.

    Parameters
    ----------
    *strs : str
        Texts to vectorize.

    Returns
    -------
    numpy.ndarray
        Dense array with one row per input string, in call order, and one
        column per vocabulary term.
    """
    text = list(strs)
    # The documents must not be passed to the constructor: its first option
    # is `input` (how documents are read), not the corpus, and scikit-learn
    # now requires constructor options to be given by keyword.
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(text).toarray()
def get_consine_sim(*strs):
    """Return the pairwise cosine-similarity matrix of the given strings.

    The strings are turned into count vectors by ``get_vectors``; entry
    ``[i, j]`` of the result compares the i-th and j-th input string.
    """
    row_vectors = list(get_vectors(*strs))
    similarity = cosine_similarity(row_vectors)
    return similarity

In [None]:
# Read the example sentences file, one entry per line.
with open("data/Examples.txt","r") as f:
    lines = list(f)

# Keep every line that is not a bare newline, with surrounding whitespace removed.
verified = []
for line in lines:
    if line != '\n':
        verified.append(line.strip())

In [None]:
# For every scraped comment, keep the first sentence that is similar enough
# to any of the `verified` example sentences, together with its author.
valid_comments = []
authors = []
for row_pos in tqdm_notebook(range(len(database['author']))):
    record = database.iloc[row_pos]
    poster, text = record['author'], record['body']

    for sentence in tokenizer.tokenize(text.strip()):
        # Skip sentences under five words or containing no ASCII letters.
        if len(sentence.split()) < 5 or re.sub("[^a-zA-Z]", "", sentence) == "":
            continue
        # Best cosine similarity between this sentence and any example.
        best = max(get_consine_sim(sentence, example)[1, 0] for example in verified)
        if best > 0.075:
            valid_comments.append(sentence)
            authors.append(poster)
            # At most one sentence is kept per comment.
            break

In [None]:
# Pair each retained sentence with its author and save a copy to disk.
df_valid = pd.DataFrame(dict(body=valid_comments, author=authors))
df_valid.to_csv(path_or_buf='valid_sentences.csv')

# Label authors' races

In [None]:
def _self_identified_authors(race, tables):
    """Collect authors of comments containing a first-person "<race>" phrase.

    Runs one query per table suffix and gathers the ``author`` of every
    matching row. Authors may appear more than once, and '[deleted]' is not
    filtered out here.

    Parameters
    ----------
    race : str
        Word substituted into the phrase patterns, e.g. 'black' or 'white'.
    tables : iterable
        Table suffixes under `fh-bigquery.reddit_comments`.

    Returns
    -------
    list
        Author names, one per matching comment.
    """
    found = []
    for table in tables:
        # The WHERE clause is a single parenthesised OR-group; the original
        # "black" query closed it twice, which is a SQL syntax error.
        query = """
            SELECT body,author
            FROM `fh-bigquery.reddit_comments.{table}`
            WHERE (body LIKE "%I'm {race} %" OR body LIKE "%I am {race} %" OR body LIKE "%As a {race} %"
                   OR body LIKE "%I'm a {race} %" OR body LIKE "%I am a {race} %")
            """.format(table=table, race=race)
        query_job = client.query(query, job_config=bigquery.QueryJobConfig())
        found.extend(row.author for row in query_job)
    return found


black_authors = _self_identified_authors('black', dates)
white_authors = _self_identified_authors('white', dates)

In [None]:
# Deduplicate both author collections, then record the authors that appear
# in both groups.
white_authors, black_authors = set(white_authors), set(black_authors)
invalid_class = white_authors & black_authors

In [None]:
# Build one label per row of `df_valid`: "white" or "black" when the author
# belongs to exactly one of the two sets, otherwise None (deleted accounts,
# authors present in both sets, and authors present in neither).
author_race = []
# The column is named 'author' (singular) where `df_valid` is created;
# looking up 'authors' raised a KeyError.
for author in df_valid['author']:
    if author == '[deleted]' or author in invalid_class:
        label = None
    elif author in white_authors:
        label = "white"
    elif author in black_authors:
        label = "black"
    else:
        label = None
    author_race.append(label)

In [None]:
# Attach the labels, drop rows with any missing value (which includes
# unlabelled authors), keep only the text and label columns with a fresh
# 0..n-1 index, and write the final dataset.
df_valid['race'] = author_race
df_valid = (
    df_valid
    .dropna()[['body', 'race']]
    .reset_index(drop=True)
)
df_valid.to_csv('data/complete_data_all.csv')