In [1]:
# insert your credentials here
import pandas as pd
import sqlite3
import os
from dotenv import load_dotenv
import dataset
import json
import sys
import tweepy
from sqlalchemy.exc import ProgrammingError
import pandas as pd

In [2]:
# install dependencies 
import spacy
from spacy.lang.en import English
import nltk

from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
load_dotenv()

db = dataset.connect(os.getenv("DATABASE_URL"))

# Variables that contains the credentials to access Twitter API
CONSUMER_KEY = os.getenv('CONSUMER_KEY')
CONSUMER_SECRET = os.getenv('CONSUMER_SECRET')
ACCESS_KEY = os.getenv('ACCESS_KEY')
ACCESS_SECRET = os.getenv('ACCESS_SECRET')

In [4]:
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)

# initialize Tweepy API
api = tweepy.API(auth, wait_on_rate_limit=True,
          wait_on_rate_limit_notify=True)

In [5]:
class StreamListener(tweepy.StreamListener):
    def __init__(self, output_file=sys.stdout):
        super(StreamListener,self).__init__()
        self.output_file = output_file
        self.counter = 0

    def on_status(self, status):
        self.counter = self.counter + 1
        conditions = (not 'RT @' in status.text)
        if conditions:
            description = status.user.description
            loc = status.user.location
            text = status.text
            coords = status.coordinates
            geo = status.geo
            name = status.user.screen_name
            user_created = status.user.created_at
            id_str = status.id_str
            created = status.created_at
            source = status.user.url
            language = status.lang

            if geo is not None:
                geo = json.dumps(geo)

            if coords is not None:
                coords = json.dumps(coords)

            table = db["tweets"]
            try:
                table.insert(dict(
                    user_description=description,
                    user_location=loc,
                    coordinates=coords,
                    text=text,
                    geo=geo,
                    user_name=name,
                    user_created=user_created,
                    id_str=id_str,
                    created=created,
                    source = source,
                    language = language,
                    ))
            except ProgrammingError as err:
                print(err)

    def on_error(self, status_code):
        print('Encountered error with status code:', status_code)
        if status_code == 420:
            #return False in on_data disconnects the stream
            return False

    # When a deleted tweet appears
    def on_delete(self, status_id, user_id):
        print("Delete notice")
        return

    # When reach the rate limit
    def on_limit(self, track):
        print("Rate limited, continuing")
        # Continue mining tweets
        return True

In [None]:
# create instance of the tweepy tweet stream listener
stream_listener = StreamListener()

# create instance of the tweepy stream
stream = tweepy.Stream(auth=auth, listener=stream_listener, tweet_mode="extended")

# words to search for
track = ["police", "cop", "officer"]

# search twitter for programming languages
stream.filter(track=track, languages = ['en', 'und'])

In [7]:
df_raw = pd.DataFrame(db['tweets'])
df = df_raw[["id_str", "text"]]
df.rename(columns={'id_str': 'ids'}, inplace=True)
df

Unnamed: 0,ids,text
0,1371142519459876865,"@onlygeek @chrissieA2 @Mike_Fabricant No, I'm ..."
1,1371142519828926469,@SkyNews @skymarkwhite May as well protest whi...
2,1371142519883493379,@JMPSimor The right to free assembly and prote...
3,1371142520684576769,@yampylad @therealmissjo The city has accepted...
4,1371142522848874498,@CathCarterMusic Absolute disgrace the police ...
...,...,...
6403,1371991058448445440,Oh my. 🥺 #StopAsianHate
6404,1371991062550548488,It's the @NHL...they only punish based on the ...
6405,1371991068980310018,@theonyxshade if i can trick them someones goi...
6406,1371991070813261830,16.3.2021 Police setting fires so close to res...


In [8]:
cnx = sqlite3.connect('tweets.db')

df = pd.read_sql_query("SELECT * FROM tweets", cnx)

In [8]:
# drop NAs and get shape
df.dropna(inplace=True)
df.shape

(6408, 2)

In [9]:
# seeing how the data looks
df.head()

Unnamed: 0,ids,text
0,1371142519459876865,"@onlygeek @chrissieA2 @Mike_Fabricant No, I'm ..."
1,1371142519828926469,@SkyNews @skymarkwhite May as well protest whi...
2,1371142519883493379,@JMPSimor The right to free assembly and prote...
3,1371142520684576769,@yampylad @therealmissjo The city has accepted...
4,1371142522848874498,@CathCarterMusic Absolute disgrace the police ...


In [10]:
# example of text 
sample = df['text'][0]
sample

"@onlygeek @chrissieA2 @Mike_Fabricant No, I'm not short of criticisms of the police. In fact, I have many - and I d… https://t.co/Jxxe31eKnk"

In [17]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import io

ImportError: Keras requires TensorFlow 2.2 or higher. Install TensorFlow via `pip install tensorflow`

In [25]:
# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import io

RuntimeError: generic_type: cannot initialize type "TensorProtoDataType": an object with that name is already defined

In [11]:
# loading small version of english nlp
nlp = spacy.load("en_core_web_sm")

In [12]:
# load english parser from spacy
parser = English()

# boiler-plate tokenize function
def tokenize(text):
    """Parses a string into a list of semantic units (words)
    Args: text (str): The string that the function will tokenize.
    Returns: list: tokens parsed out by the mechanics of your choice
    """
    lda_tokens = []
    tokens = nlp(text)

    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.pos_ == 'PROPN':
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [None]:
# sample through function to test outcome
tokenize(sample)

In [14]:
nltk.download('wordnet')

def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    

def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1123)>


In [None]:
# universal stopwords from nltk
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

In [None]:
# extra stop words that pertains to this model
more_stop = ['police', 'officer', 'cop', 'SCREEN_NAME']

In [None]:
def prepare_text_for_lda(text):
    """ takes text and tokenizes it, only looks at tweets with more than 4 words and removes stopwords"""
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    tokens = [token for token in tokens if token not in more_stop]
    return tokens

In [None]:
# creates column in DF with lemmas
df['lemmas'] = df['text'].apply(prepare_text_for_lda)

# visualize your work
df