In [25]:
import numpy as np
import glob
import re
#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#pandas
import pandas as pd

#vis
import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()

In [26]:
# Load the extracted tweets from a path relative to the notebook's working
# directory; the preview below shows the Party, Handle and Tweet columns.
df = pd.read_csv("data/ExtractedTweets.csv")
df.head()

Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P..."
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...


### Extract tweet information

In [27]:
def find_retweeted(tweet):
    '''Return the handles that appear directly after an "RT " marker.

    A handle is "@" followed by one or more ASCII letters, digits or
    underscores. Hyphens are not part of a handle, so "@GOP-led" yields
    "@GOP", and handles that start with a digit or underscore are matched.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Retweeted handles, including the leading "@", in order of appearance.
    '''
    # The look-behind keeps only handles immediately preceded by "RT" plus
    # one whitespace character.
    return re.findall(r'(?<=RT\s)(@[A-Za-z0-9_]+)', tweet)

def find_mentioned(tweet):
    '''Return the handles mentioned in a tweet, excluding the retweeted one.

    A handle is "@" followed by one or more ASCII letters, digits or
    underscores. Hyphens are not part of a handle, so "@GOP-led" yields
    "@GOP", and handles that start with a digit or underscore are matched.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Mentioned handles, including the leading "@", in order of appearance.
    '''
    # The negative look-behind skips a handle immediately preceded by "RT"
    # plus one whitespace character (that one is the retweeted account).
    return re.findall(r'(?<!RT\s)(@[A-Za-z0-9_]+)', tweet)

def find_hashtags(tweet):
    '''Return the hashtags used in a tweet.

    A hashtag is "#" followed by ASCII letters, digits or underscores and
    must contain at least one letter, so "#2A" and "#A" match while a bare
    "#1" does not. Hyphens end the tag: "#Save-Internet" yields "#Save".

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Hashtags, including the leading "#", in order of appearance.
    '''
    # Optional leading digits/underscores, then a mandatory letter, then the
    # rest of the tag; this is what enforces "at least one letter".
    return re.findall(r'(#[0-9_]*[A-Za-z][A-Za-z0-9_]*)', tweet)

# make new columns for retweeted usernames, mentioned usernames and hashtags
# (each cell of a new column holds the list returned by the matching find_* helper)
df['retweeted'] = df['Tweet'].apply(find_retweeted)
df['mentioned'] = df['Tweet'].apply(find_mentioned)
df['hashtags'] = df['Tweet'].apply(find_hashtags)
# Preview the first 30 rows to spot-check the extracted columns
df.head(30)

Unnamed: 0,Party,Handle,Tweet,retweeted,mentioned,hashtags
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P...",[],[],"[#SaveTheInternet, #NetNeutrality]"
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...,[@WinterHavenSun],[@RepDarrenSoto],[]
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...,[@NBCLatino],[@RepDarrenSoto],[]
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...,[@NALCABPolicy],"[@RepDarrenSoto, @LatinoLeader]",[#NALCABPolicy2018]
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...,[@Vegalteno],"[@Pwr4PuertoRico, @RepDarrenSoto, @EspaillatNY]",[]
5,Democrat,RepDarrenSoto,RT @EmgageActionFL: Thank you to all who came ...,[@EmgageActionFL],[],[]
6,Democrat,RepDarrenSoto,Hurricane Maria left approx $90 billion in dam...,[],[],[]
7,Democrat,RepDarrenSoto,RT @Tharryry: I am delighted that @RepDarrenSo...,[@Tharryry],[@RepDarrenSoto],[#NetNeutrality]
8,Democrat,RepDarrenSoto,RT @HispanicCaucus: Trump's anti-immigrant pol...,[@HispanicCaucus],[],[]
9,Democrat,RepDarrenSoto,RT @RepStephMurphy: Great joining @WeAreUnidos...,[@RepStephMurphy],"[@WeAreUnidosUS, @RepDarrenSoto]",[#Orlando]


### Clean mentions, retweets and hashtags from tweet

In [36]:
from nltk.corpus import stopwords

# NLTK's English stop-word list; clean() below filters tweet words against it.
stop_words = stopwords.words('english')
# clean() lowercases the tweet before filtering, so the lowercase "rt" entry
# drops the retweet marker that is left once the handle has been stripped.
stop_words.append("rt")

def clean_retweeted(tweet):
    '''Remove the handle that appears directly after an "RT " marker.

    A handle is "@" followed by one or more ASCII letters, digits or
    underscores. Hyphens are not part of a handle, so only "@GOP" is removed
    from "@GOP-led", and handles starting with a digit or underscore are
    removed too. The "RT" marker itself is left in place.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        The tweet with retweeted handles deleted.
    '''
    # The look-behind restricts removal to handles immediately preceded by
    # "RT" plus one whitespace character.
    return re.sub(r'(?<=RT\s)(@[A-Za-z0-9_]+)', '', tweet)

def clean_mentioned(tweet):
    '''Remove mentioned handles from a tweet, leaving the retweeted one.

    A handle is "@" followed by one or more ASCII letters, digits or
    underscores. Hyphens are not part of a handle, so only "@GOP" is removed
    from "@GOP-led", and handles starting with a digit or underscore are
    removed too.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        The tweet with mentioned handles deleted.
    '''
    # The negative look-behind leaves alone a handle immediately preceded by
    # "RT" plus one whitespace character (the retweeted account).
    return re.sub(r'(?<!RT\s)(@[A-Za-z0-9_]+)', '', tweet)

def clean_hashtags(tweet):
    '''Remove hashtags from a tweet.

    A hashtag is "#" followed by ASCII letters, digits or underscores and
    must contain at least one letter, so "#2A" and "#A" are removed while a
    bare "#1" is kept. Hyphens end the tag: only "#Save" is removed from
    "#Save-Internet".

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        The tweet with hashtags deleted.
    '''
    # Optional leading digits/underscores, then a mandatory letter, then the
    # rest of the tag; this is what enforces "at least one letter".
    return re.sub(r'(#[0-9_]*[A-Za-z][A-Za-z0-9_]*)', '', tweet)

def clean(tweet):
    '''Normalise a raw tweet into lowercase text without stop words.

    Steps, in order: strip hashtags, mentioned handles and the retweeted
    handle; lowercase; delete the punctuation characters . , ; : ! / ?;
    drop every word found in the module-level ``stop_words`` list; join the
    remaining words with single spaces.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned text with words separated by exactly one space and no
        leading or trailing whitespace.
    '''
    # Remove mentions, retweets and hashtags
    tweet = clean_hashtags(tweet)
    tweet = clean_mentioned(tweet)
    tweet = clean_retweeted(tweet)
    tweet = tweet.lower()
    # Remove punctuation ("+" so the pattern never matches the empty string)
    # NOTE(review): deleting "/" and ":" fuses any URL into a single token;
    # confirm whether links should be stripped before this step.
    tweet = re.sub("[\\.,;:!/\\?]+", "", tweet)
    # Remove stop words. split() with no argument breaks on any whitespace
    # run, so words next to newlines or tabs are filtered too and no empty
    # tokens are produced.
    kept_words = [word for word in tweet.split() if word not in stop_words]
    # Joining on one space also collapses repeated whitespace and trims the ends.
    return ' '.join(kept_words)

# Overwrite the Tweet column with its cleaned text. The raw text is replaced
# in place, so the find_* extraction cell above has to run before this one.
df['Tweet'] = df['Tweet'].apply(clean)
df.head()

Unnamed: 0,Party,Handle,Tweet,retweeted,mentioned,hashtags
0,Democrat,RepDarrenSoto,today senate dems vote proud support similar l...,[],[],"[#SaveTheInternet, #NetNeutrality]"
1,Democrat,RepDarrenSoto,winter resident alta vista teacher one severa...,[@WinterHavenSun],[@RepDarrenSoto],[]
2,Democrat,RepDarrenSoto,noted hurricane maria left approximately $90 ...,[@NBCLatino],[@RepDarrenSoto],[]
3,Democrat,RepDarrenSoto,meeting thanks taking time meet ed marucci gu...,[@NALCABPolicy],"[@RepDarrenSoto, @LatinoLeader]",[#NALCABPolicy2018]
4,Democrat,RepDarrenSoto,hurricane season starts june 1st puerto rico’...,[@Vegalteno],"[@Pwr4PuertoRico, @RepDarrenSoto, @EspaillatNY]",[]


### Lemmatize the text

In [29]:
# Load spaCy's en_core_web_lg pipeline with the parser and NER components disabled.
# NOTE(review): the recorded run of this cell failed with OSError [E050] because
# the model was not installed; presumably it needs
# `python -m spacy download en_core_web_lg` first. Confirm for your environment.
nlp = spacy.load("en_core_web_lg", disable=["parser", "ner"])

def lemmatization(text, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    '''Lemmatize a text, keeping only tokens with an allowed part of speech.

    Parameters
    ----------
    text : str
        Text to run through the module-level ``nlp`` pipeline.
    allowed_postags : list of str
        Coarse part-of-speech tags (``token.pos_``) whose tokens are kept.

    Returns
    -------
    list of str
        A one-element list holding the kept lemmas joined by single spaces.
    '''
    kept_lemmas = [tok.lemma_ for tok in nlp(text) if tok.pos_ in allowed_postags]
    # The joined string is wrapped in a list to keep the original return shape.
    return [" ".join(kept_lemmas)]

def gen_words(tweet):
    '''Tokenize a tweet with gensim's simple_preprocess.

    ``deacc=True`` asks gensim to strip accent marks in addition to its
    usual lowercasing and tokenizing.

    Parameters
    ----------
    tweet : str
        Text to tokenize.

    Returns
    -------
    list of str
        The tokens produced by simple_preprocess.
    '''
    tokens = simple_preprocess(tweet, deacc=True)
    return tokens


OSError: [E050] Can't find model 'en_core_web_lg'. It doesn't seem to be a Python package or a valid path to a data directory.