## Data Prep

In [2]:
import json
import numpy as np
import pandas as pd

import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from textblob import TextBlob
# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


import preprocessor as p # 
import string # use string.punctuation to clean out punctuation

from pprint import pprint

from datetime import datetime
import pickle

import nltk

from tqdm import tqdm

# Widen the column display limit so long tweet text is not cut off in DataFrame output.
# NOTE(review): passing -1 here is deprecated in pandas >= 1.0 (None is the replacement);
# older pandas only accepts an int, so confirm the target pandas version before changing it.
pd.set_option('display.max_colwidth', -1)

In [3]:
# Tweets from the previous 7 days.
# The MongoDB collection was exported to JSON and copied from AWS EC2 to localhost:
#   mongoexport -d climatechange -c climate_tweets -o tweets.json
#   scp myaws:tweets.json .
# Read the export (one JSON document per line) into a list of dicts.

tweets_data = []
# The context manager closes the file even if a line fails to parse; the
# encoding is pinned so non-ASCII tweet text does not depend on the OS locale.
with open('tweets.json', 'r', encoding='utf-8') as tweets_file:
    for line in tweets_file:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        tweet = json.loads(line)
        tweets_data.append(tweet)
    

In [4]:
print(len(tweets_data))

17579


In [5]:
# Streamed tweets.
# The MongoDB collection was exported to JSON and copied from AWS EC2 to localhost.
# NOTE(review): the commands below were copied from the 7-day cell and still name
# tweets.json, while this cell reads tweets_stream.json -- confirm the actual
# collection / output names used for the streamed export.
#   mongoexport -d climatechange -c climate_tweets -o tweets.json
#   scp myaws:tweets.json .
# Read the export (one JSON document per line) into a list of dicts.

tweets_stream_data = []
# The context manager closes the file even if a line fails to parse; the
# encoding is pinned so non-ASCII tweet text does not depend on the OS locale.
with open('tweets_stream.json', 'r', encoding='utf-8') as tweets_stream_file:
    for line in tweets_stream_file:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        tweet = json.loads(line)
        tweets_stream_data.append(tweet)
    

In [6]:
print(len(tweets_stream_data))

163341


In [7]:
tweets_stream_data[0]

{'_id': {'$oid': '5a973c81d72c206c69b41871'},
 'contributors': None,
 'coordinates': None,
 'created_at': 'Wed Feb 28 23:33:47 +0000 2018',
 'display_text_range': [0, 140],
 'entities': {'hashtags': [],
  'symbols': [],
  'urls': [],
  'user_mentions': [{'id': 31413260,
    'id_str': '31413260',
    'indices': [3, 17],
    'name': 'Energy In Depth',
    'screen_name': 'EnergyInDepth'},
   {'id': 53970358,
    'id_str': '53970358',
    'indices': [45, 54],
    'name': 'Matt Pawa',
    'screen_name': 'MattPawa'},
   {'id': 949934436,
    'id_str': '949934436',
    'indices': [58, 68],
    'name': 'Tom Steyer',
    'screen_name': 'TomSteyer'}]},
 'favorite_count': 0,
 'favorited': False,
 'full_text': 'RT @EnergyInDepth: Memo from activist-lawyer @MattPawa to @TomSteyer outlines strategy to charge energy companies with causing global warmi…',
 'geo': None,
 'id': {'$numberLong': '968992659242143744'},
 'id_str': '968992659242143744',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_

### Load Tweets into a DataFrame

In [8]:
def make_df(tweets_data):
    """Flatten raw tweet dicts into a DataFrame with one row per tweet.

    Parameters
    ----------
    tweets_data : iterable of dict
        Parsed tweet objects. Every tweet must provide 'created_at',
        'full_text', 'truncated', 'favorite_count', 'retweet_count', 'place'
        (None, or a dict with 'country') and a 'user' dict with 'screen_name',
        'time_zone', 'friends_count' and 'followers_count'. The input is
        consumed in a single pass, so a one-shot iterator works as well as a list.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: user, created_at, full_text, truncated,
        contains_url, favorite_count, location, time_zone, following_count,
        followers_count, retweet_count.

    Raises
    ------
    KeyError
        If a tweet lacks one of the required keys.
    ValueError
        If 'created_at' does not match the expected timestamp format.
    """
    created_at_format = '%a %b %d %H:%M:%S %z %Y'
    columns = ['user', 'created_at', 'full_text', 'truncated', 'contains_url',
               'favorite_count', 'location', 'time_zone', 'following_count',
               'followers_count', 'retweet_count']
    data = {name: [] for name in columns}

    for tweet in tweets_data:
        user = tweet['user']
        place = tweet['place']

        # For retweets the top-level text is cut short (it ends in an ellipsis),
        # so prefer the text of the original tweet when it is available.
        try:
            text = tweet['retweeted_status']['full_text']
        except KeyError:
            text = tweet['full_text']

        data['user'].append(user['screen_name'])
        data['created_at'].append(datetime.strptime(tweet['created_at'], created_at_format))
        data['full_text'].append(text)
        data['truncated'].append(tweet['truncated'])
        # 1/0 flag; a plain substring test is enough, no regex needed.
        data['contains_url'].append(int('https' in text))
        data['favorite_count'].append(tweet['favorite_count'])
        data['location'].append(place['country'] if place is not None else None)
        data['time_zone'].append(user['time_zone'])
        data['following_count'].append(user['friends_count'])
        data['followers_count'].append(user['followers_count'])
        data['retweet_count'].append(tweet['retweet_count'])

    # Passing `columns` fixes the column order regardless of dict ordering.
    return pd.DataFrame(data, columns=columns)

In [9]:
tweetsDF = make_df(tweets_data)

In [10]:
tweetsDF.dtypes

user               object             
created_at         datetime64[ns, UTC]
full_text          object             
truncated          bool               
contains_url       int64              
favorite_count     int64              
location           object             
time_zone          object             
following_count    int64              
followers_count    int64              
retweet_count      int64              
dtype: object

In [11]:
tweetsDF.full_text[0]

'https://t.co/Yu7IgoHsYd population control will not reverse or mitigate climate change here is the logic and evidence #PopulationControlLaw'

In [12]:
tweets_streamedDF = make_df(tweets_stream_data)

In [13]:
tweets_streamedDF.dtypes

user               object             
created_at         datetime64[ns, UTC]
full_text          object             
truncated          bool               
contains_url       int64              
favorite_count     int64              
location           object             
time_zone          object             
following_count    int64              
followers_count    int64              
retweet_count      int64              
dtype: object

In [14]:
tweets_streamedDF.full_text[0]

'Memo from activist-lawyer @MattPawa to @TomSteyer outlines strategy to charge energy companies with causing global warming https://t.co/DvBfY2ZHwz https://t.co/cVvXlg7O4I'

Combine both sets of tweets into the full corpus, and keep the smaller set (the previous 7 days of tweets) as a sample for testing.

In [15]:
# Take an independent copy: a plain assignment would only alias tweetsDF, so the
# in-place cleaning applied to sampleDF further down would also rewrite tweetsDF.
sampleDF = tweetsDF.copy()

In [16]:
fullDF = pd.concat([tweets_streamedDF, tweetsDF], ignore_index = True)

In [17]:
sampleDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17579 entries, 0 to 17578
Data columns (total 11 columns):
user               17579 non-null object
created_at         17579 non-null datetime64[ns, UTC]
full_text          17579 non-null object
truncated          17579 non-null bool
contains_url       17579 non-null int64
favorite_count     17579 non-null int64
location           229 non-null object
time_zone          10438 non-null object
following_count    17579 non-null int64
followers_count    17579 non-null int64
retweet_count      17579 non-null int64
dtypes: bool(1), datetime64[ns, UTC](1), int64(5), object(4)
memory usage: 1.4+ MB


In [18]:
fullDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180920 entries, 0 to 180919
Data columns (total 11 columns):
user               180920 non-null object
created_at         180920 non-null datetime64[ns, UTC]
full_text          180920 non-null object
truncated          180920 non-null bool
contains_url       180920 non-null int64
favorite_count     180920 non-null int64
location           1991 non-null object
time_zone          106776 non-null object
following_count    180920 non-null int64
followers_count    180920 non-null int64
retweet_count      180920 non-null int64
dtypes: bool(1), datetime64[ns, UTC](1), int64(5), object(4)
memory usage: 14.0+ MB


## Clean Tweets

In [19]:
def clean_tweets(tweet_DF):
    """Normalise the 'full_text' column to lower-case, letters-only text.

    Each tweet goes through these steps, in order:

    1. '#' characters are removed so the hashtag words themselves survive.
    2. The text is passed through ``p.clean`` (the ``preprocessor`` module
       imported as ``p``); judging by the sample outputs in this notebook it
       strips URLs and @mentions.
    3. The HTML entity '&amp;' is replaced with 'and'.
    4. Every character that is not an ASCII letter becomes a space.
    5. The text is lower-cased and runs of whitespace collapse to one space.

    Parameters
    ----------
    tweet_DF : pandas.DataFrame
        Frame with a string column 'full_text'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; 'full_text' is overwritten in place.
    """
    def _normalise(text):
        # Remove #-signs, but keep words from hashtag
        text = text.replace('#', '')
        text = p.clean(text)
        # Remove ampersand code
        text = text.replace('&amp;', 'and')
        letters_only = re.sub("[^a-zA-Z]", " ", text)
        return ' '.join(letters_only.lower().split())

    # Assign the column explicitly instead of calling
    # `tweet_DF.full_text.replace(..., inplace=True)`: that chained in-place
    # call modifies a temporary Series under pandas Copy-on-Write, which would
    # silently skip the '#' removal.
    tweet_DF['full_text'] = [_normalise(text) for text in tweet_DF['full_text']]

    return tweet_DF

In [20]:
sampleDF = clean_tweets(sampleDF)

In [21]:
fullDF = clean_tweets(fullDF)

In [22]:
sampleDF.full_text[0]

'population control will not reverse or mitigate climate change here is the logic and evidence populationcontrollaw'

In [23]:
fullDF.full_text[0]

'memo from activist lawyer to outlines strategy to charge energy companies with causing global warming'

## Sentiment Analysis

In [24]:
def append_sentiment(tweetsDF):
    """Add TextBlob sentiment scores for each tweet as two new columns.

    Parameters
    ----------
    tweetsDF : pandas.DataFrame
        Frame with a text column 'full_text'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with 'Polarity' and 'Subjectivity'
        columns added (or overwritten), taken from ``TextBlob(text).sentiment``.
    """
    sentiments = [TextBlob(text).sentiment for text in tweetsDF.full_text]

    # Assign plain lists so the values are matched to rows by position. Going
    # through an intermediate frame with its own 0..n-1 index would misalign
    # (and produce NaN) whenever tweetsDF does not have a default index.
    tweetsDF['Polarity'] = [sentiment.polarity for sentiment in sentiments]
    tweetsDF['Subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]

    return tweetsDF

In [25]:
sampleDF = append_sentiment(sampleDF)

In [26]:
fullDF = append_sentiment(fullDF)

In [27]:
# Mean sentiment scores across the sample frame.
print('Sample:')
for label, column in (('Polarity:', 'Polarity'), ('Subjectivity:', 'Subjectivity')):
    print(label, sampleDF[column].mean())

Sample:
Polarity: 0.04950138866188848
Subjectivity: 0.3255819384331037


In [28]:
# Mean sentiment scores across the full corpus.
print('Full:')
for label, column in (('Polarity:', 'Polarity'), ('Subjectivity:', 'Subjectivity')):
    print(label, fullDF[column].mean())

Full:
Polarity: 0.053287074657702396
Subjectivity: 0.3372217454489455


In [29]:
# Persist the cleaned, sentiment-scored sample frame to disk.
with open('clean_tweets_sample.pkl', mode='wb') as sample_out:  # 'wb' = write, binary
    pickle.dump(sampleDF, sample_out)

In [30]:
# Persist the cleaned, sentiment-scored full corpus to disk.
with open('clean_tweets_full.pkl', mode='wb') as full_out:  # 'wb' = write, binary
    pickle.dump(fullDF, full_out)

In [35]:
sampleDF.created_at.count()

17579

In [37]:
fullDF.created_at.count()

180920

## Location

In [13]:
pd.unique(tweetsDF.location)

array([None, 'Singapore', 'Australia', 'United Kingdom', 'India',
       'Canada', 'United States', 'Kenya', 'Indonesia', 'Malaysia',
       'Mexico', 'Spain', 'Thailand', 'Austria', 'Ireland', 'Denmark',
       'Greece', 'Mali', 'Norway', 'Japan', 'Finland', 'New Zealand',
       'Fiji'], dtype=object)

In [14]:
pd.unique(tweetsDF.time_zone)

array(['Amsterdam', None, 'Central Time (US & Canada)', 'London',
       'Mountain Time (US & Canada)', 'Bern', 'Alaska', 'Baghdad',
       'Sydney', 'Casablanca', 'Eastern Time (US & Canada)', 'Athens',
       'Pacific Time (US & Canada)', 'America/Chicago', 'Nairobi',
       'Quito', 'Bogota', 'Europe/London', 'Brasilia', 'Arizona',
       'Midway Island', 'Kabul', 'Brisbane', 'Helsinki', 'Wellington',
       'Edinburgh', 'Atlantic Time (Canada)', 'Dublin', 'Pretoria',
       'Paris', 'Copenhagen', 'Melbourne', 'Stockholm', 'Africa/Nairobi',
       'Berlin', 'Belgrade', 'Canberra', 'Bucharest', "Nuku'alofa",
       'America/Los_Angeles', 'Monterrey', 'Madrid', 'Hawaii', 'Warsaw',
       'Brussels', 'Bangkok', 'Hanoi', 'Greenland', 'Vienna',
       'Europe/Madrid', 'Kuala Lumpur', 'Karachi', 'Kyiv', 'Buenos Aires',
       'Tbilisi', 'New Delhi', 'Mid-Atlantic', 'Prague',
       'America/Toronto', 'PST', 'Singapore', 'Perth', 'Budapest',
       'Santiago', 'America/New_York', 'Chennai'

In [15]:
# Number of tweets that carry a location (value_counts skips missing values, as does count).
int(tweetsDF.location.count())

229

In [16]:
# Number of tweets whose user has a time zone set (missing values are not counted).
int(tweetsDF.time_zone.count())

10438