# Import Packages

In [118]:
import pandas as pd
import numpy as np
import pickle
import re

import nltk
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import spacy

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

In [2]:
# Load the cleaned tweets DataFrame.
# NOTE: pickle.load can execute arbitrary code - only load pickles from a trusted source.
# The context manager guarantees the file is closed even if unpickling raises.
with open('data/pickle_jar/cleaned.pkl', 'rb') as pickle_in:
    tweets = pickle.load(pickle_in)

# Discard the stored index and renumber rows 0..n-1 (same result as
# reset_index followed by dropping the generated 'index' column, without
# mutating in place).
tweets = tweets.reset_index(drop=True)

In [3]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1970258 entries, 0 to 1970257
Data columns (total 21 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   created_at               datetime64[ns]
 1   id                       int64         
 2   full_text                object        
 3   truncated                bool          
 4   in_reply_to_screen_name  object        
 5   is_quote_status          bool          
 6   retweet_count            int64         
 7   favorite_count           int64         
 8   lang                     object        
 9   retweeted_status         object        
 10  names                    object        
 11  screen_names             object        
 12  locations                object        
 13  follower_counts          int64         
 14  user_created_at          object        
 15  verified                 bool          
 16  statuses_counts          int64         
 17  location                 ob

In [4]:
tweets.shape

(1970258, 21)

In [5]:
tweets.isna().sum()

created_at                       0
id                               0
full_text                        0
truncated                        0
in_reply_to_screen_name    1525932
is_quote_status                  0
retweet_count                    0
favorite_count                   0
lang                             0
retweeted_status           1970258
names                            0
screen_names                     0
locations                        0
follower_counts                  0
user_created_at                  0
verified                         0
statuses_counts                  0
location                   1889581
country                    1889581
hashtags                   1234431
withheld_in_countries      1970213
dtype: int64

In [6]:
# Share of tweets that carry a non-null `location` value, as a percentage.
n_with_location = int(tweets['location'].notna().sum())
location_perc = 100 * n_with_location / len(tweets)
print(f'{round(location_perc, 3)}% of the dataset has location data')

4.095% of the dataset has location data


In [7]:
# Keep only tweets that have at least one hashtag; the hashtag-based labelling
# below iterates over this column, so rows with a null value must be removed.
df = tweets.dropna(subset=['hashtags'])

In [8]:
# Flatten every tweet's hashtag list into one lower-cased list of tags.
hashtags_all = [tag.lower() for tags in df['hashtags'] for tag in tags]

In [9]:
pd.Series(hashtags_all).value_counts()

climatechange                     403207
globalwarming                      55929
climateaction                      50345
environment                        48337
climate                            38087
                                   ...  
theresatrumptweetforeverything         1
newbridge                              1
thehappening                           1
ruthbaderginsburg                      1
airqualityaproblem                     1
Length: 166071, dtype: int64

In [10]:
# Distinct hashtags ordered from most to least frequent.
hashtag_counts = pd.Series(hashtags_all).value_counts()
unique_hashtags = hashtag_counts.index.tolist()

In [11]:
unique_hashtags[:10]

['climatechange',
 'globalwarming',
 'climateaction',
 'environment',
 'climate',
 'actonclimate',
 'energy',
 'climatechangeisreal',
 'auspol',
 'sustainability']

In [12]:
# Lower-cased hashtags used as weak labels for the author's stance.
# These must stay plain lists: they are later concatenated with `+` onto the
# stop-word list.
# NOTE(review): several entries are proxies rather than explicit stance tags
# (e.g. 'maga', 'tcot', 'capitalism', 'public_health'). The sample output shows
# a tweet tagged #MAGA labelled as a denier although its text reads as critical
# of Trump - verify label quality before trusting model scores.
denier_tags = ['climatechangeisfalse', 'climatechangenotreal', 'climatechangehoax', 
               'globalwarminghoax', 'tcot', 'ccot', 'tlot', 'pjnet', 'rednationrising', 'votered', 
               'libtard', 'libtards', 'maga']

believer_tags = ['climatechangeisreal', 'actonclimate', 'extinctionrebellion', 'climateemergency', 
                  'climateactionnow', 'capitalism', 'public_health', 'climateaction', 'humanityextinction',
                'activism', 'noplanetb']

In [13]:
# Label every tweet from its hashtags:
#   True  -> at least one believer tag and no denier tag
#   False -> at least one denier tag and no believer tag
#   None  -> neither kind, or both kinds (ambiguous)
# `believer` / `denier` / `unsure` collect the row index labels of each group;
# `believe_series` holds one label per row in iteration order, so it can be
# attached to `df` positionally afterwards.
believer = []
denier = []
unsure = []
believe_series = []
count = 0  # not used in this cell; kept in case a later cell reads it

# Sets give O(1) membership tests instead of scanning the tag lists per hashtag.
_denier_tag_set = set(denier_tags)
_believer_tag_set = set(believer_tags)

# Series.iteritems() was removed in pandas 2.0; .items() is the equivalent.
for idx, row in df['hashtags'].items():
    lowered_tags = [tag.lower() for tag in row]
    has_deny = any(tag in _denier_tag_set for tag in lowered_tags)
    # A tag present in both lists counts as a denier tag only.
    has_believe = any(
        tag in _believer_tag_set and tag not in _denier_tag_set
        for tag in lowered_tags
    )

    if has_believe and not has_deny:
        believer.append(int(idx))
        believe_series.append(True)
    elif has_deny and not has_believe:
        denier.append(int(idx))
        believe_series.append(False)
    else:
        unsure.append(int(idx))
        believe_series.append(None)

In [14]:
# Attach the labels as a new `believer` column. `believe_series` is a plain list
# built in the iteration order of df['hashtags'], so it lines up with df's rows
# positionally.
df = df.assign(believer = believe_series)

In [15]:
# Training set: only tweets with an unambiguous label (rows labelled None are dropped).
to_train = df.dropna(subset=['believer'])

In [16]:
to_train

Unnamed: 0,created_at,id,full_text,truncated,in_reply_to_screen_name,is_quote_status,retweet_count,favorite_count,lang,retweeted_status,...,locations,follower_counts,user_created_at,verified,statuses_counts,location,country,hashtags,withheld_in_countries,believer
65,2018-08-13 10:46:39,1028955987191848961,Latest article from Vanguard on the Federal Go...,False,,False,0,0,en,,...,Australia,576,2018-08-10,False,575,,,"[fraud, GreatBarrierReef, climatechange, capit...",,True
77,2018-08-13 10:48:14,1028956388339404801,#ClimateChangeIsReal - ‘researchers projected ...,False,,True,0,0,en,,...,"Scotland, United Kingdom",668,2016-03-09,False,3283,,,[ClimateChangeIsReal],,True
80,2018-08-13 10:48:34,1028956469763424258,#actonclimate Apple is now a $1 trillion tech ...,False,,False,1,0,en,,...,,25054,2013-11-09,False,129289,,,[actonclimate],,True
86,2018-08-13 10:49:11,1028956628064894976,What is geoengineering? https://t.co/wwdORYyv3...,False,,False,1,1,nl,,...,Globally l Planet Earth,46745,2016-05-13,False,128032,,,"[climatechange, climateaction, environment, en...",,True
112,2018-08-13 10:52:38,1028957494910115840,"As America Burns from #ClimateChange, #Trump O...",False,,False,0,0,en,,...,Montréal Québec Canada,885,2011-07-26,False,46543,,,"[ClimateChange, Trump, uspoli, GOP, MAGA]",,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970127,2018-11-27 02:42:28,1067247252601749505,https://t.co/aODTrYRymY #ClimateAction #Climat...,False,,False,0,0,und,,...,"Florida, USA",3562,2009-12-21,False,20657,,,"[ClimateAction, ClimateChangeisReal, ClimateCh...",,True
1970131,2018-11-27 02:42:33,1067247276626710529,@jaketapper @PamelaBrownCNN @TheLeadCNN Under ...,False,jaketapper,False,0,0,en,,...,"Texas, USA",125,2015-03-02,False,6700,,,[ClimateChangeHoax],,False
1970135,2018-11-27 02:42:40,1067247302904086528,@StephenLeahy I'd even take mushy peas over ba...,False,StephenLeahy,False,0,1,en,,...,"Treaty One Territory, Winnipeg",829,2011-08-18,False,9970,,,[ActOnClimate],,True
1970205,2018-11-27 02:44:09,1067247677434408962,#NoPlanetB #ClimateChangeIsReal #ClimateAction...,False,,True,0,0,en,,...,UK,541,2009-03-30,False,42375,,,"[NoPlanetB, ClimateChangeIsReal, ClimateAction...",,True


In [17]:
to_train['locations'].value_counts()

                                 13823
Globally l Planet Earth          11973
Tampere, Finland                  9519
Right Here......                  3128
United States                     2699
                                 ...  
Boston, MA     Technygal Blog        1
the Tree of Life                     1
Vossestrand,Norway                   1
Dijon | Marseille | NYC              1
Around you                           1
Name: locations, Length: 9481, dtype: int64

In [67]:
# Patterns are compiled once instead of being looked up on every call.
_MENTION_RE = re.compile(r'[@][\w]+')
_URL_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
_WHITESPACE_RE = re.compile(r'\s+')


def re_clean(tweet):
    """Normalise the raw text of a tweet.

    Steps, in order: lower-case the text, remove @mentions, remove '#'
    characters (the hashtag word itself is kept), remove http(s) URLs,
    collapse every run of whitespace (spaces, tabs, newlines) into a single
    space, and strip leading/trailing whitespace.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    tweet = tweet.lower()
    tweet = _MENTION_RE.sub('', tweet)
    # '#' is removed before URLs on purpose: the URL pattern does not match '#',
    # so stripping it first lets a URL with a fragment be removed in one piece.
    tweet = tweet.replace('#', '')
    tweet = _URL_RE.sub('', tweet)
    # '\s+' collapses whitespace runs of any length; the previous '\s{2,5}'
    # left a double space behind for runs of six or more characters.
    tweet = _WHITESPACE_RE.sub(' ', tweet)

    return tweet.strip()

In [68]:
n = -3

print(re_clean(to_train.iloc[n]['full_text']))

to_train.iloc[n]['full_text']

i'd even take mushy peas over baked brits. actonclimate


"@StephenLeahy I'd even take mushy peas over baked Brits.\n#ActOnClimate"

In [91]:
to_train.shape

(113726, 22)

In [69]:
# Clean the text of every labelled tweet; the original row index is preserved.
cleaned_tweets = to_train['full_text'].apply(re_clean)

In [70]:
cleaned_tweets

65         latest article from vanguard on the federal go...
77         climatechangeisreal - ‘researchers projected t...
80         actonclimate apple is now a $1 trillion tech b...
86         what is geoengineering? climatechange climatea...
112        as america burns from climatechange, trump off...
                                 ...                        
1970127    climateaction climatechangeisreal climatechang...
1970131    under what us president was that report initia...
1970135    i'd even take mushy peas over baked brits. act...
1970205    noplanetb climatechangeisreal climateactionnow...
1970245    the concentration of carbon dioxide in the atm...
Name: full_text, Length: 113726, dtype: object

In [71]:
# Target vector. The `believer` column was built from True/False/None values, so
# it has object dtype even after the None rows are dropped. scikit-learn
# classifiers reject object-dtype non-string labels ("Unknown label type"), so
# cast to a real boolean dtype here.
y = to_train['believer'].astype(bool)

In [79]:
# Tokens to drop during tokenisation: English stop words, single punctuation
# characters, and the hashtags that were used to create the labels (so the
# label-defining tags are not kept as tokens).
stopwords_list = stopwords.words('english') + list(string.punctuation) + denier_tags + believer_tags 

In [80]:
def process_tweets(tweet):
    """Tokenise a tweet into lower-cased words, dropping anything in `stopwords_list`.

    A token is a run of ASCII letters, optionally followed by an apostrophe and
    more lower-case letters (e.g. "don't").
    """
    raw_tokens = nltk.regexp_tokenize(tweet, r"([a-zA-Z]+(?:'[a-z]+)?)")
    lowered_tokens = (token.lower() for token in raw_tokens)
    return [token for token in lowered_tokens if token not in stopwords_list]

In [101]:
# One token list per cleaned tweet, in the same order as `cleaned_tweets`.
token_data = [process_tweets(tweet) for tweet in cleaned_tweets]

In [111]:
token_data[12]

['globalwarming', 'firenado', 'seems', 'really', 'need']

In [105]:
lemmatizer = WordNetLemmatizer()

# Lemmatise every token of every tweet, keeping the per-tweet list structure.
# NOTE(review): lemmatize() is called without a part-of-speech argument, and the
# sample at index 12 comes back unchanged (e.g. 'seems' is not reduced). This
# suggests verbs are not being lemmatised - confirm whether POS tags are needed.
processed_data = [[lemmatizer.lemmatize(token) for token in tweet] for tweet in token_data]

In [113]:
processed_data[12]

['globalwarming', 'firenado', 'seems', 'really', 'need']

In [117]:
lemmatizer.lemmatize('play')

'play'

In [83]:
# Vocabulary: the set of distinct lemmatised tokens across all tweets.
all_vocab = {token for tweet_tokens in processed_data for token in tweet_tokens}

In [84]:
# Flat list of every token occurrence (duplicates kept), in tweet order.
tweets_concat = [token for tweet_tokens in processed_data for token in tweet_tokens]

In [87]:
tweet_freqdist = FreqDist(tweets_concat)

## TF-IDF Vectorizer

In [94]:
# Hold out a test set (default 25%).
# random_state makes the split - and therefore every downstream score -
# reproducible across kernel restarts; stratify keeps the believer/denier
# proportions the same in both halves.
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_tweets, y, random_state=42, stratify=y
)

In [96]:
# The labels were derived from specific hashtags, and `cleaned_tweets` still
# contains those hashtag words (only the '#' was stripped). Vectorising them
# would leak the target into the features, so they are excluded here via
# `stopwords_list`, which holds the label-defining tags plus ordinary English
# stop words.
vectorizer = TfidfVectorizer(stop_words=stopwords_list)

# Fit the vocabulary and IDF weights on the training split only, then apply the
# fitted transform to the test split.
tf_idf_data_train = vectorizer.fit_transform(X_train)
tf_idf_data_test = vectorizer.transform(X_test)