# Import Packages

In [1]:
import pandas as pd
import numpy as np
import pickle
import re

import nltk
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import spacy

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

In [2]:
pickle_in = open('data/pickle_jar/cleaned.pkl', 'rb')
tweets = pickle.load(pickle_in)
pickle_in.close()

tweets.reset_index(inplace=True)
tweets.drop(columns = 'index', inplace=True)

In [3]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1970258 entries, 0 to 1970257
Data columns (total 21 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   created_at               datetime64[ns]
 1   id                       int64         
 2   full_text                object        
 3   truncated                bool          
 4   in_reply_to_screen_name  object        
 5   is_quote_status          bool          
 6   retweet_count            int64         
 7   favorite_count           int64         
 8   lang                     object        
 9   retweeted_status         object        
 10  names                    object        
 11  screen_names             object        
 12  locations                object        
 13  follower_counts          int64         
 14  user_created_at          object        
 15  verified                 bool          
 16  statuses_counts          int64         
 17  location                 ob

In [4]:
tweets.shape

(1970258, 21)

In [5]:
tweets.isna().sum()

created_at                       0
id                               0
full_text                        0
truncated                        0
in_reply_to_screen_name    1525932
is_quote_status                  0
retweet_count                    0
favorite_count                   0
lang                             0
retweeted_status           1970258
names                            0
screen_names                     0
locations                        0
follower_counts                  0
user_created_at                  0
verified                         0
statuses_counts                  0
location                   1889581
country                    1889581
hashtags                   1234431
withheld_in_countries      1970213
dtype: int64

In [6]:
location_perc = 100 * tweets.dropna(subset =['location']).shape[0] / tweets.shape[0]
print(f'{round(location_perc, 3)}% of the dataset has location data')

4.095% of the dataset has location data


In [7]:
df = tweets.dropna(subset=['hashtags'])

In [8]:
hashtags_all = []
for row in df['hashtags']:
    for item in row:
        hashtags_all.append(item.lower())

In [9]:
pd.Series(hashtags_all).value_counts()

climatechange                403207
globalwarming                 55929
climateaction                 50345
environment                   48337
climate                       38087
                              ...  
notshower                         1
lasallianswithoutlimits           1
floodfutures                      1
republicanskillanimals            1
internationalpolarbearday         1
Length: 166071, dtype: int64

In [10]:
unique_hashtags = list(pd.Series(hashtags_all).value_counts().index)

In [11]:
unique_hashtags[:10]

['climatechange',
 'globalwarming',
 'climateaction',
 'environment',
 'climate',
 'actonclimate',
 'energy',
 'climatechangeisreal',
 'auspol',
 'sustainability']

In [12]:
denier_tags = ['climatechangeisfalse', 'climatechangenotreal', 'climatechangehoax', 
               'globalwarminghoax', 'tcot', 'ccot', 'tlot', 'pjnet', 'rednationrising', 'votered', 
               'libtard', 'libtards', 'maga']

believer_tags = ['climatechangeisreal', 'actonclimate', 'extinctionrebellion', 'climateemergency', 
                  'climateactionnow', 'capitalism', 'public_health', 'climateaction', 'humanityextinction',
                'activism', 'noplanetb', 'savetheplanet']

In [13]:
believer = []
denier = []
unsure = []
believe_series = []
count = 0
for idx, row in df['hashtags'].iteritems():
    believe = 0
    deny = 0 
    for tag in row:
        if tag.lower() in denier_tags:
            deny += 1
        elif tag.lower() in believer_tags:
            believe += 1
    if (believe > 0) and (deny == 0):
        believer.append(int(idx))
        believe_series.append(1)
    elif (believe == 0) and (deny > 0):
        denier.append(int(idx))
        believe_series.append(0)
    else:
        unsure.append(int(idx))
        believe_series.append(None)

In [14]:
df = df.assign(believer = believe_series)

In [15]:
to_train = df.dropna(subset=['believer'])

In [16]:
to_train['locations'].value_counts()

                               14027
Globally l Planet Earth        11973
Tampere, Finland                9519
Right Here......                3128
United States                   2721
                               ...  
SF | LA | SAC | SD | SB            1
Hudson Valley + LI Sound           1
Towson, Maryland                   1
Sicilia                            1
Dallas, TX - Southeast Asia        1
Name: locations, Length: 9663, dtype: int64

In [27]:
def re_clean(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r'[@#][\w]+','', tweet)
#     tweet = re.sub(r'[#]','', tweet)
    tweet = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', tweet)
    tweet = re.sub(r'\s{2,5}', ' ', tweet)
    tweet = re.sub(r'\n', ' ', tweet)

    return tweet.strip()

In [28]:
def tweet_to_vec(tweet):
    tweet = re_clean(tweet)
    

In [29]:
n = -3

print(re_clean(to_train.iloc[n]['full_text']))

to_train.iloc[n]['full_text']

i'd even take mushy peas over baked brits.


"@StephenLeahy I'd even take mushy peas over baked Brits.\n#ActOnClimate"

In [30]:
cleaned_tweets = to_train['full_text'].apply(lambda x: re_clean(x))

In [31]:
y = to_train['believer']

In [32]:
stopwords_list = stopwords.words('english') + list(string.punctuation) + denier_tags + believer_tags

In [33]:
def process_tweets(tweet):
    tok = nltk.regexp_tokenize(tweet, r"([a-zA-Z]+(?:'[a-z]+)?)")
    return [word.lower() for word in tok if word.lower() not in stopwords_list]

In [34]:
token_data = list(map(process_tweets, cleaned_tweets))

In [35]:
token_data[12]

['seems', 'really', 'need']

In [36]:
lemmatizer = WordNetLemmatizer()

processed_data = [[lemmatizer.lemmatize(token) for token in tweet] for tweet in token_data]

# Prints "This sentence wa transformed using WordNet Lemmatizer"

In [37]:
processed_data[739]

['thomas', 'dolby', 'blinded', 'science']

In [38]:
all_vocab = set()
for tweet_token in processed_data:
    all_vocab.update(tweet_token)

In [39]:
tweets_concat = []
for tweet in processed_data:
    tweets_concat += tweet

In [40]:
tweet_freqdist = FreqDist(tweets_concat)

## TF-IDF Vectorizer

In [41]:
X_train, X_test, y_train, y_test = train_test_split(cleaned_tweets, y, stratify=y)

In [42]:
vectorizer = TfidfVectorizer()
tf_idf_data_train = vectorizer.fit_transform(X_train)
tf_idf_data_test = vectorizer.transform(X_test)

# Classifier

In [43]:
rf_classifier = RandomForestClassifier()

In [44]:
rf_classifier.fit(tf_idf_data_train, y_train)

RandomForestClassifier()

In [45]:
y_pred_train = rf_classifier.predict(tf_idf_data_train)
y_pred_test = rf_classifier.predict(tf_idf_data_test)

In [46]:
accuracy_score(y_test, y_pred_test)

0.9429774205893608

In [47]:
confusion_matrix(y_test, y_pred_test)

array([[  286,  1616],
       [   23, 26818]])

## Apply Classifier to remainder dataset

In [48]:
X_test.iloc[4]

'remember the clean power plan? well trump wants to reverse course because he believes that is a hoax.'