# Import Packages

In [126]:
import pandas as pd
import numpy as np
import pickle
import re

import nltk
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import spacy

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

In [89]:
pickle_in = open('data/pickle_jar/cleaned.pkl', 'rb')
tweets = pickle.load(pickle_in)
pickle_in.close()

tweets.reset_index(inplace=True)
tweets.drop(columns = 'index', inplace=True)

In [90]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1970258 entries, 0 to 1970257
Data columns (total 21 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   created_at               datetime64[ns]
 1   id                       int64         
 2   full_text                object        
 3   truncated                bool          
 4   in_reply_to_screen_name  object        
 5   is_quote_status          bool          
 6   retweet_count            int64         
 7   favorite_count           int64         
 8   lang                     object        
 9   retweeted_status         object        
 10  names                    object        
 11  screen_names             object        
 12  locations                object        
 13  follower_counts          int64         
 14  user_created_at          object        
 15  verified                 bool          
 16  statuses_counts          int64         
 17  location                 ob

In [91]:
tweets.shape

(1970258, 21)

In [92]:
tweets.isna().sum()

created_at                       0
id                               0
full_text                        0
truncated                        0
in_reply_to_screen_name    1525932
is_quote_status                  0
retweet_count                    0
favorite_count                   0
lang                             0
retweeted_status           1970258
names                            0
screen_names                     0
locations                        0
follower_counts                  0
user_created_at                  0
verified                         0
statuses_counts                  0
location                   1889581
country                    1889581
hashtags                   1234431
withheld_in_countries      1970213
dtype: int64

In [93]:
location_perc = 100 * tweets.dropna(subset =['location']).shape[0] / tweets.shape[0]
print(f'{round(location_perc, 3)}% of the dataset has location data')

4.095% of the dataset has location data


In [94]:
df = tweets.dropna(subset=['hashtags'])

In [95]:
hashtags_all = []
for row in df['hashtags']:
    for item in row:
        hashtags_all.append(item.lower())

In [97]:
unique_hashtags = list(pd.Series(hashtags_all).value_counts().index)

In [98]:
unique_hashtags[:10]

['climatechange',
 'globalwarming',
 'climateaction',
 'environment',
 'climate',
 'actonclimate',
 'energy',
 'climatechangeisreal',
 'auspol',
 'sustainability']

In [99]:
denier_tags = ['climatechangeisfalse', 'climatechangenotreal', 'climatechangehoax', 
               'globalwarminghoax', 'tcot', 'ccot', 'tlot', 'pjnet', 'rednationrising', 'votered', 
               'libtard', 'libtards', 'maga']

believer_tags = ['climatechangeisreal', 'actonclimate', 'extinctionrebellion', 'climateemergency', 
                  'climateactionnow', 'capitalism', 'public_health', 'climateaction', 'humanityextinction',
                'activism', 'noplanetb', 'savetheplanet']

In [100]:
believer = []
denier = []
unsure = []
believe_series = []
count = 0
for idx, row in df['hashtags'].iteritems():
    believe = 0
    deny = 0 
    for tag in row:
        if tag.lower() in denier_tags:
            deny += 1
        elif tag.lower() in believer_tags:
            believe += 1
    if (believe > 0) and (deny == 0):
        believer.append(int(idx))
        believe_series.append(1)
    elif (believe == 0) and (deny > 0):
        denier.append(int(idx))
        believe_series.append(0)
    else:
        unsure.append(int(idx))
        believe_series.append(None)

In [101]:
df = df.assign(believer = believe_series)

In [102]:
to_train = df.dropna(subset=['believer'])

In [327]:
lemmatizer = WordNetLemmatizer()
stopwords_list = stopwords.words('english') + (list(string.punctuation) + 
                                               denier_tags + believer_tags + unique_hashtags[:100])

def re_clean(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r'[@][\w]+','', tweet)
    tweet = re.sub(r'[#]','', tweet)
    tweet = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', tweet)
    tweet = re.sub(r'\s{2,15}', ' ', tweet)
    tweet = re.sub(r'\n', ' ', tweet)
    tweet = re.sub(r'\s\s', ' ', tweet)
    tweet = tweet.strip()
    
    tok = nltk.regexp_tokenize(tweet, r"([a-zA-Z]+(?:'[a-z]+)?)")
    toks = [word.lower() for word in tok if word.lower() not in stopwords_list]
    lemma = [lemmatizer.lemmatize(token) for token in toks]
    
    return ' '.join(lemma)

In [328]:
def check_cleaned(df, n=None):
    if not n:
        n = np.random.randint(0, len(df) - 1)
    print('ORIGINAL:',df.iloc[n]['full_text'])
    print('\nCLEANED:', re_clean(df.iloc[n]['full_text']))
    
check_cleaned(to_train)

ORIGINAL: What is Sustainable Seafood? Learn more here: https://t.co/asJ9CxxnbI #climatechange #climateaction 
#environment 
#energy https://t.co/oO9ugGtoyz

CLEANED: seafood learn


In [329]:
process_tweets = to_train['full_text'].apply(lambda x: re_clean(x))

In [330]:
y = to_train['believer']

In [331]:

def process_tweets(tweet):
    tok = nltk.regexp_tokenize(tweet, r"([a-zA-Z]+(?:'[a-z]+)?)")
    toks = [word.lower() for word in tok if word.lower() not in stopwords_list]
    lemma = [lemmatizer.lemmatize(token) for token in toks]
    
    return ' '.join(lemma)

In [332]:
processed_tweets = cleaned_tweets.apply(lambda x: process_tweets(x))

In [333]:
token_data.iloc[12]

'globalwarming firenado seems really need'

## TF-IDF Vectorizer

In [334]:
processed_tweets

65         latest article vanguard federal government's f...
77         researcher projected share population exposed ...
80         apple trillion behemoth planet paying price en...
86                                                          
112        america burn official attend denial conference...
                                 ...                        
1970127                      environmentaljustice greenpeace
1970131               u president report initiated rest case
1970135                   i'd even take mushy pea baked brit
1970205    elon musk kinda dick prob v rich ppl amp robot...
1970245    concentration dioxide atmosphere reached avera...
Name: full_text, Length: 114971, dtype: object

In [335]:
cleaned_tweets.iloc[-2]

"noplanetb climatechangeisreal climateactionnow [elon musk is kinda a dick. prob. only v. rich ppl &amp;robots/slaves will get 2 colonise other planets. ignore rich orange fools w/bad combover &amp;rightwingers w/vested interests in fossil fuels. time's running out: listen to science]"

In [336]:
X_train, X_test, y_train, y_test = train_test_split(processed_tweets, y, stratify=y)

In [337]:
X_train

241715            wea nao bae iu go go facing solomonislands
326980                                 threaten availability
1180529                    look forward seeing advance group
391597     yet another disturbing one today's manorama pi...
1281744                                 low technology learn
                                 ...                        
1033881    thanks faith courageousconversations series co...
1499113                                 today large oilspill
1238133    absence president willing lead important ever ...
1550446                                          affect rest
898337     global inextricably synergistically linked un'...
Name: full_text, Length: 86228, dtype: object

In [338]:
pipeline = Pipeline([('vec', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier(class_weight='balanced'))])

In [339]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vec', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier(class_weight='balanced'))])

In [340]:
pipeline.score(X_test, y_test)

0.9277041366593606

In [341]:
vectorizer = TfidfVectorizer()
tf_idf_data_train = vectorizer.fit_transform(X_train)
tf_idf_data_test = vectorizer.transform(X_test)

# Classifier

In [342]:
rf_classifier = RandomForestClassifier(class_weight='balanced')

In [343]:
rf_classifier.fit(tf_idf_data_train, y_train)

RandomForestClassifier(class_weight='balanced')

In [344]:
y_pred_train = rf_classifier.predict(tf_idf_data_train)
y_pred_test = rf_classifier.predict(tf_idf_data_test)

In [345]:
accuracy_score(y_test, y_pred_test)

0.9278433009776293

In [346]:
confusion_matrix(y_test, y_pred_test)

array([[ 1222,   680],
       [ 1394, 25447]])

## Apply Classifier to remainder dataset

In [347]:
to_class = tweets[tweets['hashtags'].isna()]

In [348]:
processed_test = to_class['full_text'].apply(lambda x: re_clean(x))

In [349]:
check_cleaned(to_class)

ORIGINAL: good question. I'm going with 'more scared of the energy lobby than the insurance lobby'. Which is why DSA candidates have a plan that fuses climate and economy - the Green New Deal. (which is mentioned here, to be fair!) https://t.co/WHDAfweaIB

CLEANED: good question i'm going scared lobby insurance lobby dsa candidate plan fuse economy new deal mentioned fair


In [350]:
un_class_pred = pipeline.predict(processed_test)

In [351]:
to_class['clean_text'] = processed_test
to_class['believer_pred'] = un_class_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [352]:
def check_pred(df, n=None):
    if n == None:
        n = np.random.randint(0,len(df)-1)
    print(df.iloc[n]['clean_text'], '\n', df.iloc[n]['believer_pred'])

In [371]:
to_class_clean = to_class[to_class['in_reply_to_screen_name'].isna()]

check_pred(to_class_clean[to_class_clean['believer_pred'] == 0])

ValueError: low >= high