# SETUP

General Libraries

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sci
import numpy as np
import numpy.linalg as linalg
import pandas as pd
import json
import re
import ast
from ast import literal_eval
import datetime
import math
from numpy import sqrt

pd.set_option('display.max_colwidth', None)

Sentiment Analyzer, Text Manipulation, Computing Composite Score

In [None]:
# !pip uninstall vaderSentiment
!pip install vader-multi

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Module-level sentiment analyzer built from the SentimentIntensityAnalyzer
# class imported from the vaderSentiment package just above.
analyzer = SentimentIntensityAnalyzer()
# analyzer.polarity_scores("VADER is smart, handsome, and funny.")

import nltk
# NLTK resources fetched at runtime. TwitterPreprocessor.remove_stopwords calls
# nltk.word_tokenize and stopwords.words('english'); presumably 'punkt' and
# 'stopwords' are the data packages backing those two calls.
nltk.download('punkt')
nltk.download('stopwords')
# NOTE(review): the only nltk-based VADER import in this cell is commented out,
# so this lexicon download looks unused here - confirm before removing.
nltk.download("vader_lexicon")
# from nltk.sentiment.vader import SentimentIntensityAnalyzer

import string
import nltk # for text manipulation
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

import string

from nltk.corpus import stopwords
from nltk import re

# Lower / upper bounds referenced by is_year(); both are used with strict `<`,
# so the bounds themselves are excluded.
MIN_YEAR = 1900
MAX_YEAR = 2100


def get_url_patern():
    """Build the compiled regex used to locate URLs in tweet text.

    The expression has three alternatives: an http(s) URL with a multi-character
    host label, an http(s) URL with a single-character host label, and a bare
    ``www.`` address with a single-character host label.
    """
    url_regex = (
        r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))'
        r'[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})'
    )
    return re.compile(url_regex)


def get_emojis_pattern():
    """Build the compiled regex that matches the emoji ranges handled here.

    The wide (UCS-4) character-class form is tried first; if the regex engine
    rejects it with ``re.error``, the surrogate-pair (UCS-2) form is compiled
    instead.
    """
    wide_ranges = u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])'
    surrogate_ranges = u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])'
    try:
        return re.compile(wide_ranges)
    except re.error:
        return re.compile(surrogate_ranges)


def get_hashtags_pattern():
    """Build the compiled regex for hashtags: a ``#`` plus any trailing word characters."""
    hashtag_regex = r'#\w*'
    return re.compile(hashtag_regex)


def get_single_letter_words_pattern():
    """Build the compiled regex for stand-alone single word characters.

    A character matches only when it is neither preceded nor followed by a
    word character or a hyphen, so the pieces of ``b-c`` are left alone.
    """
    lone_char_regex = r'(?<![\w\-])\w(?![\w\-])'
    return re.compile(lone_char_regex)


def get_blank_spaces_pattern():
    """Build the compiled regex for whitespace runs of two or more, or a single tab."""
    blank_regex = r'\s{2,}|\t'
    return re.compile(blank_regex)


def get_twitter_reserved_words_pattern():
    """Build the compiled regex for the Twitter markers RT, FAV and VIA.

    Each marker is matched in all-upper or all-lower case and only as a whole
    word. The word boundaries matter: without them the pattern also strips the
    letters "rt", "fav" and "via" from inside ordinary words (for example
    "party" would become "pay").

    The single capture group is kept so existing callers that inspect
    ``match.group(1)`` continue to work.
    """
    return re.compile(r'\b(RT|rt|FAV|fav|VIA|via)\b')


def get_mentions_pattern():
    """Build the compiled regex for @-mentions: an ``@`` plus any trailing word characters."""
    mention_regex = r'@\w*'
    return re.compile(mention_regex)


def is_year(text):
    """Return True when ``text`` is a 3- or 4-character number inside the year range.

    The numeric value of ``text`` must lie strictly between ``MIN_YEAR`` and
    ``MAX_YEAR``. (The previous version compared ``len(text)`` against those
    bounds, so it could never return True.)

    Args:
        text: Candidate token as a string.

    Returns:
        bool: True if the token parses as an integer in the open interval
        (MIN_YEAR, MAX_YEAR); False otherwise, including when it is not
        parseable as an integer.
    """
    if len(text) not in (3, 4):
        return False
    try:
        value = int(text)
    except ValueError:
        # str.isnumeric() accepts characters (e.g. fractions) that int() rejects.
        return False
    return MIN_YEAR < value < MAX_YEAR


class TwitterPreprocessor:
    """Chainable cleaner for the text of a single tweet.

    Every cleaning method rewrites ``self.text`` in place and returns ``self``
    so calls can be chained; read the result from the ``text`` attribute.
    """

    def __init__(self, text: str):
        self.text = text

    def fully_preprocess(self):
        """Run the standard cleaning pipeline and return ``self``.

        Order: URLs, mentions, hashtags, Twitter reserved words, punctuation,
        single-letter words, blank spaces, stop words, numbers. Lower-casing is
        not part of this pipeline.
        """
        return (
            self.remove_urls()
            .remove_mentions()
            .remove_hashtags()
            .remove_twitter_reserved_words()
            .remove_punctuation()
            .remove_single_letter_words()
            .remove_blank_spaces()
            .remove_stopwords()
            .remove_numbers()
        )

    def remove_urls(self):
        """Delete every match of the URL pattern from the text."""
        self.text = re.sub(pattern=get_url_patern(), repl='', string=self.text)
        return self

    def remove_punctuation(self):
        """Delete every character listed in ``string.punctuation``."""
        self.text = self.text.translate(str.maketrans('', '', string.punctuation))
        return self

    def remove_mentions(self):
        """Delete every match of the @-mention pattern from the text."""
        self.text = re.sub(pattern=get_mentions_pattern(), repl='', string=self.text)
        return self

    def remove_hashtags(self):
        """Delete every match of the hashtag pattern from the text."""
        self.text = re.sub(pattern=get_hashtags_pattern(), repl='', string=self.text)
        return self

    def remove_twitter_reserved_words(self):
        """Delete every match of the Twitter reserved-words pattern."""
        self.text = re.sub(pattern=get_twitter_reserved_words_pattern(), repl='', string=self.text)
        return self

    def remove_single_letter_words(self):
        """Delete every match of the single-letter-word pattern."""
        self.text = re.sub(pattern=get_single_letter_words_pattern(), repl='', string=self.text)
        return self

    def remove_blank_spaces(self):
        """Replace every match of the blank-spaces pattern with one space."""
        self.text = re.sub(pattern=get_blank_spaces_pattern(), repl=' ', string=self.text)
        return self

    def remove_stopwords(self, extra_stopwords=None):
        """Tokenize the text and drop English stop words.

        Args:
            extra_stopwords: Optional collection of additional tokens to drop.

        The comparison is case-sensitive: tokens are checked exactly as
        ``nltk.word_tokenize`` returns them. Surviving tokens are re-joined
        with single spaces.
        """
        if extra_stopwords is None:
            extra_stopwords = []
        stop_words = set(stopwords.words('english'))

        kept_tokens = [
            token
            for token in nltk.word_tokenize(self.text)
            if token not in stop_words and token not in extra_stopwords
        ]
        self.text = ' '.join(kept_tokens)
        return self

    def remove_numbers(self, preserve_years=False):
        """Drop space-separated tokens that are purely numeric.

        Args:
            preserve_years: When True, numeric tokens for which ``is_year``
                returns True are kept.

        A new list is built rather than calling ``list.remove`` on the list
        being iterated; removing during iteration skipped the token following
        each removed one, so runs of consecutive numbers were only partly
        removed.
        """
        kept_tokens = []
        for token in self.text.split(' '):
            if token.isnumeric() and not (preserve_years and is_year(token)):
                continue
            kept_tokens.append(token)

        self.text = ' '.join(kept_tokens)
        return self

    def lowercase(self):
        """Convert the text to lower case."""
        self.text = self.text.lower()
        return self



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


# DATA CLEANING

In [None]:
# Load the combined tweet export from Google Drive into `master`.
# NOTE(review): the cell output below echoes this line, which looks like a
# pandas warning raised while reading the file - confirm which one.
master = pd.read_csv('/content/drive/MyDrive/Echo_Chamber_Datasets/master.csv')
# Preview the first rows (displayed as the cell output).
master.head()

  master = pd.read_csv('/content/drive/MyDrive/Echo_Chamber_Datasets/master.csv')


Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,quote_url,video,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date
0,1417693.0,1417693.0,1166600000000.0,20/12/2006,10:36:26,E. Africa Standard Time,16053.0,sinenox,Cat,,...,,0,,,,,,,"[{'user_id': '16053', 'username': 'Sinenox'}]",
1,3306733.0,3306733.0,1169170000000.0,19/01/2007,04:00:47,E. Africa Standard Time,621543.0,bbchealth,BBC Health News,,...,,0,,,,,,,"[{'user_id': '621543', 'username': 'bbchealth'}]",
2,5042833.0,5042833.0,1170370000000.0,02/02/2007,01:17:44,E. Africa Standard Time,614623.0,clith,Reid,,...,,0,,,,,,,"[{'user_id': '614623', 'username': 'clith'}]",
3,5232473.0,5232473.0,1170510000000.0,03/02/2007,17:27:30,E. Africa Standard Time,68453.0,vivdora,Delia 💚,,...,,0,,,,,,,"[{'user_id': '68453', 'username': 'Vivdora'}]",
4,5312417.0,5312417.0,1170610000000.0,04/02/2007,19:18:33,E. Africa Standard Time,717363.0,luxalptraum,Lux 📝 Alptraum,,...,,0,,,,,,,"[{'user_id': '717363', 'username': 'LuxAlptraum'}]",


Preliminary helper function: extracts the year from a date string.

In [None]:
from datetime import datetime
# in this case, date is in a string format, NOT datetime.
def get_year(date):
    """Return the year of a ``YYYY-MM-DD`` date string as an int.

    ``date`` must be a string; it is parsed with ``datetime.strptime``.
    """
    parsed = datetime.strptime(date, '%Y-%m-%d')
    return parsed.year


# Quick sanity check of the helper on a sample date.
dat = '2019-11-30'
get_year(dat)

2019

Split the data set by year, then clean a subset of the years (2015-2019).

In [None]:
######################################################### SPLITTING BY YEAR
# Work on a copy so `master` keeps its original `date` column; the copy gets a
# parsed `date` plus a four-digit `Year` string used for the per-year slices.
master_copy = master.copy()
master_copy['date'] = pd.to_datetime(master_copy['date'])
master_copy['Year'] = master_copy['date'].dt.strftime('%Y')

# One frame per calendar year, 2010 through 2019.
master_2010 = master_copy.loc[master_copy['Year'] == '2010']
master_2011 = master_copy.loc[master_copy['Year'] == '2011']
master_2012 = master_copy.loc[master_copy['Year'] == '2012']
master_2013 = master_copy.loc[master_copy['Year'] == '2013']
master_2014 = master_copy.loc[master_copy['Year'] == '2014']
master_2015 = master_copy.loc[master_copy['Year'] == '2015']
master_2016 = master_copy.loc[master_copy['Year'] == '2016']
master_2017 = master_copy.loc[master_copy['Year'] == '2017']
master_2018 = master_copy.loc[master_copy['Year'] == '2018']
master_2019 = master_copy.loc[master_copy['Year'] == '2019']

# Persist every yearly slice to Google Drive, in year order.
for yearly_frame, yearly_path in (
    (master_2010, '/content/drive/My Drive/Echo_Chamber_Datasets/master_2010.csv'),
    (master_2011, '/content/drive/My Drive/Echo_Chamber_Datasets/master_2011.csv'),
    (master_2012, '/content/drive/My Drive/Echo_Chamber_Datasets/master_2012.csv'),
    (master_2013, '/content/drive/My Drive/Echo_Chamber_Datasets/master_2013.csv'),
    (master_2014, '/content/drive/My Drive/Echo_Chamber_Datasets/master_2014.csv'),
    (master_2015, '/content/drive/My Drive/Echo_Chamber_Datasets/master_2015.csv'),
    (master_2016, '/content/drive/My Drive/Echo_Chamber_Datasets/master_2016.csv'),
    (master_2017, '/content/drive/My Drive/Echo_Chamber_Datasets/master_2017.csv'),
    (master_2018, '/content/drive/My Drive/Echo_Chamber_Datasets/master_2018.csv'),
    (master_2019, '/content/drive/My Drive/Echo_Chamber_Datasets/master_2019.csv'),
):
    yearly_frame.to_csv(yearly_path)


############################################### TAKE SUBSET OF YEARS TO CLEAN
# Stack 2015-2019 and renumber the rows; the old row labels come back from
# reset_index() as an `index` column, which is then discarded.
df = pd.concat([master_2015, master_2016, master_2017, master_2018, master_2019])
df = df.reset_index()
df = df.drop(columns='index')


##################################################################### CLEANING

from datetime import datetime
df_clean = df.copy()

###################### STEP 0: EXTRACT YEAR

# `df` is built from `master_copy`, whose `date` column was already converted
# with pd.to_datetime, so the string-only operations used here before
# (`.str.len()` and strptime via get_year) fail on it. Normalising through
# pd.to_datetime works for both datetime64 values and 'YYYY-MM-DD' strings
# (e.g. when the per-year CSVs are reloaded); unparseable dates become NaT.
parsed_dates = pd.to_datetime(df_clean['date'], errors='coerce')

# only take tweets with appropriate dates
has_valid_date = parsed_dates.notna()
df_clean = df_clean[has_valid_date].copy()

# extract the year (plain integers, as get_year would return)
df_clean['year'] = parsed_dates[has_valid_date].dt.year.astype(int)

# relevant columns only
df_clean = df_clean[['username', 'reply_to', 'mentions', 'tweet', 'year']]

################ STEP 1: SENTIMENTS
# Clean every tweet with the full TwitterPreprocessor pipeline and store the
# result in a new column. Lists are assigned positionally, so the order of
# `clean_tweets` matches the rows of df_clean.
tweets = df_clean['tweet'].apply(str)
clean_tweets = [TwitterPreprocessor(tweet).fully_preprocess().text for tweet in tweets]

df_clean['clean_tweets'] = clean_tweets

# create analyzer object
analyzer = SentimentIntensityAnalyzer()

# compound polarity score for every cleaned tweet, in row order
scores = [analyzer.polarity_scores(tweet)['compound'] for tweet in df_clean['clean_tweets']]

# append sentiment to df
# Assign the list directly: wrapping it in pd.DataFrame(scores) gives it a fresh
# 0..n-1 index, and pandas aligns on index when assigning a DataFrame/Series.
# Whenever df_clean's index has gaps (rows were filtered out earlier) that put
# scores on the wrong rows and left NaN at the end.
df_clean['sentiment'] = scores

#################### STEP 2: MENTIONS
# Build the mention edge list: one row per (author, mentioned account).
copy2 = df_clean.copy()

# Each `mentions` cell is turned into a Python object via literal_eval
# (assumes str(cell) is a valid literal, e.g. a list - TODO confirm), then
# exploded so every element gets its own row.
copy2['mentions'] = copy2['mentions'].apply(lambda raw: ast.literal_eval(str(raw)))
copy2 = copy2.explode("mentions")
copy2 = copy2.reset_index()

# Lower-case both endpoints so the comparisons below ignore case.
copy2['username'] = copy2['username'].str.lower()
copy2['interact_with'] = copy2['mentions'].str.lower()

mention_df = copy2[['username', 'interact_with', 'clean_tweets', 'sentiment', 'year']].replace(np.nan, '', regex=True)

###### extra cleaning: mentions
users_m = mention_df['username'].unique().tolist()
interacted_m = mention_df['interact_with'].unique().tolist()

# Keep a row only if all of the following hold:
#   - the mentioned account also appears as an author (someone who tweets)
#   - neither endpoint is empty, and the author is not the literal '[]'
#   - it is not a self-loop
#   - the sentiment value is present (not NaN and not the '' placeholder)
mention_df = mention_df[
    mention_df['interact_with'].isin(users_m)
    & (mention_df['username'] != '')
    & (mention_df['username'] != '[]')
    & (mention_df['interact_with'] != '')
    & (mention_df['username'] != mention_df['interact_with'])
    & ~mention_df['sentiment'].isna()
    & (mention_df['sentiment'] != '')
]

##################### STEP 3: REPLIES
# Build the reply edge list: one row per (author, replied-to account).
copy1 = df_clean.copy()

# Each `reply_to` cell is turned into a Python object via literal_eval
# (assumes str(cell) is a valid literal, e.g. a list of dicts with a
# 'username' key - TODO confirm), then exploded to one row per element.
copy1['reply_to'] = copy1['reply_to'].apply(lambda raw: ast.literal_eval(str(raw)))
copy1 = copy1.explode('reply_to')
copy1 = copy1.reset_index()

# Lower-case both endpoints so the comparisons below ignore case.
copy1['username'] = copy1['username'].str.lower()
copy1['interact_with'] = copy1['reply_to'].str['username'].str.lower()

reply_df = copy1[['username', 'interact_with', 'clean_tweets', 'sentiment', 'year']].replace(np.nan, '', regex=True)

###### extra cleaning: replies
users_r = reply_df['username'].unique().tolist()
interacted_r = reply_df['interact_with'].unique().tolist()

# Keep a row only if all of the following hold:
#   - the replied-to account also appears as an author (someone who tweets)
#   - neither endpoint is empty, and the author is not the literal '[]'
#   - it is not a self-loop
#   - the sentiment value is present (not NaN and not the '' placeholder)
reply_df = reply_df[
    reply_df['interact_with'].isin(users_r)
    & (reply_df['username'] != '')
    & (reply_df['username'] != '[]')
    & (reply_df['interact_with'] != '')
    & (reply_df['username'] != reply_df['interact_with'])
    & ~reply_df['sentiment'].isna()
    & (reply_df['sentiment'] != '')
]


# SAVE TO GOOGLE DRIVE
# Write the mention and reply edge lists (2015-2019) as CSV files.
for edge_frame, edge_path in (
    (mention_df, '/content/drive/MyDrive/Echo_Chamber_Datasets/mention_15_19.csv'),
    (reply_df, '/content/drive/MyDrive/Echo_Chamber_Datasets/reply_15_19.csv'),
):
    edge_frame.to_csv(edge_path)