In [0]:
# Mount your google drive to the colab environment
from google.colab import drive
drive.mount('/content/gdrive')

# copy data
!cp '/content/gdrive/My Drive/Colab Notebooks/Twitter_Analysis/OLD_tweets_with_hashtag_AIDebate.txt' before_debate.txt
!cp '/content/gdrive/My Drive/Colab Notebooks/Twitter_Analysis/tweets_with_hashtag_AIDebate1.txt' after_debate.txt




# change enviroment variable of keras's backend to theano
import os; os.environ['KERAS_BACKEND'] = 'theano'

# get and install emotion predictor model
!git clone https://github.com/nikicc/twitter-emotion-recognition.git 
!ls # list of files

# go to the directory
%cd twitter-emotion-recognition/

# install required versions of the libraries
!pip install -r requirements.txt





# install library for language detection
!pip install langdetect

# download NLTK packages
import nltk
!python -m nltk.downloader all

## Emotion Analysis Part

In [97]:



# analyze each tweet
import datetime
from emotion_predictor import EmotionPredictor



class TweetEmotion:
  
  # load emotion prediction model
  def __init__(self):
    self.model = EmotionPredictor(classification='ekman', setting='mc')


  # what emotion?
  def what_emotion(self, tweet):
    prediction = self.model.predict_classes([tweet])
    # print('The associated emotion with [' + tweet + '] seems to be:' + '\n' + str(prediction))
    return prediction

  # probability of emotion
  def probability_of_emotion(self, tweet):
    probability = self.model.predict_probabilities([tweet])
    # print('The associated probablity of emotion with [' + tweet + '] seems to be:' + '\n' + str(probability))
    # return [probability['Anger'], probability['Disgust'], probability['Fear'],probability['Joy'], probability['Sadness'], probability['Surprise']]
    return probability



# tweet_emotion = TweetEmotion()






Cloning into 'twitter-emotion-recognition'...
remote: Enumerating objects: 54, done.[K
remote: Total 54 (delta 0), reused 0 (delta 0), pack-reused 54[K
Unpacking objects: 100% (54/54), done.
after_debate.txt      LICENSE	   requirements.txt
before_debate.txt     models	   twitter-emotion-recognition
demo.py		      __pycache__
emotion_predictor.py  README.md
/content/twitter-emotion-recognition/twitter-emotion-recognition


# Tweet Cleaning and Sentiment Analysis

In [0]:
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import langdetect

def sentiment_and_freq(tweet):

  # check whether tweet's language is English
  is_english = False
  lang = langdetect.detect(tweet)
  if(lang != 'en'):
    is_english = False
    # print("non-English")
  else:
    # print("Yeah, English")
    is_english = True


  # Clean the Tweet

  # split into words
  tokens = word_tokenize(tweet)
  # convert to lower case
  tokens = [w.lower() for w in tokens]
  # remove punctuation from each word
  import string
  table = str.maketrans('', '', string.punctuation)
  stripped = [w.translate(table) for w in tokens]
  # remove remaining tokens that are not alphabetic
  words = [word for word in stripped if word.isalpha()]
  # filter out stop words
  from nltk.corpus import stopwords
  stop_words = set(stopwords.words('english'))
  words = [w for w in words if not w in stop_words]




  # detokenize to calculate sentiment polarity
  cleaned_sentence = TreebankWordDetokenizer().detokenize(words)
  polarity_of_tweet = TextBlob(cleaned_sentence).sentiment.polarity
  # print(polarity_of_tweet)





  # count specific words (speakers' first or family names)
  yoshua_name = ['yoshua' , 'bengio', 'yoshuabengio']
  gary_name = ['gary' , 'marcus', 'garymarcus']

  has_yoshua = any(word in cleaned_sentence for word in yoshua_name)
  has_gary = any(word in cleaned_sentence for word in gary_name)



  return is_english, polarity_of_tweet, has_yoshua, has_gary



In [100]:
import json
import pandas as pd
# import TweetEmotion

# csv file of the data
file = '/content/gdrive/My Drive/Colab Notebooks/Twitter_Analysis/ALL_tweets_with_hashtag_AIDebate.csv'

# read as pandas dataframe
df = pd.read_csv(file)

# standardize the time
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S')

# initialize emotion prediction model
tweet_emotion = TweetEmotion()

# determine emotion
df['emotions'] = df['text'].apply(tweet_emotion.probability_of_emotion)



                     

print(df['date'])


0     2019-12-31 17:27:05
1     2019-12-31 17:11:53
2     2019-12-31 17:04:32
3     2019-12-31 14:36:41
4     2019-12-31 14:00:51
              ...        
435   2019-12-22 18:59:43
436   2019-12-22 18:30:11
437   2019-12-22 15:06:56
438   2019-12-22 15:01:19
439   2019-12-22 14:59:50
Name: date, Length: 440, dtype: datetime64[ns]


In [103]:
# determine emotion
# df['emotions'] = df['text'].apply(tweet_emotion.probability_of_emotion)
# for col in df.columns: 
#     print(col) 

print(df['emotions'][2]['Anger'])

0    0.000316
Name: Anger, dtype: float32


In [104]:
df[['is_eng', 'polarity', 'yoshua' , 'gary']] = df['text'].apply(sentiment_and_freq)

KeyError: ignored

In [69]:
bad_words = ['\'RT', '\"RT']

cnt = 0
with open('after_debate.txt') as oldfile, open('newfile.txt', 'w') as newfile:
    for line in oldfile:
        if not any(bad_word in line for bad_word in bad_words):
            newfile.write(line)
            cnt +=1
            print(line)

print(cnt)

{'created_at': 'Mon Dec 30 21:02:44 +0000 2019', 'id': 1211754515151704065, 'id_str': '1211754515151704065', 'text': '/s #AIDebate https://t.co/6gSVt5A0Me', 'truncated': False, 'entities': {'hashtags': [{'text': 'AIDebate', 'indices': [3, 12]}], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/6gSVt5A0Me', 'expanded_url': 'https://twitter.com/zergylord/status/1211493810020196353', 'display_url': 'twitter.com/zergylord/stat…', 'indices': [13, 36]}]}, 'metadata': {'iso_language_code': 'und', 'result_type': 'recent'}, 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 3095056652, 'id_str': '3095056652', 'name': 'Nafiz Hamid', 'screen_name': 'nafiz_h', 'location': 'Ames, IA', 'description': 'Soon postdoc at Harvard Med and Mass General. PhD @IowaStateU. M