# Twitter sentiment analysis

## Imports and load data

In [1]:
from searchtweets import ResultStream, gen_rule_payload, load_credentials, collect_results

# general imports
import numpy as np
import pandas as pd
import subprocess
from textblob import TextBlob
import re
import time
import datetime

# plotting and visualization
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def runCmd(cmd, run=False):
    '''
    use stdout to run command and shell=True to allow command to be entered as a string
    use rstrip() to remove carriage return and decode binary form to string. 
     '''
    if run == False:
        try:
            output_cmd = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
            output_cmd = output_cmd.rstrip().decode('UTF-8')
        except subprocess.CalledProccessError as e:
            print("Command", e.cmd, "returned with error code", e.returncode)
            print("setting output_cmd to empty array")
            output_cmd = []
        return output_cmd
    else:
        subprocess.Popen(cmd, stderr=subprocess.STDOUT, shell=True)
        return None

def S2_from_csv(filename_A, filename_B):
    # assumption, Tweets file ends with 'Tweets.csv'
    filename_tweets, filename_meta = '', ''
    if 'Tweets.csv' in filename_A:
        filename_tweets, filename_meta = filename_A, filename_B
    else:
        filename_tweets, filename_meta = filename_B, filename_A
    S2_tweets = pd.read_csv(filename_tweets, encoding='latin')
    S2_meta = pd.read_csv(filename_meta, encoding='latin')
    S2 = pd.concat([S2_tweets, S2_meta], axis=1)
    return S2

### file name

In [3]:
current_dir = [x for x in runCmd('ls .').split('\n') if '.csv' in x]
print("Data files available to load")
for i, x in enumerate(current_dir):
    print(str(i), " ", x)
    
value_selection = input("\nPlease input file numbers separated by a space ")
value_selection = [ int(x) for x in value_selection.split() ]
S2 = S2_from_csv(current_dir[value_selection[0]], current_dir[value_selection[1]])

Data files available to load
0   S3_df.csv
1   tweets_2018-08-01_2018-08-15_Date.csv
2   tweets_2018-08-01_2018-08-15_Tweets.csv
3   tweets_2018-09-01-00_Date.csv
4   tweets_2018-09-01-00_Tweets.csv
5   tweets_2018-09-01_2018-09-03_Date.csv
6   tweets_2018-09-01_2018-09-03_Tweets.csv
7   tweets_2018-09-01_2018-09-15_Date.csv
8   tweets_2018-09-01_2018-09-15_Tweets.csv
9   tweets_2018-09-03-21_Date.csv
10   tweets_2018-09-03-21_Tweets.csv
11   tweets_2018-09-03_2018-09-05_Date.csv
12   tweets_2018-09-03_2018-09-05_Tweets.csv
13   tweets_2018-09-06_2018-09-15_Date.csv
14   tweets_2018-09-06_2018-09-15_Tweets.csv
15   tweets_2018-09-10_2018-09-14_Date.csv
16   tweets_2018-09-10_2018-09-14_Tweets.csv
17   tweets_2018-09-10_2018-09-14_df.csv
18   tweets_2018-09-15-21_Date.csv
19   tweets_2018-09-15-21_Tweets.csv
20   tweets_2018-0916_2018-0930_Metadata.csv
21   tweets_2018-0916_2018-0930_Tweets.csv
22   tweets_2018-10-01_2018-10-05_Date.csv
23   tweets_2018-10-01_2018-10-05_Tweets.csv
24   

In [5]:
S2.iloc[100:110,:]

Unnamed: 0,tweets,date,user_name,user_screen_name,user_followers,user_friends,user_verified,user_language,retweet_count,favorite_count
100,Ecuador Bans Bitcoin -- Dogecoin Founder Speak...,Wed Oct 10 23:56:47 +0000 2018,BitcoinProfits,profits_bitcoin,13198,12695,False,en,0,0
101,@ekliptor @mhagelstrom @derose @BennettTomlin ...,Wed Oct 10 23:56:46 +0000 2018,dasource,dasource_,236,337,False,en,0,0
102,RT @ErikVoorhees: @Nouriel Thanks for posting ...,Wed Oct 10 23:56:44 +0000 2018,Thamzor,ThamJason,177,439,False,en,0,0
103,RT @RealistNews: REALIST NEWS - Live Stream wi...,Wed Oct 10 23:56:44 +0000 2018,Sam ÅTC,unclesam520,352,419,False,en,0,0
104,And so here it is! My 40 minute discussion vid...,Wed Oct 10 23:56:42 +0000 2018,Alexander Trapp,itsatrapppro,4314,4844,False,en,7,11
105,RT @JWWeatherman_: #Bitcoin is the greatest op...,Wed Oct 10 23:56:41 +0000 2018,Bull â¡,BlakkBull,340,465,False,en,0,0
106,@PhilCrypto77 Honey Badger up 10x since his st...,Wed Oct 10 23:56:36 +0000 2018,Global Chain ð,global_chain,232,224,False,en,0,0
107,"VeChain (VEN) Partners DHL APIC, Looks Forward...",Wed Oct 10 23:56:33 +0000 2018,Coin Spectator - Real-time cryptocurrency news,coinspectator,4992,1851,False,en,0,0
108,RT @Nouriel: So Bitcoin isn't a currency. It i...,Wed Oct 10 23:56:33 +0000 2018,BT|D,BuyTheDipp,416,915,False,en,0,0
109,"Y tan exclusiva, la van a usar ella y sus palm...",Wed Oct 10 23:56:27 +0000 2018,Daniel MartÃ­nez,danmmarti,1744,1256,False,es,1,3


## Sentiment analysis

In [10]:
def clean_tweet(tweet):
    '''
    Utility function to clean the text in a tweet by removing 
    links and special characters using regex.
    '''
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

def analize_sentiment(tweet):
    '''
    Utility function to classify the polarity of a tweet
    using textblob.
    
    textblob already has a trained analyser to work 
    with different machine learning models on 
    natural language processing.
    
    Might want to train our own model
    '''
    analysis = TextBlob(clean_tweet(tweet))
    if analysis.sentiment.polarity > 0:
        return 1
    elif analysis.sentiment.polarity == 0:
        return 0
    else:
        return -1
    

def sentiment_analysis(S2):
    # We create a column with the result of the analysis:
    S2['SA'] = np.array([ analize_sentiment(tweet) for tweet in S2['tweets'] ])
    
    # We construct lists with classified tweets:
    pos_tweets = [ tweet for index, tweet in enumerate(S2['tweets']) if S2['SA'][index] > 0]
    neu_tweets = [ tweet for index, tweet in enumerate(S2['tweets']) if S2['SA'][index] == 0]
    neg_tweets = [ tweet for index, tweet in enumerate(S2['tweets']) if S2['SA'][index] < 0]

    # We print percentages:
    print("Percentage of positive tweets: {}%".format(len(pos_tweets)*100/len(S2['tweets'])))
    print("Percentage of neutral tweets: {}%".format(len(neu_tweets)*100/len(S2['tweets'])))
    print("Percentage de negative tweets: {}%".format(len(neg_tweets)*100/len(S2['tweets'])))

In [11]:
sentiment_analysis(S2)

Percentage of positive tweets: 38.82%
Percentage of neutral tweets: 50.94%
Percentage de negative tweets: 10.24%
