# Building a Dataset from Twitter

In [None]:
# Your Twitter app credentials need to be added here - go to https://developer.twitter.com/en/apps

import tweepy

import os


def read_credential(env_var, default=''):
    """Return the value of environment variable `env_var`, or `default` if it is unset.

    Reading credentials from the environment keeps secrets out of the notebook
    source, so they are not committed or shared along with the notebook.
    """
    return os.environ.get(env_var, default)


# Export these variables in the shell that launches Jupyter; each value falls
# back to an empty string when its variable is not set.
consumer_key = read_credential('TWITTER_CONSUMER_KEY')
consumer_key_secret = read_credential('TWITTER_CONSUMER_KEY_SECRET')
access_token = read_credential('TWITTER_ACCESS_TOKEN')
access_token_secret = read_credential('TWITTER_ACCESS_TOKEN_SECRET')

In [None]:
# Authenticate with the application (consumer) credentials AND the user access
# token. The access token pair was defined alongside the consumer keys but was
# never handed to the auth handler.
twitter_auth = tweepy.OAuthHandler(consumer_key, consumer_key_secret)
twitter_auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(twitter_auth)

search_term = "trump" # Search term goes here
tweets = api.search(search_term, count=100)

# Print a short summary instead of the full result set: the repr of every
# returned status floods the cell output.
print("Fetched {} tweets for search term '{}'".format(len(tweets), search_term))

In [None]:
import re
import pandas as pd
from textblob import TextBlob

# Storing features from the tweets that will be useful for ML later
# This is broken into features relating to the tweet (prefixed with 'tweet_') and user-related ones (prefixed with 'user_')
feature_columns = ['tweet_text', 'tweet_sentiment', 'tweet_subjectivity',
                   'user_followers_count', 'user_friends_count',
                   'user_account_age', 'user_verified',
                   'user_favourites_count', 'user_tweets',
                   'tweet_retweeted', 'tweet_retweet_count', 'tweet_favorite_count']

# Collect one record per tweet and build the DataFrame once at the end.
# Growing the frame with DataFrame.append inside the loop copies the whole frame
# on every iteration, and DataFrame.append was removed in pandas 2.0.
records = []
for tweet in tweets:
    sentimentText = TextBlob(tweet.text)
    records.append({'tweet_text': re.sub(r'http\S+', '', tweet.text), # Removing any URL's in the tweet text here
                    'tweet_sentiment': sentimentText.sentiment.polarity,
                    'tweet_subjectivity': sentimentText.sentiment.subjectivity,
                    'user_followers_count': tweet.user.followers_count,
                    'user_friends_count': tweet.user.friends_count,
                    'user_account_age': tweet.user.created_at,
                    'user_verified': tweet.user.verified,
                    'user_favourites_count': tweet.user.favourites_count,
                    'user_tweets': tweet.user.statuses_count,
                    'tweet_retweeted': tweet.retweeted,
                    'tweet_retweet_count': tweet.retweet_count,
                    'tweet_favorite_count': tweet.favorite_count})

# Passing `columns` keeps the expected schema even when no tweets were returned.
df = pd.DataFrame(records, columns=feature_columns)

# Remove duplicates. This has to run AFTER the rows exist: de-duplicating the
# still-empty frame did nothing. keep="first" retains one copy of each repeated
# text, whereas keep=False would discard every copy of a repeated tweet.
df = df.drop_duplicates(subset="tweet_text", keep="first").reset_index(drop=True)

df.head()

### Create additional features for the model

In [None]:
# Tweet length, measured as the number of characters that are not spaces
def non_space_length(text):
    """Return the length of `text` once every space character is removed."""
    return len(text.replace(" ", ""))

df['tweet_text_length'] = df['tweet_text'].map(non_space_length)

# Count of punctuation in the tweet
import string

def count_punc(text):
    """Return how many characters of `text` are members of `string.punctuation`."""
    punctuation_total = 0
    for symbol in text:
        if symbol in string.punctuation:
            punctuation_total += 1
    return punctuation_total

# Punctuation count per tweet; count_punc is handed to the Series directly
# rather than being wrapped in a lambda.
df['tweet_text_punc_count'] = df['tweet_text'].map(count_punc)

df.head()

### Add a wordcloud

In [None]:
import nltk
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from nltk.corpus import stopwords 

# Setup stop words first
try:
    english_stop_words = stopwords.words('english')
except LookupError:
    # NLTK raises LookupError when the stopwords corpus has not been downloaded
    # yet; fetch it once and retry.
    nltk.download('stopwords')
    english_stop_words = stopwords.words('english')

new_stopwords = ['RT']
# Store everything lower-cased so matching is case-insensitive; otherwise
# capitalised stop words such as "The" or "I" are never filtered out.
stop_words = {word.lower() for word in english_stop_words + new_stopwords}

# Remove stopwords
text = " ".join(df.tweet_text)
all_words = text.split()
clean_words = [word for word in all_words if word.lower() not in stop_words]
clean_text = " ".join(clean_words)

# Report word counts: len() of the joined strings would count characters.
print ("There are {} words in all tweets.".format(len(all_words)))
print ("There are {} words in  all tweets with stopwords removed.".format(len(clean_words)))

if clean_words:
    # Generate a word cloud image
    wordcloud = WordCloud(background_color="white").generate(clean_text)

    plt.figure( figsize=(15,7))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
else:
    # WordCloud.generate raises ValueError when it is given no words to plot.
    print("No words left to plot after removing stopwords.")

### Sentiment of tweets

In [None]:
# Sentiment of the tweets - using TextBlob for this
def percentage_of_tweets(count, total):
    """Return `count` as a percentage of `total`, or 0.0 when `total` is zero."""
    return count * 100 / total if total else 0.0


total_tweets = len(df)
print ("Total Tweet count: {}" .format(total_tweets))
print ("The average sentiment of the tweets is {} ".format(df["tweet_sentiment"].mean()))

# Vectorized counts instead of iterating over the rows one by one.
sentiment_scores = df["tweet_sentiment"]
positive_tweets_count = int((sentiment_scores > 0).sum())
negative_tweets_count = int((sentiment_scores < 0).sum())
# Everything that is neither strictly positive nor strictly negative is neutral.
neutral_tweets_count = total_tweets - positive_tweets_count - negative_tweets_count

# percentage_of_tweets guards the division so an empty result set (no tweets
# fetched) no longer raises ZeroDivisionError.
print("\nPercentage of positive tweets: {}%".format(percentage_of_tweets(positive_tweets_count, total_tweets)))
print("Percentage of negative tweets: {}%".format(percentage_of_tweets(negative_tweets_count, total_tweets)))
print("Percentage of neutral tweets: {}%".format(percentage_of_tweets(neutral_tweets_count, total_tweets)))