In [None]:
#This script is activated from 'main'. See respective script for NLP imports & dependencies
# Names used below without a definition in this file (pd, requests, tweepy, nlp_cryptos,
# headers, newsapi, ...) are therefore expected to come from that 'main' script.
print('Successfully activated NLP Script')

### Reddit: 

In [None]:
# Emit an empty line in the cell output.
print('')
# Module-level list that article_sentiment() appends one record per scored Reddit post to.
coin_sentiment = []
# Shared analyzer; article_sentiment() calls its polarity_scores() on each post's selftext.
analyzer = SentimentIntensityAnalyzer()

#NLP sentiment function for Reddit listings
def article_sentiment(news):
    """Score every post of one Reddit listing and return the results as a DataFrame.

    Parameters
    ----------
    news : dict
        Decoded Reddit listing JSON; posts are read from ``news['data']['children']``.

    Returns
    -------
    pandas.DataFrame
        One row per scored post of *this* listing with the columns ``date``,
        ``subreddit``, ``title``, ``selftext``, ``upvote_ratio``, ``ups``, ``downs``,
        ``score``, ``compound``, ``positive``, ``negative`` and ``neutral``.
        The frame has no columns when no post could be scored.

    Raises
    ------
    KeyError
        If ``news`` itself lacks the ``data`` / ``children`` keys.

    Notes
    -----
    Every record is also appended to the module-level ``coin_sentiment`` list, which
    keeps accumulating across calls.  The returned frame is built from this call's
    records only, so a caller that concatenates the frames of several listings does
    not count the posts of earlier listings again.
    """
    records = []

    for post in news['data']['children']:
        try:
            details = post['data']
            sentiment = analyzer.polarity_scores(details['selftext'])

            record = {
                # 'created' is an epoch timestamp; fromtimestamp() renders it in local time.
                'date': str(datetime.fromtimestamp(details['created'])),
                'subreddit': details['subreddit'],
                'title': details['title'],
                'selftext': details['selftext'],
                'upvote_ratio': details['upvote_ratio'],
                'ups': details['ups'],
                'downs': details['downs'],
                'score': details['score'],
                'compound': sentiment["compound"],
                'positive': sentiment["pos"],
                'negative': sentiment["neg"],
                'neutral': sentiment["neu"],
            }
        except (AttributeError, KeyError):
            # Best effort: skip a post with a missing field or a selftext the
            # analyzer cannot process instead of losing the whole listing.
            continue

        records.append(record)
        coin_sentiment.append(record)

    return pd.DataFrame(records)

#Reddit Analysis
#Extract Reddit NLP Data and write to csv file
# Successful (HTTP 200) responses, keyed by the coin's position in nlp_cryptos.
reddit_requests = {}

for i, crypto in enumerate(nlp_cryptos):
    try:
        # The timeout keeps one unresponsive request from hanging the whole run.
        response = requests.get(
            f'https://oauth.reddit.com/r/{crypto}/controversial.json?limit=10&t=day',
            headers=headers,
            timeout=30,
        )
    except requests.RequestException as err:
        print(f'Reddit request for r/{crypto} failed: {err}')
        continue
    if response.status_code == 200:
        reddit_requests[i] = response
    else:
        print(f'Reddit request for r/{crypto} returned HTTP {response.status_code}; skipping')


# Score every listing. The per-listing frames are collected and concatenated once:
# DataFrame.append no longer exists in pandas 2.0, and a blanket `except: pass`
# around it would hide that failure and leave the result empty.
sentiment_frames = []
for i, response in reddit_requests.items():
    try:
        articles = response.json()
        sentiment_frames.append(article_sentiment(articles))
    except (ValueError, KeyError, TypeError) as err:
        # Body was not valid JSON or did not have the expected listing layout.
        print(f'Could not score Reddit listing for r/{nlp_cryptos[i]}: {err}')

sentiment_frames = [frame for frame in sentiment_frames if not frame.empty]
if sentiment_frames:
    # Guard against the same post being present in more than one returned frame.
    sentiment_df = pd.concat(sentiment_frames).drop_duplicates()
else:
    # Keep the 'subreddit' column so the steps below also work without any data.
    sentiment_df = pd.DataFrame(columns=['subreddit'])
sentiment_df['subreddit'] = sentiment_df['subreddit'].str.lower()

# One sub-frame per (lower-cased) subreddit name.
isolation = {name: group for name, group in sentiment_df.groupby('subreddit')}

# describe() statistics per requested coin, written to one CSV file each.
summary = dict()
for x in nlp_cryptos:
    # The keys of `isolation` were lower-cased above, so look the coin up the same way.
    posts = isolation.get(str(x).lower())
    if posts is None:
        continue
    summary[x] = posts.describe()
    try:
        summary[x].to_csv(f'Data/Functionality/Reddit/Reddit_{x}.csv')
    except OSError as err:
        print(f'Could not write Reddit summary for {x}: {err}')

### Twitter:

In [15]:
#Twitter Analysis
#Extract Tweets from financial influencers on twitter using Tweepy
influencers = ['elonmusk','CoinMarketCap','Cointelegraph','Gemini','wallstreetbets','krakenfx','coinbase']

#Extract the 10 most recent tweets from each twitter user timeline
# NOTE(review): `api` is only created further down in this file (Tweepy authentication
# cell) — confirm it already exists when this cell runs on a fresh kernel.
# The per-user frames are collected and concatenated once; DataFrame.append no longer
# exists in pandas 2.0.
timeline_frames = []
for influencer in influencers:
    recent_posts = api.user_timeline(screen_name = influencer, count=10, tweet_mode='extended')
    data = pd.DataFrame( [tweet.full_text for tweet in recent_posts] , columns=['Tweets'])
    timeline_frames.append(data)

# As before, every user's rows keep their own 0..n-1 index (no re-numbering).
recent_twitter_df = pd.concat(timeline_frames) if timeline_frames else pd.DataFrame(columns=['Tweets'])

#Clean twitter posts for NLP Analysis
def clean_text(text):
    """Strip Twitter-specific noise from a tweet before sentiment analysis.

    Removes, in this order: @mentions, the '#' symbol (the hashtag word is kept),
    the stand-alone retweet marker 'RT' with its trailing whitespace, and
    http/https links.

    Parameters
    ----------
    text : str
        Tweet text.

    Returns
    -------
    str
        The text with the parts above removed; other whitespace is left untouched.
    """
    # Handles may contain underscores; without '_' in the class "@elon_musk" would leave "_musk".
    text = re.sub(r'@[A-Za-z0-9_]+', '', text) #removes @mentions
    text = re.sub(r'#', '', text) #removes the # symbol
    # \b restricts the match to the stand-alone token so words ending in "RT"
    # (e.g. "SMART contracts") are not mangled.
    text = re.sub(r'\bRT\s+', '', text) #removes RT
    text = re.sub(r'https?://\S+', '', text) #removes hyperlink
    return text

# Replace the 'Tweets' column with its cleaned version; the uncleaned text is not kept.
recent_twitter_df['Tweets'] = recent_twitter_df['Tweets'].apply(clean_text)

#subjectivity helper - subjectivity is how opinionated the text is
def get_subjectivity(text):
    """Return the TextBlob subjectivity score of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity
    
#polarity helper - polarity is how positive or negative the text is
def get_polarity(text):
    """Return the TextBlob polarity score of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

#create two new columns
# Both scores are computed from the already cleaned 'Tweets' text, one value per row.
recent_twitter_df['Subjectivity'] = recent_twitter_df['Tweets'].apply(get_subjectivity)
recent_twitter_df['Polarity'] = recent_twitter_df['Tweets'].apply(get_polarity)

#map a polarity score to a sentiment label
def get_analysis(score):
    """Label a polarity score.

    Returns 'Negative' for a score below zero, 'Neutral' for a score equal to
    zero and 'Positive' for everything else.
    """
    if score < 0:
        return 'Negative'
    return 'Neutral' if score == 0 else 'Positive'
    
#create new column for analysis
# Label every row 'Negative' / 'Neutral' / 'Positive' from its polarity score.
recent_twitter_df['Analysis'] = recent_twitter_df['Polarity'].apply(get_analysis)
# NOTE(review): at this point 'Tweets' has already been cleaned and the score/label
# columns already exist, so both files below receive the same, fully processed frame —
# 'raw_twitter_data.csv' does not contain raw tweets. Confirm what the readers of
# these files expect before changing what is written.
recent_twitter_df.to_csv('Data/Functionality/Twitter/raw_twitter_data.csv')
recent_twitter_df.to_csv('Data/Functionality/Twitter/twitter_clean_text_function.csv')

#plot Word Cloud of the users' timelines
# All cleaned tweets joined into one space-separated string feed the cloud.
all_words = ' '.join(recent_twitter_df['Tweets'])
cloud_settings = dict(width=500, height=300, random_state=2, max_font_size=119)
word_cloud = WordCloud(**cloud_settings).generate(all_words)

# Show the rendered cloud as an image without axes.
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [19]:
#Find general crypto sentiment using search terms on twitter - Uses Tweepy version 4
#Bearer_Token Authentication
# Credentials are read from environment variables; os.getenv returns None for unset ones.
bearer_token = os.getenv("bearer_token")
consumer_key = os.getenv("consumer_key")
consumer_secret_key = os.getenv("consumer_secret")
# NOTE(review): `client` is not used again in the visible part of this file — confirm it is needed elsewhere.
client = tweepy.Client(bearer_token)

#Authenticate account using consumer_key & consumer_secret_key
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret_key)
# NOTE(review): this binds `api`, which the influencer-timeline cell earlier in the file
# already calls — confirm `api` is also created before that cell runs.
api = tweepy.API(auth)

#Create new API search for recent tweets using a search term, then perform tone analysis to get market sentiment
query = 'crypto+market, -filter:retweets'
max_tweets = 25
# tweet_mode='extended' makes the API return the complete tweet text (exposed as
# `full_text`) instead of the truncated compatibility `text`, matching how the
# influencer timelines are fetched. The page size reuses max_tweets so the two
# numbers cannot drift apart.
posts = list(
    tweepy.Cursor(
        api.search_tweets,
        q=query,
        lang='en',
        result_type='recent',
        count=max_tweets,
        tweet_mode='extended',
    ).items(max_tweets)
)

#Create array from results and create dataframe of results
string_array = [status.full_text for status in posts]
query_tweets_df = pd.DataFrame(string_array, columns=['Tweets'])
# Written before any cleaning, so this file holds the tweets as returned by the search.
query_tweets_df.to_csv('Data/Functionality/Twitter/query_tweets_raw.csv')

#Clean_text to search query tweets
# The 'Tweets' column is replaced by its cleaned version.
query_tweets_df['Tweets'] = query_tweets_df['Tweets'].apply(clean_text)
# NOTE(review): this line exports recent_twitter_df (the influencer-timeline frame), not
# the query frame handled in this cell — confirm that it is meant to be written here.
recent_twitter_df.to_csv('Data/Functionality/Twitter/users_subjectivity_and_polarity.csv')
# Cleaned query tweets, written before the score columns are added.
query_tweets_df.to_csv('Data/Functionality/Twitter/query_tweets_clean_text.csv')

#Create new columns and apply subjectivity and polarity sentiment analysis
query_tweets_df['Subjectivity'] = query_tweets_df['Tweets'].apply(get_subjectivity)
query_tweets_df['Polarity'] = query_tweets_df['Tweets'].apply(get_polarity)

#Create new column for analysis of polarity
# One 'Negative' / 'Neutral' / 'Positive' label per row, derived from 'Polarity'.
query_tweets_df['Analysis'] = query_tweets_df['Polarity'].apply(get_analysis)

#Plot word cloud to visualize tweets in conjunction with search query
# All cleaned query tweets joined into one space-separated string feed the cloud.
all_words = ' '.join(query_tweets_df['Tweets'])
cloud_settings = dict(width=500, height=300, random_state=2, max_font_size=119)
word_cloud = WordCloud(**cloud_settings).generate(all_words)
# Export the query frame including its Subjectivity / Polarity / Analysis columns.
query_tweets_df.to_csv('Data/Functionality/Twitter/market_sentiment_analysis.csv')

# Show the rendered cloud as an image without axes.
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()

#Plot the polarity and subjectivity
plt.figure(figsize=(8,6))
# A single call draws every tweet. Passing the columns directly avoids creating one
# artist per row and does not depend on the frame having a default 0..n-1 index,
# which the previous per-row `[i]` label lookups required.
plt.scatter(query_tweets_df['Polarity'], query_tweets_df['Subjectivity'], color='blue')
plt.title('Sentiment Analysis')
plt.xlabel('Polarity')
plt.ylabel('Subjectivity')

plt.show()

In [25]:
#Authenticate IBM watson api connection & keys to use tone_analyzer
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
tone_api = os.getenv("tone_api")

#Set_service_url is a private/personal endpoint url provided in your IBM-watson account, not publicly available
# Because the URL belongs to one specific service instance it can be supplied through
# the "tone_url" environment variable; the previously hard-coded value stays the default
# so existing setups keep working.
tone_url = os.getenv(
    "tone_url",
    'https://api.au-syd.tone-analyzer.watson.cloud.ibm.com/instances/85c20273-70b0-4277-b776-694e7146a612',
)

authenticator = IAMAuthenticator(tone_api)
tone_analyzer = ToneAnalyzerV3(
    version='2017-09-21',
    authenticator=authenticator
)
tone_analyzer.set_service_url(tone_url)

#Build one document from query_tweets_df to run as a single string through IBM tone_analyzer
full_text = " ".join(query_tweets_df["Tweets"])

#Define text as string
text = full_text

# Send the document to the service's 'tone()' method and keep the decoded result.
tone_response = tone_analyzer.tone(
    {"text": text},
    content_type="application/json",
    content_language="en",
    accept_language="en",
)
tone_analysis = tone_response.get_result()

#Get Document Tones as a whole (market sentiment)
doc_tone_df = json_normalize(data=tone_analysis["document_tone"], record_path=["tones"])

# Sentences Tones (Get individual sentence sentiment)
# The response may not contain "sentences_tone" (e.g. when there is nothing to analyse
# sentence by sentence); fall back to an empty frame instead of raising KeyError.
sentence_tones = tone_analysis.get("sentences_tone") or []
if sentence_tones:
    sentences_tone_df = json_normalize(
        data=sentence_tones,
        record_path=["tones"],
        meta=["sentence_id", "text"],
    )
else:
    sentences_tone_df = pd.DataFrame(columns=["sentence_id", "text"])
print(f'Total Data points: {sentences_tone_df.shape}')
print(sentences_tone_df.head(10))
sentences_tone_df.to_csv('Data/Functionality/Twitter/sentences_tone.csv')

### Google (PyTrends):

In [None]:
# Google Trends (PyTrends), analyzed in subsets of five due to function limitations.
pytrends = TrendReq(hl= 'en-US')
# Consecutive, non-overlapping groups of five covering the first 20 entries of
# nlp_cryptos. The slices must start where the previous one ended: starting at 6, 11
# and 16 would silently leave out the coins at positions 5, 10 and 15.
list_a = nlp_cryptos[0:5]
list_b = nlp_cryptos[5:10]
list_c = nlp_cryptos[10:15]
list_d = nlp_cryptos[15:20]

# Only groups that actually contain coins are queried.
google_topics = [group for group in (list_a, list_b, list_c, list_d) if len(group) > 0]



# List of cryptocurrencies (only 5 cryptocurrencies at a time)
# NOTE - pytrends gives 400 error if more than 5 cryptocurrencies are added.
total_df = pd.DataFrame()
# Analyzer used further down to score news headlines.
sid = SentimentIntensityAnalyzer()


# The loop variable must not be called `list`: that would rebind the builtin for
# everything that runs afterwards in the same session.
for topic_group in google_topics:
    pytrends.build_payload(topic_group, cat=0, timeframe='today 3-m', gprop='news')
    crypto_trends = pytrends.interest_over_time()
    if crypto_trends.empty:
        # Presumably pytrends had no data for this group; joining an empty frame
        # would wipe the trends collected so far, so skip it.
        continue
    crypto_trends = crypto_trends.drop(columns='isPartial')
    # Newest group first, previously collected groups after it; keep only dates
    # that have a value in every column.
    total_df = crypto_trends.join(total_df).dropna(how="any")

# Set current date and the date from one month ago using the ISO format
# Both values are ISO strings; get_headlines() only reads their first 10 characters
# (YYYY-MM-DD). "One month" is a fixed 30 days (timedelta(30)).
current_date = pd.Timestamp(datetime.now(), tz="America/New_York").isoformat()
past_date = pd.Timestamp(datetime.now()- timedelta(30), tz="America/New_York").isoformat()


# Collect the most relevant headlines per day for the past month through the newsapi client
def get_headlines(keyword):
    """Fetch the headlines about ``keyword`` for each day of the configured window.

    The window runs backwards from the day in the module-level ``current_date``
    down to, but not including, the day in ``past_date``; one
    ``newsapi.get_everything`` request (first page, English, sorted by relevancy)
    is made per day.

    Parameters
    ----------
    keyword : str
        Search term passed to the news API as ``q``.

    Returns
    -------
    tuple[list[list], list[datetime]]
        The article titles of each day (newest day first) and the matching list
        of days as ``datetime`` objects.
    """
    newest_day = datetime.strptime(current_date[:10], "%Y-%m-%d")
    oldest_day = datetime.strptime(past_date[:10], "%Y-%m-%d")
    print(f"Fetching news about '{keyword}'")
    print("*" * 30)

    headlines_per_day = []
    days = []
    # Both bounds are midnight datetimes, so the number of whole days between them
    # is exactly the number of daily requests to make.
    for offset in range((newest_day - oldest_day).days):
        day = newest_day - timedelta(days=offset)
        print(f"retrieving news from: {day}")
        day_string = str(day)[:10]
        response = newsapi.get_everything(
            q=keyword,
            from_param=day_string,
            to=day_string,
            language="en",
            sort_by="relevancy",
            page=1,
        )
        headlines_per_day.append([article["title"] for article in response["articles"]])
        days.append(day)
    return headlines_per_day, days


# First topic: "crypto". The list of days returned by this call is kept as `dates`.
crypto_headlines, dates = get_headlines("crypto")

# Second topic: "inflation" (its list of days is discarded)
inflation_headlines, _ = get_headlines("inflation")

# Third topic: "energy" (its list of days is discarded)
energy__headlines, _ = get_headlines("energy")


In [None]:
#Compute average compound sentiment of headlines for each day
def headline_sentiment_summarizer_avg(headlines):
    """Average the compound sentiment score of each day's headlines.

    Parameters
    ----------
    headlines : list[list[str | None]]
        One inner list of headline titles per day; ``None`` titles are ignored.

    Returns
    -------
    list[float]
        One average per day, in input order. A day without any usable headline
        yields ``nan`` instead of raising ``ZeroDivisionError``, so the result
        always has one entry per day.
    """
    sentiment = []
    for day in headlines:
        # Scores come from the module-level analyzer `sid`.
        day_score = [sid.polarity_scores(h)["compound"] for h in day if h is not None]
        if day_score:
            sentiment.append(sum(day_score) / len(day_score))
        else:
            sentiment.append(float("nan"))
    return sentiment

# Average daily headline sentiment for each topic
crypto_choice_avg = headline_sentiment_summarizer_avg(crypto_headlines)
inflation_headlinese_avg = headline_sentiment_summarizer_avg(inflation_headlines)
energy__headlines_avg = headline_sentiment_summarizer_avg(energy__headlines)


# One column per topic, one row per day
sentiment_columns = {
    "crypto_choice_avg": crypto_choice_avg,
    "inflation_headlinese_avg": inflation_headlinese_avg,
    "energy_consumption_avg": energy__headlines_avg,
}
topic_sentiments = pd.DataFrame(sentiment_columns)

# Label the rows with the days returned by the crypto headline query, then attach the
# sentiment columns to the Google Trends frame and keep only complete rows
topic_sentiments.index = pd.to_datetime(dates)
topic_sentiments_trends = total_df.join(topic_sentiments).dropna(how="any")
topic_sentiments_trends.to_csv('Data/Functionality/Google/Sentiments.csv')

# Correlation matrix of all trend and sentiment columns, exported as CSV
correlation_csv = topic_sentiments_trends.corr()
correlation_csv.to_csv('Data/Functionality/Google/Correlation.csv')


# Same correlation matrix as a colour-graded Styler for display
correlation_df = correlation_csv.style.background_gradient(cmap='PuBu')

In [None]:
# Final status message of the script.
print('Finished Extracting NLP Data.. see Investor Summary')