In [1]:
# Import packages

import os
import pandas as pd
from dotenv import load_dotenv
load_dotenv()
from pathlib import Path

# Important nltk and sentimentanalyzer
import nltk as nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Used for API call
import requests
import json
import datetime
from datetime import date, timedelta


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ryan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Create API key variable
api_key=os.getenv("CRYPTONEWS_API_KEY")
print(type(api_key))



<class 'str'>


In [3]:
# Instantiate sentiment analyzer
analyzer=SentimentIntensityAnalyzer()

In [4]:
# Datetime format for while loop to call API
start_date = datetime.date(2021, 4, 1)
end_date = datetime.date(2021, 9, 1)
delta = datetime.timedelta(days=1)

# Create list to store data
solana_sentiment=[]

# Runs while loop to pull daily articles from API                         
while start_date <= end_date:
        
        # Converts start_date to string
        start=start_date.strftime("%m%d%Y")
        
        # Counter for start date
        start_date+=delta         
        
        #Calls API
        solana_articles=f"https://cryptonews-api.com/api/v1?tickers=SOL&items=50&date={start}-{start}&token={api_key}"
        
        # Formats API into JSON
        response_data=requests.get(solana_articles)
        response_content=response_data.content
        data=response_data.json()
        
        # For loop to get sentiment on each article 
        for article in data["data"]:
            try:
                date=article["date"]
                text=article["text"]
                sentiment=analyzer.polarity_scores(text)
                compound=sentiment["compound"]
                pos=sentiment["pos"]
                neu=sentiment["neu"]
                neg=sentiment["neg"]
                
                # Creates binary variable based on positive score
                if pos >= .1:
                    pos_column = 1
                else: 
                    pos_column=0
                # Appends sentiment analysis to solana list    
                solana_sentiment.append({
                    "date":date,
                    "text":text,
                    "compound":compound,
                    "positive":pos,
                    "negative":neg,
                    "neutral":neu,
                    "binary score":pos_column
                }) 
                
            except AttributeError:
                pass

In [5]:
# Creates solana dataframe from list
solana_df=pd.DataFrame(solana_sentiment)

# Add columns to dataframe
columns = ["date","text","compound","positive","negative","neutral","binary score"]    

# Adds column names to dataframe
solana_df=solana_df[columns]

# Formats date column
solana_df['date'] = pd.to_datetime(solana_df['date']).dt.date

# Sorts values by date and drops duplicates
solana_df=solana_df.sort_values(by=["date"])
solana_df.drop_duplicates

solana_df

Unnamed: 0,date,text,compound,positive,negative,neutral,binary score
0,2021-04-06,Prominent crypto trader and analyst Cantering ...,0.8402,0.136,0.000,0.864,1
1,2021-04-06,Ethereum (ETH) is re-testing the previous all-...,0.4767,0.170,0.000,0.830,1
2,2021-04-07,The Solana price had a significant rally in th...,0.9382,0.280,0.000,0.720,1
3,2021-04-09,Solana (SOL) has been increasing rapidly since...,-0.3182,0.080,0.089,0.831,0
4,2021-04-10,A widely-followed crypto strategist and trader...,0.8074,0.128,0.000,0.872,1
...,...,...,...,...,...,...,...
568,2021-09-01,Blockchain analytics firm Santiment is pointin...,-0.5267,0.000,0.053,0.947,0
567,2021-09-01,"With August coming to an end, many in the cryp...",0.0498,0.111,0.126,0.762,1
577,2021-09-01,Defying the slump in the broader crypto market...,-0.4215,0.000,0.052,0.948,0
571,2021-09-01,Cardano ADA/USD is a proof-of-stake (PoS) publ...,0.7906,0.108,0.000,0.892,1


In [6]:
# Creates mean score per day
solana_df=solana_df.groupby(by='date').mean()

solana_df


Unnamed: 0_level_0,compound,positive,negative,neutral,binary score
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-04-06,0.658450,0.153000,0.000000,0.847000,1.000000
2021-04-07,0.938200,0.280000,0.000000,0.720000,1.000000
2021-04-09,-0.318200,0.080000,0.089000,0.831000,0.000000
2021-04-10,0.807400,0.128000,0.000000,0.872000,1.000000
2021-04-11,0.000000,0.000000,0.000000,1.000000,0.000000
...,...,...,...,...,...
2021-08-28,0.465323,0.101154,0.009923,0.888923,0.461538
2021-08-29,0.168300,0.058250,0.032625,0.909125,0.125000
2021-08-30,0.275933,0.078619,0.027333,0.894048,0.476190
2021-08-31,0.125087,0.058708,0.033667,0.907625,0.291667


In [7]:
# Classify score for binary class 
import numpy as np
solana_df['new score'] = np.where(solana_df['binary score'] >= .5, 1 , 0)

In [8]:
# Import price data
file_path=("sol-usd-max.csv")


price_df = pd.read_csv(file_path, index_col="date", infer_datetime_format=True, parse_dates=True)
price_df.index= price_df.index.strftime('%Y-%m-%d')


In [9]:
price_df.head()

Unnamed: 0_level_0,price,market_cap,total_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-04-06,22.942986,6143755029,324307982.0
2021-04-07,24.989549,6677871480,404116807.9
2021-04-09,27.005146,7238834910,445724093.1
2021-04-10,27.784698,7509326043,351754156.4
2021-04-11,26.833966,7243681262,340108404.2


In [10]:
# Merge dataframes
result = pd.merge(solana_df.reset_index(),price_df.reset_index(), left_index=True, right_index=True, how="left")

In [11]:
# Calculate price change
result["change"]=result.price.pct_change()
result

Unnamed: 0,date_x,compound,positive,negative,neutral,binary score,new score,date_y,price,market_cap,total_volume,change
0,2021-04-06,0.658450,0.153000,0.000000,0.847000,1.000000,1,2021-04-06,22.942986,6143755029,3.243080e+08,
1,2021-04-07,0.938200,0.280000,0.000000,0.720000,1.000000,1,2021-04-07,24.989549,6677871480,4.041168e+08,0.089202
2,2021-04-09,-0.318200,0.080000,0.089000,0.831000,0.000000,0,2021-04-09,27.005146,7238834910,4.457241e+08,0.080658
3,2021-04-10,0.807400,0.128000,0.000000,0.872000,1.000000,1,2021-04-10,27.784698,7509326043,3.517542e+08,0.028867
4,2021-04-11,0.000000,0.000000,0.000000,1.000000,0.000000,0,2021-04-11,26.833966,7243681262,3.401084e+08,-0.034218
...,...,...,...,...,...,...,...,...,...,...,...,...
119,2021-08-28,0.465323,0.101154,0.009923,0.888923,0.461538,0,2021-08-28,87.968207,25629594571,3.193853e+09,0.155749
120,2021-08-29,0.168300,0.058250,0.032625,0.909125,0.125000,0,2021-08-29,97.066668,28329094352,2.343521e+09,0.103429
121,2021-08-30,0.275933,0.078619,0.027333,0.894048,0.476190,0,2021-08-30,94.009774,27304152717,1.619244e+09,-0.031493
122,2021-08-31,0.125087,0.058708,0.033667,0.907625,0.291667,0,2021-08-31,109.646733,31920673370,4.827156e+09,0.166333


In [12]:
#Drop null values 
result.dropna(inplace=True)

In [19]:
# Drop extra columns
result.drop(columns=['date_y', 'market_cap',"total_volume"])

KeyError: "['date_y' 'market_cap' 'total_volume'] not found in axis"

In [20]:
# Output merged dataframe
path=Path("Data/solana_sentiment_df.csv")
result.to_csv(path,index="date")