# Split the clean.csv file into multiple files. Compute VADER sentiment and score

In [1]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tnrange, tqdm_notebook, tqdm

# Define the currency
CURRENCY = "bitcoin"
CURRENCY_SYMBOL = "BTC"
tweets_clean_file = 'data/twitter/%s/%s_tweets_clean.csv'%(CURRENCY_SYMBOL,CURRENCY)
path = 'data/twitter/%s'%(CURRENCY_SYMBOL) #/{CURRENCY}_tweets_clean.csv'


## Read the cleaned file

In [2]:
df_clean = pd.read_csv(tweets_clean_file,low_memory=False)
print(df_clean.shape)
df_clean.head(5)

(737554, 7)


Unnamed: 0,ID,Text,UserName,UserFollowerCount,RetweetCount,Likes,CreatedAt
0,1161461727877165056,BTC ETH TG username,Maxie shane,60,0,0,Wed Aug 14 02:17:09 +0000 2019
1,1161461726975369216,RT : ♣️ Important announcement! ♣️AIO Casino: ...,merjjjj,394,75,0,Wed Aug 14 02:17:09 +0000 2019
2,1161461713822191617,RT : /5jypVGalN6 ® on SALE brand tech technolo...,»»TĦėDŐMAINS»»,1093,6,0,Wed Aug 14 02:17:06 +0000 2019
3,1161461700467601408,Bitcoin Cash BCH Current Price:$352.161 Hour: ...,aWebAnalysis,1122,0,0,Wed Aug 14 02:17:02 +0000 2019
4,1161461686726881281,RT : 🎁Win up to 1500 $TRX 🎁Steps:1. Retweet2. ...,merjjjj,394,97,0,Wed Aug 14 02:16:59 +0000 2019


In [3]:
df_clean = df_clean.sort_values(by='ID') # the bigger the ID, the most recent the tweet 

## Sentiment analysis with Vader

VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media.

VADER takes into account 
- negations and contractions (not good, wasn’t good)
- Punctuation (good!!!), CAPS, emotes :), emojis 
- Intensificators (very, kind of), acronyms ‘lol’
- Scores between -1.0 (negative) and 1.0 (positive)

We will use this sentiment analysis of the tweets to calculate a score that will represent the importance of each tweet.

In [4]:
analyzer = SentimentIntensityAnalyzer()
compound = []
for i,s in enumerate(tqdm(df_clean['Text'])):
    vs = analyzer.polarity_scores(s)
    compound.append(vs["compound"])
df_clean["compound"] = compound
df_clean.head(2)

100%|██████████| 737554/737554 [03:24<00:00, 3606.01it/s]


Unnamed: 0,ID,Text,UserName,UserFollowerCount,RetweetCount,Likes,CreatedAt,compound
737553,1150766972797366272,$BTC BTC Bitcoin Aroon Trend: 40% Bullish100% ...,Crypto Trader,733,0,0,Mon Jul 15 14:00:01 +0000 2019,0.0
737552,1150766972797366272,$BTC BTC Bitcoin Aroon Trend: 40% Bullish100% ...,Crypto Trader,733,0,0,Mon Jul 15 14:00:01 +0000 2019,0.0


## Calculate a score for each tweet

To calculate the score for each tweet, we use different variables to which we had a weight based on its importance.

The compound column represents the sentiment of the tweets and its value is between -1 and 1.

We also use the number of retweets, the number of likes, and the number of users that follow the tweet's author.

In [6]:
scores = []
for i, s in tqdm(df_clean.iterrows(), total=df_clean.shape[0]):
    scores.append(s["compound"] * ((int(s["UserFollowerCount"]))) * ((int(s["Likes"])+1)) *((int(s["RetweetCount"])+1)))
df_clean["score"] = scores
df_clean.head(2)

100%|██████████| 737554/737554 [01:53<00:00, 6490.51it/s]


Unnamed: 0,ID,Text,UserName,UserFollowerCount,RetweetCount,Likes,CreatedAt,compound,score
737553,1150766972797366272,$BTC BTC Bitcoin Aroon Trend: 40% Bullish100% ...,Crypto Trader,733,0,0,Mon Jul 15 14:00:01 +0000 2019,0.0,0.0
737552,1150766972797366272,$BTC BTC Bitcoin Aroon Trend: 40% Bullish100% ...,Crypto Trader,733,0,0,Mon Jul 15 14:00:01 +0000 2019,0.0,0.0


## Split dataframe and save it into multiple files

In [7]:
from datetime import datetime

In [8]:
n = 20000  #chunk row size
chunks_df = [df_clean[i:i+n] for i in range(0,df_clean.shape[0],n)]

sep_char = '~'
for chunk_df in chunks_df:
    chunk_min = chunk_df['ID'].min()
    chunk_max = chunk_df['ID'].max()
    date_from = (datetime.strptime(chunk_df.iloc[0]['CreatedAt'], '%a %b %d %X %z %Y')).strftime('%Y-%m-%d %H-%M-%S')
    date_to = (datetime.strptime(chunk_df.iloc[-1]['CreatedAt'], '%a %b %d %X %z %Y')).strftime('%Y-%m-%d %H-%M-%S')
    print(date_from, date_to)

    # Write into csv
    chunk_df.to_csv(f"{path}/{date_from}{sep_char}{date_to}.csv", header=True, index=False)
    


2019-07-15 14-00-01 2019-07-16 06-13-43
2019-07-16 06-13-43 2019-07-16 19-50-03
2019-07-16 19-50-03 2019-07-17 14-05-19
2019-07-17 14-05-20 2019-07-18 04-52-58
2019-07-18 04-53-04 2019-07-18 19-01-23
2019-07-18 19-01-31 2019-07-19 14-03-10
2019-07-19 14-03-17 2019-07-20 08-28-03
2019-07-20 08-28-08 2019-07-21 07-01-26
2019-07-21 07-01-37 2019-07-22 01-58-38
2019-07-22 01-58-40 2019-07-22 17-14-33
2019-07-22 17-14-33 2019-07-23 12-39-43
2019-07-23 12-39-48 2019-07-24 07-42-13
2019-07-24 07-42-15 2019-07-25 00-17-53
2019-07-25 00-17-56 2019-07-25 19-47-10
2019-07-25 19-47-12 2019-07-26 16-38-27
2019-07-26 16-38-31 2019-07-27 14-20-43
2019-07-27 14-20-59 2019-07-28 15-06-01
2019-07-28 15-06-06 2019-07-29 12-45-52
2019-07-29 12-45-52 2019-07-30 07-54-04
2019-07-30 07-54-04 2019-07-31 02-45-02
2019-07-31 02-45-04 2019-07-31 20-21-05
2019-07-31 20-21-09 2019-08-01 16-52-37
2019-08-01 16-52-41 2019-08-02 13-03-03
2019-08-02 13-03-05 2019-08-03 08-02-06
2019-08-03 08-02-07 2019-08-04 08-03-15


## Update var.csv

In [9]:
import glob
import numpy as np

ENVS = ['CRYPTO', 'LINE_COUNT', 'MOST_RECENT_FILE', 'MOST_RECENT_ID'] # Stored in var.csv

def get_var(key, crypto):
    df_var = pd.read_csv("data/twitter/var.csv", sep=',', dtype={'LINE_COUNT': np.int32})
    return df_var[key].loc[df_var['CRYPTO'] == crypto].values[0]

def update_var(key, value, crypto):
    df_var = pd.read_csv("data/twitter/var.csv", sep=',', dtype={'LINE_COUNT': np.int32})
    df_var[key].loc[df_var['CRYPTO'] == crypto] = str(value)
    df_var.to_csv("data/twitter/var.csv", index=False)
    
def add_new_crypto(crypto):
    df_var = pd.read_csv("data/twitter/var.csv", sep=',', dtype={'LINE_COUNT': np.int32})
    if df_var[ENVS[0]].loc[df_var['CRYPTO'] == crypto].empty:
        new_line = pd.DataFrame([[crypto,-1,"",0]], columns=ENVS)
        df_var = df_var.append(new_line)
        df_var.to_csv("data/twitter/var.csv", index=False)

In [10]:
files = glob.glob("%s/*~*.csv"%(path))
files = sorted(files)
last_file = files[-1]
print(last_file)
last_df = pd.read_csv(last_file)
last_elem = last_df.tail(1)
print(last_elem['ID'])
print(last_df.shape)

add_new_crypto(CURRENCY_SYMBOL)
update_var(ENVS[1], last_df.shape[0], CURRENCY_SYMBOL)
update_var(ENVS[2], last_file, CURRENCY_SYMBOL)
update_var(ENVS[3], last_elem, CURRENCY_SYMBOL)

data/twitter/BTC/2019-08-13 09-38-07~2019-08-14 02-17-09.csv
17553    1161461727877165056
Name: ID, dtype: int64
(17554, 9)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [11]:
var_df = pd.read_csv('data/twitter/var.csv')
var_df

Unnamed: 0,CRYPTO,LINE_COUNT,MOST_RECENT_FILE,MOST_RECENT_ID
0,BTC,17554,data/twitter/BTC/2019-08-13 09-38-07~2019-08-1...,ID Te...
