In [1]:
import os

import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tnrange, tqdm_notebook, tqdm

# Notebook configuration.
CURRENCY = "bitcoin"
CURRENCY_SYMBOL = "BTC"

# CSV of cleaned tweets that the notebook loads.
tweets_clean_file = "sample_testing_data.csv"

# Directory that receives the per-chunk CSV exports for this currency.
path = f"data/twitter/{CURRENCY_SYMBOL}"

In [None]:
# Load the cleaned tweets. low_memory=False makes pandas process the file as
# a whole, so column dtypes are inferred consistently instead of per chunk.
df_clean = pd.read_csv(tweets_clean_file, low_memory=False)

# Report (rows, columns), then preview the first five rows.
print(df_clean.shape)
df_clean.head()

In [3]:
# Order the tweets by ascending ID; later cells slice the frame positionally.
df_clean = df_clean.sort_values("ID", ascending=True)

In [None]:
# Run VADER over every tweet text and keep only the "compound" entry of the
# polarity scores it returns.
analyzer = SentimentIntensityAnalyzer()
compound = [
    analyzer.polarity_scores(text)["compound"]
    for text in tqdm(df_clean['Text'])
]
df_clean["compound"] = compound
df_clean.head(2)

In [None]:
# Weight each tweet's sentiment by its audience and engagement:
#
#     score = compound * followers * (likes + 1) * (retweets + 1)
#
# The "+ 1" terms keep a tweet with zero likes or zero retweets from having
# its score forced to 0 by that factor.
#
# This is computed column-wise rather than with a Python loop over
# DataFrame.iterrows(), which builds a Series per row and is far slower on
# large frames. The counts are cast to integers first, as the per-row int()
# calls did before (a missing count still raises instead of being skipped).
followers = df_clean["UserFollowerCount"].astype("int64")
likes = df_clean["Likes"].astype("int64")
retweets = df_clean["RetweetCount"].astype("int64")

# `compound` (a float column) is the left-most factor so the whole product is
# evaluated in floating point and cannot overflow int64.
scores = df_clean["compound"] * followers * (likes + 1) * (retweets + 1)
df_clean["score"] = scores
df_clean.head(2)

In [7]:
from datetime import datetime

In [None]:
# Export df_clean as a series of CSV files of at most `n` rows each. Every
# file is named "<first tweet time>~<last tweet time>.csv" inside `path`.
n = 20000
sep_char = '~'  # separates the two timestamps in the file name


def _file_stamp(created_at):
    """Convert a CreatedAt string into the timestamp used in file names.

    The input is parsed as e.g. 'Wed Oct 10 20:19:24 +0000 2018' and returned
    as '2018-10-10 20-19-24'. The time is spelled out as %H:%M:%S (what %X
    means in the C locale) so parsing does not depend on the locale's time
    format.
    """
    parsed = datetime.strptime(created_at, '%a %b %d %H:%M:%S %z %Y')
    return parsed.strftime('%Y-%m-%d %H-%M-%S')


# DataFrame.to_csv does not create missing directories, so make sure the
# target directory exists before the first write.
os.makedirs(path, exist_ok=True)

chunks_df = [df_clean[i:i+n] for i in range(0, df_clean.shape[0], n)]

for chunk_df in chunks_df:
    # ID range of the chunk; not used for the export itself, kept available
    # for later cells.
    chunk_min = chunk_df['ID'].min()
    chunk_max = chunk_df['ID'].max()

    # First and last rows of the chunk give the time span for the file name.
    date_from = _file_stamp(chunk_df.iloc[0]['CreatedAt'])
    date_to = _file_stamp(chunk_df.iloc[-1]['CreatedAt'])
    print(date_from, date_to)

    chunk_df.to_csv(f"{path}/{date_from}{sep_char}{date_to}.csv", header=True, index=False)
    


In [9]:
import glob
import numpy as np

# Fields stored in var.csv.
ENVS = [
    'CRYPTO',
    'LINE_COUNT',
    'MOST_RECENT_FILE',
    'MOST_RECENT_ID',
]

def get_var(key, crypto):
    """Look up one stored value for a crypto in data/twitter/var.csv.

    The file is re-read on every call. `key` is the column to read and
    `crypto` is matched against the CRYPTO column; the value from the first
    matching row is returned. Raises IndexError when no row matches.
    """
    table = pd.read_csv("data/twitter/var.csv", sep=',', dtype={'LINE_COUNT': np.int32})
    is_requested = table['CRYPTO'] == crypto
    matches = table[key].loc[is_requested]
    return matches.values[0]

def update_var(key, value, crypto):
    """Store `value` under column `key` for `crypto` in data/twitter/var.csv.

    The file is read, the cell(s) on rows whose CRYPTO equals `crypto` are
    replaced with ``str(value)``, and the whole file is written back. When no
    row matches, the file is rewritten without changes. Raises KeyError if
    `key` is not a column of the file.
    """
    df_var = pd.read_csv("data/twitter/var.csv", sep=',', dtype={'LINE_COUNT': np.int32})

    # The value is stored as text, so widen the target column to object first;
    # writing a string into an integer/float column is otherwise an
    # incompatible-dtype assignment.
    df_var[key] = df_var[key].astype(object)

    # Single .loc assignment on the frame itself. The previous chained form
    # (df_var[key].loc[mask] = ...) assigns to an intermediate Series, which
    # does not reliably update df_var (never under copy-on-write).
    df_var.loc[df_var['CRYPTO'] == crypto, key] = str(value)

    df_var.to_csv("data/twitter/var.csv", index=False)
    
def add_new_crypto(crypto):
    """Add a row for `crypto` to data/twitter/var.csv if it has none yet.

    The new row is ``[crypto, -1, "", 0]`` laid out in the column order given
    by ENVS. When a row whose CRYPTO equals `crypto` already exists, the file
    is left untouched.
    """
    df_var = pd.read_csv("data/twitter/var.csv", sep=',', dtype={'LINE_COUNT': np.int32})

    already_present = (df_var['CRYPTO'] == crypto).any()
    if already_present:
        return

    new_line = pd.DataFrame([[crypto, -1, "", 0]], columns=ENVS)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    df_var = pd.concat([df_var, new_line], ignore_index=True)
    df_var.to_csv("data/twitter/var.csv", index=False)