In [1]:
import pandas as pd
# Load the filtered tweet dataset from the working directory into `df`.
df = pd.read_csv("Filtered tweets.csv")

In [2]:
df.shape

(1331668, 9)

In [3]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pathlib import Path
from nltk.tokenize import TweetTokenizer

def preprocess_text(text, stop_words, lemmatizer):
    """
    Clean and preprocess tweet text.

    Steps, in order: strip a leading retweet marker, lowercase, remove URLs
    and @mentions, drop every non-alphabetic character, tokenize, lemmatize
    tokens longer than three characters, and remove stopwords.

    Parameters:
    - text (str): Tweet text. Non-string values (e.g. the float NaN pandas
      produces for an empty CSV field) yield an empty string.
    - stop_words (set): Set of English stopwords.
    - lemmatizer (WordNetLemmatizer): Lemmatizer object.

    Returns:
    - str: Preprocessed text, tokens joined by single spaces.
    """
    # A missing tweet has nothing to clean; avoid AttributeError on .lower().
    if not isinstance(text, str):
        return ""

    # Strip the retweet marker BEFORE lowercasing: the pattern is upper-case,
    # so it can never match once the text has been lowercased.
    text = re.sub(r"^RT[\s]+", "", text)

    # Convert to lowercase
    text = text.lower()

    # Remove URLs and mentions
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\@\w+", "", text)

    # Remove non-alphabetic characters
    text = re.sub(r"[^a-zA-Z\s]", "", text)

    # Tokenization
    tokens = TweetTokenizer().tokenize(text)

    # Lemmatization for long words; short tokens are kept verbatim
    tokens = [lemmatizer.lemmatize(token) if len(token) > 3 else token for token in tokens]

    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]

    # Defensive filter for punctuation-only and digit-only tokens
    return " ".join(
        token for token in tokens
        if token not in string.punctuation and not token.isdigit()
    )

In [4]:
# Fetch the NLTK resources used by the preprocessing step: the stopword
# lists and the WordNet data needed by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\isaen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\isaen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
# Build the shared NLP resources once, then clean every tweet with them.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
df["cleaned_text"] = df["text"].apply(preprocess_text, args=(stop_words, lemmatizer))

In [7]:
df

Unnamed: 0,datetime,date,username,text,Date_hourly,Date_min,Date_day,Date_30m,Date_15m,cleaned_text
0,2022-01-01 00:00:00+00:00,2022-01-01,CryptoNerdApp,Current Price of Bitcoin:\n$46320 (-1.85%)\n\n...,2022-01-01 01:00:00+00:00,2022-01-01 00:01:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00+00:00,2022-01-01 00:15:00+00:00,current price bitcoin btc bitcoin cryptocurren...
1,2022-01-01 17:10:02+00:00,2022-01-01,bitcoinalerts,The year for Bitcoin — A 2021 roundup of the f...,2022-01-01 18:00:00+00:00,2022-01-01 17:11:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 17:30:00+00:00,2022-01-01 17:15:00+00:00,year bitcoin roundup flagship crypto
2,2022-01-01 17:10:00+00:00,2022-01-01,oneandonlypd,#Bitcoin is all our savings. https://t.co/SGgy...,2022-01-01 18:00:00+00:00,2022-01-01 17:11:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 17:30:00+00:00,2022-01-01 17:15:00+00:00,bitcoin saving
3,2022-01-01 17:09:59+00:00,2022-01-01,CHAIRFORCE_BTC,@gmekhail Isn't that usually shouted at the pe...,2022-01-01 18:00:00+00:00,2022-01-01 17:10:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 17:30:00+00:00,2022-01-01 17:15:00+00:00,isnt usually shouted people promoting going bi...
4,2022-01-01 17:09:58+00:00,2022-01-01,crypto_squared,What are Decentralized Apps or DApps?\n#Crypto...,2022-01-01 18:00:00+00:00,2022-01-01 17:10:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 17:30:00+00:00,2022-01-01 17:15:00+00:00,decentralized apps dapps cryptonews bitcoin cr...
...,...,...,...,...,...,...,...,...,...,...
1331663,2022-01-31 10:17:58+00:00,2022-01-31,cstross,@ignaziop1977 @MLK3030 @SHOKUNIN_STUDIO At the...,2022-01-31 11:00:00+00:00,2022-01-31 10:18:00+00:00,2022-02-01 00:00:00+00:00,2022-01-31 10:30:00+00:00,2022-01-31 10:30:00+00:00,point wrote book unaware existence blockchainb...
1331664,2022-01-31 10:18:02+00:00,2022-01-31,S_Edalati,Cryptocurrency Prices On January 31 2021: Know...,2022-01-31 11:00:00+00:00,2022-01-31 10:19:00+00:00,2022-02-01 00:00:00+00:00,2022-01-31 10:30:00+00:00,2022-01-31 10:30:00+00:00,cryptocurrency price january know rate bitcoin...
1331665,2022-01-31 10:18:05+00:00,2022-01-31,bitcoinpressuk,#Bitcoin #BTC #CRYPTO US Financial Advisors Ex...,2022-01-31 11:00:00+00:00,2022-01-31 10:19:00+00:00,2022-02-01 00:00:00+00:00,2022-01-31 10:30:00+00:00,2022-01-31 10:30:00+00:00,bitcoin btc crypto us financial advisor expect...
1331666,2022-01-31 10:17:28+00:00,2022-01-31,AlfaHedge,How low can #Bitcoin go ???\n\nExperts weigh i...,2022-01-31 11:00:00+00:00,2022-01-31 10:18:00+00:00,2022-02-01 00:00:00+00:00,2022-01-31 10:30:00+00:00,2022-01-31 10:30:00+00:00,low bitcoin go expert weigh cryptonews cryptoc...


In [8]:
# Persist the frame (now including `cleaned_text`) without writing the index.
df.to_csv("Cleaned_tweets.csv", index = False)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load the tokenizer and the sequence-classification model identified by
# "kk08/CryptoBERT".
tokenizer = BertTokenizer.from_pretrained("kk08/CryptoBERT")
model = BertForSequenceClassification.from_pretrained("kk08/CryptoBERT")

# Wrap both in a sentiment-analysis pipeline and smoke-test it on one headline.
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer )
text = "Bitcoin (BTC) touches $29k, Ethereum (ETH) Set To Explode, RenQ Finance (RENQ) Crosses Massive Milestone"
result = classifier(text)
print(result)

In [None]:
1331668/3

443889.3333333333

In [None]:
# Create a single-column frame for the CryptoBERT scores; it starts with zero rows.
Scores_df = pd.DataFrame({"Scores_kk08":[]})

In [None]:
Scores_df

Unnamed: 0,Scores_kk08


In [None]:
df

Unnamed: 0,datetime,date,username,text,Date_hourly,Date_min,Date_day,Date_30m,Date_15m,cleaned_text
0,2022-01-01 00:00:00+00:00,2022-01-01,CryptoNerdApp,Current Price of Bitcoin:\n$46320 (-1.85%)\n\n...,2022-01-01 01:00:00+00:00,2022-01-01 00:01:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00+00:00,2022-01-01 00:15:00+00:00,current price bitcoin btc bitcoin cryptocurren...
1,2022-01-01 17:10:02+00:00,2022-01-01,bitcoinalerts,The year for Bitcoin — A 2021 roundup of the f...,2022-01-01 18:00:00+00:00,2022-01-01 17:11:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 17:30:00+00:00,2022-01-01 17:15:00+00:00,year bitcoin roundup flagship crypto
2,2022-01-01 17:10:00+00:00,2022-01-01,oneandonlypd,#Bitcoin is all our savings. https://t.co/SGgy...,2022-01-01 18:00:00+00:00,2022-01-01 17:11:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 17:30:00+00:00,2022-01-01 17:15:00+00:00,bitcoin saving
3,2022-01-01 17:09:59+00:00,2022-01-01,CHAIRFORCE_BTC,@gmekhail Isn't that usually shouted at the pe...,2022-01-01 18:00:00+00:00,2022-01-01 17:10:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 17:30:00+00:00,2022-01-01 17:15:00+00:00,isnt usually shouted people promoting going bi...
4,2022-01-01 17:09:58+00:00,2022-01-01,crypto_squared,What are Decentralized Apps or DApps?\n#Crypto...,2022-01-01 18:00:00+00:00,2022-01-01 17:10:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 17:30:00+00:00,2022-01-01 17:15:00+00:00,decentralized apps dapps cryptonews bitcoin cr...
...,...,...,...,...,...,...,...,...,...,...
1331663,2022-01-31 10:17:58+00:00,2022-01-31,cstross,@ignaziop1977 @MLK3030 @SHOKUNIN_STUDIO At the...,2022-01-31 11:00:00+00:00,2022-01-31 10:18:00+00:00,2022-02-01 00:00:00+00:00,2022-01-31 10:30:00+00:00,2022-01-31 10:30:00+00:00,point wrote book unaware existence blockchainb...
1331664,2022-01-31 10:18:02+00:00,2022-01-31,S_Edalati,Cryptocurrency Prices On January 31 2021: Know...,2022-01-31 11:00:00+00:00,2022-01-31 10:19:00+00:00,2022-02-01 00:00:00+00:00,2022-01-31 10:30:00+00:00,2022-01-31 10:30:00+00:00,cryptocurrency price january know rate bitcoin...
1331665,2022-01-31 10:18:05+00:00,2022-01-31,bitcoinpressuk,#Bitcoin #BTC #CRYPTO US Financial Advisors Ex...,2022-01-31 11:00:00+00:00,2022-01-31 10:19:00+00:00,2022-02-01 00:00:00+00:00,2022-01-31 10:30:00+00:00,2022-01-31 10:30:00+00:00,bitcoin btc crypto us financial advisor expect...
1331666,2022-01-31 10:17:28+00:00,2022-01-31,AlfaHedge,How low can #Bitcoin go ???\n\nExperts weigh i...,2022-01-31 11:00:00+00:00,2022-01-31 10:18:00+00:00,2022-02-01 00:00:00+00:00,2022-01-31 10:30:00+00:00,2022-01-31 10:30:00+00:00,low bitcoin go expert weigh cryptonews cryptoc...


In [None]:
len(set(df["Date_hourly"]))
#full hours

744

In [None]:
df["Date_hourly"].unique()

array(['2022-01-01 01:00:00+00:00', '2022-01-01 18:00:00+00:00',
       '2022-01-01 17:00:00+00:00', '2022-01-01 19:00:00+00:00',
       '2022-01-01 15:00:00+00:00', '2022-01-01 16:00:00+00:00',
       '2022-01-01 14:00:00+00:00', '2022-01-01 23:00:00+00:00',
       '2022-01-01 22:00:00+00:00', '2022-01-02 00:00:00+00:00',
       '2022-01-01 20:00:00+00:00', '2022-01-01 21:00:00+00:00',
       '2022-01-01 04:00:00+00:00', '2022-01-01 05:00:00+00:00',
       '2022-01-01 03:00:00+00:00', '2022-01-01 06:00:00+00:00',
       '2022-01-01 07:00:00+00:00', '2022-01-01 02:00:00+00:00',
       '2022-01-01 12:00:00+00:00', '2022-01-01 13:00:00+00:00',
       '2022-01-01 11:00:00+00:00', '2022-01-01 08:00:00+00:00',
       '2022-01-01 09:00:00+00:00', '2022-01-01 10:00:00+00:00',
       '2022-01-02 18:00:00+00:00', '2022-01-02 17:00:00+00:00',
       '2022-01-02 19:00:00+00:00', '2022-01-02 15:00:00+00:00',
       '2022-01-02 16:00:00+00:00', '2022-01-03 00:00:00+00:00',
       '2022-01-02 23:00:

In [None]:
# ivan
# First chunk of the dataset: score rows 0 .. 443889, one tweet per call,
# echoing the row number after each prediction.
Scores_kk08_ivan = []
for i in range(0, 443890):
    prediction = classifier(df['text'][i])
    Scores_kk08_ivan += [prediction]
    print(i)

In [None]:
# ivan
# Score the first chunk walking from its last row down to row 0.
# The previous bounds, range(443890, 0, -1), skipped row 0, included row
# 443890 (which belongs to the next chunk) and appended results in reverse
# row order. Writing each result into slot `i` keeps the list aligned with
# the tweet rows regardless of iteration direction.
Scores_kk08_ivan = [None] * 443890
for i in range(443890 - 1, -1, -1):
    Scores_kk08_ivan[i] = classifier(df['text'][i])
    print(i)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
351233
351232
351231
351230
351229
351228
351227
351226
351225
351224
351223
351222
351221
351220
351219
351218
351217
351216
351215
351214
351213
351212
351211
351210
351209
351208
351207
351206
351205
351204
351203
351202
351201
351200
351199
351198
351197
351196
351195
351194
351193
351192
351191
351190
351189
351188
351187
351186
351185
351184
351183
351182
351181
351180
351179
351178
351177
351176
351175
351174
351173
351172
351171
351170
351169
351168
351167
351166
351165
351164
351163
351162
351161
351160
351159
351158
351157
351156
351155
351154
351153
351152
351151
351150
351149
351148
351147
351146
351145
351144
351143
351142
351141
351140
351139
351138
351137
351136
351135
351134
351133
351132
351131
351130
351129
351128
351127
351126
351125
351124
351123
351122
351121
351120
351119
351118
351117
351116
351115
351114
351113
351112
351111
351110
351109
351108
351107
351106
351105
351104
351103
3

In [None]:
# Wrap this chunk's scores in a one-column frame. Slice-assigning into the
# zero-row `Scores_df` cannot work: an empty column has no slots to receive
# the 443890 values.
Scores_df = pd.DataFrame({"Scores_kk08": Scores_kk08_ivan})

In [None]:
# `to_csv` is a DataFrame method; pandas has no module-level `pd.to_csv`.
# The ".csv" suffix matches the file name used when the scores are read back.
Scores_df.to_csv("Scores_kk08_ivan.csv", index = False)

In [None]:
443890*2

887780

In [None]:
# lorenzo
# Second chunk: score rows 443890 .. 887779, one tweet per call.
# NOTE(review): `filtered_df_01` is not defined in the visible cells of this
# notebook (the first chunk reads from `df`) — confirm where it comes from.
Scores_kk08_lorenzo = []
for i in range(443890, 887780):
    Scores_kk08_lorenzo.append(classifier(filtered_df_01['text'][i]))
    print(i)

In [None]:
# Save only this chunk's scores. Slice-assigning into the zero-row
# `Scores_df` cannot work, and `to_csv` is a DataFrame method (there is no
# `pd.to_csv`). The ".csv" suffix matches the name used when reading back.
Scores_df = pd.DataFrame({"Scores_kk08": Scores_kk08_lorenzo})
Scores_df.to_csv("Scores_kk08_lorenzo.csv", index = False)

In [None]:
# paola
# Third chunk: score rows 887780 .. end of the frame, one tweet per call.
# NOTE(review): `filtered_df_01` is not defined in the visible cells of this
# notebook (the first chunk reads from `df`) — confirm where it comes from.
Scores_kk08_paola = []
for i in range(887780, len(filtered_df_01)):
    Scores_kk08_paola.append(classifier(filtered_df_01['text'][i]))
    print(i)

In [None]:
# Save only this chunk's scores. Slice-assigning into the zero-row
# `Scores_df` cannot work, and `to_csv` is a DataFrame method (there is no
# `pd.to_csv`). The ".csv" suffix matches the name used when reading back.
Scores_df = pd.DataFrame({"Scores_kk08": Scores_kk08_paola})
Scores_df.to_csv("Scores_kk08_paola.csv", index = False)

### Don't go further!!!

In [None]:
# NOTE(review): `Scores_kk08` is not defined in the visible cells (only the
# per-person `Scores_kk08_*` lists are) — confirm before running this cell.
df['kk08_CryptoBert']=Scores_kk08

In [None]:
def kk_08_label(score):
    """Return the 'label' entry of the first prediction in `score`."""
    first_prediction = score[0]
    return first_prediction['label']
def kk_08_score(score):
    """Return the 'score' entry of the first prediction in `score`."""
    first_prediction = score[0]
    return first_prediction['score']

In [None]:
# Split each stored prediction into separate label and score columns.
df['kk08_label']=df['kk08_CryptoBert'].apply(kk_08_label)
df['kk08_score']=df['kk08_CryptoBert'].apply(kk_08_score)

In [None]:
# there are too many tweets

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Get sentiment scores for a single tweet (the row labelled 2) as a smoke test
# NOTE(review): `filtered_df_01` is not defined in the visible cells — confirm its origin.
sentiment_score = analyzer.polarity_scores(filtered_df_01["text"][2])

# Print the sentiment scores
print(sentiment_score)


{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


In [None]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()



# VADER sentiment helper
def get_sentiment(text):
    """Return VADER's 'compound' polarity value for `text`.

    Uses the notebook-level `analyzer`. The score dict also carries other
    entries such as 'pos' and 'neg' if a different measure is wanted.
    """
    return analyzer.polarity_scores(text)['compound']

# Apply the function to the first 100000 rows of the 'text' column only;
# rows beyond that slice get no value (they show as empty in the output).
filtered_df_01['sentiment'] = filtered_df_01['text'][:100000].apply(get_sentiment)


filtered_df_01

Unnamed: 0,datetime,date,username,text,Date_hourly,Date_min,Date_day,Date_30m,Date_15m,sentiment
0,2022-01-01 00:00:00+00:00,2022-01-01,CryptoNerdApp,Current Price of Bitcoin:\n$46320 (-1.85%)\n\n...,2022-01-01 01:00:00+00:00,2022-01-01 00:01:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00+00:00,2022-01-01 00:15:00+00:00,0.0000
1,2022-01-01 17:10:02+00:00,2022-01-01,bitcoinalerts,The year for Bitcoin — A 2021 roundup of the f...,2022-01-01 18:00:00+00:00,2022-01-01 17:11:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 17:30:00+00:00,2022-01-01 17:15:00+00:00,0.1027
2,2022-01-01 17:10:00+00:00,2022-01-01,oneandonlypd,#Bitcoin is all our savings. https://t.co/SGgy...,2022-01-01 18:00:00+00:00,2022-01-01 17:11:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 17:30:00+00:00,2022-01-01 17:15:00+00:00,0.0000
3,2022-01-01 17:09:59+00:00,2022-01-01,CHAIRFORCE_BTC,@gmekhail Isn't that usually shouted at the pe...,2022-01-01 18:00:00+00:00,2022-01-01 17:10:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 17:30:00+00:00,2022-01-01 17:15:00+00:00,0.5256
4,2022-01-01 17:09:58+00:00,2022-01-01,crypto_squared,What are Decentralized Apps or DApps?\n#Crypto...,2022-01-01 18:00:00+00:00,2022-01-01 17:10:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 17:30:00+00:00,2022-01-01 17:15:00+00:00,0.0000
...,...,...,...,...,...,...,...,...,...,...
1331663,2022-01-31 10:17:58+00:00,2022-01-31,cstross,@ignaziop1977 @MLK3030 @SHOKUNIN_STUDIO At the...,2022-01-31 11:00:00+00:00,2022-01-31 10:18:00+00:00,2022-02-01 00:00:00+00:00,2022-01-31 10:30:00+00:00,2022-01-31 10:30:00+00:00,
1331664,2022-01-31 10:18:02+00:00,2022-01-31,S_Edalati,Cryptocurrency Prices On January 31 2021: Know...,2022-01-31 11:00:00+00:00,2022-01-31 10:19:00+00:00,2022-02-01 00:00:00+00:00,2022-01-31 10:30:00+00:00,2022-01-31 10:30:00+00:00,
1331665,2022-01-31 10:18:05+00:00,2022-01-31,bitcoinpressuk,#Bitcoin #BTC #CRYPTO US Financial Advisors Ex...,2022-01-31 11:00:00+00:00,2022-01-31 10:19:00+00:00,2022-02-01 00:00:00+00:00,2022-01-31 10:30:00+00:00,2022-01-31 10:30:00+00:00,
1331666,2022-01-31 10:17:28+00:00,2022-01-31,AlfaHedge,How low can #Bitcoin go ???\n\nExperts weigh i...,2022-01-31 11:00:00+00:00,2022-01-31 10:18:00+00:00,2022-02-01 00:00:00+00:00,2022-01-31 10:30:00+00:00,2022-01-31 10:30:00+00:00,


In [None]:
import pandas as pd
import re

# Sample tweet data
data = {'text': ['I love #Bitcoin and #Crypto', 'This is a great #AI!', '#Hate speech is bad #!', 'Just another #hashtag']}
# Use a dedicated name for the demo frame: rebinding `df` here would replace
# the tweet DataFrame that other cells of this notebook read from.
hashtag_demo_df = pd.DataFrame(data)

# Function to clean hashtags that are not followed by normal text
def clean_hashtags(tweet):
    """Remove hashtags that end the tweet or are directly followed by punctuation.

    A hashtag is `#` plus one or more word characters. It is removed when
    only optional whitespace separates it from the end of the string, or when
    the character right after it is neither a word character nor whitespace.
    Hashtags followed by whitespace and further text are kept, and the
    whitespace around a removed hashtag is left in place.

    Parameters:
    - tweet (str): Tweet text.

    Returns:
    - str: Tweet text with the matching hashtags removed.
    """
    # The lookahead checks what follows the hashtag without consuming it.
    tweet = re.sub(r'#\w+(?=\s*$|[^\w\s])', '', tweet)  # Remove isolated or trailing hashtags
    return tweet

# Apply the function to the 'text' column
hashtag_demo_df['cleaned_text'] = hashtag_demo_df['text'].apply(clean_hashtags)

# Display the cleaned tweets
print(hashtag_demo_df)


                          text            cleaned_text
0  I love #Bitcoin and #Crypto    I love #Bitcoin and 
1         This is a great #AI!       This is a great !
2       #Hate speech is bad #!  #Hate speech is bad #!
3        Just another #hashtag           Just another 


In [None]:
import pandas as pd

In [10]:
# Load the three per-person score files.
# Note: the third frame is named `scores_claudia` but is read from the "paola" file.
scores_ivan = pd.read_csv("Scores_kk08_ivan.csv")
scores_lorenzo = pd.read_csv("Scores_kk08_lorenzo.csv")
scores_claudia = pd.read_csv("Scores_kk08_paola.csv")

In [11]:
scores_claudia

Unnamed: 0,Scores_kk08
0,"[{'label': 'LABEL_1', 'score': 0.9600918889045..."
1,"[{'label': 'LABEL_1', 'score': 0.9335080385208..."
2,"[{'label': 'LABEL_1', 'score': 0.9581606984138..."
3,"[{'label': 'LABEL_1', 'score': 0.9460154771804..."
4,"[{'label': 'LABEL_1', 'score': 0.9052924513816..."
...,...
443883,"[{'label': 'LABEL_0', 'score': 0.5097414255142..."
443884,"[{'label': 'LABEL_1', 'score': 0.9015001058578..."
443885,"[{'label': 'LABEL_1', 'score': 0.9661732316017..."
443886,"[{'label': 'LABEL_1', 'score': 0.6327111124992..."


In [14]:
# Stack the three chunks vertically and renumber the rows from 0.
df_scores = pd.concat([scores_ivan, scores_lorenzo, scores_claudia], ignore_index=True)

In [15]:
df_scores

Unnamed: 0,Scores_kk08
0,"[{'label': 'LABEL_1', 'score': 0.8663870692253..."
1,"[{'label': 'LABEL_1', 'score': 0.9671657681465..."
2,"[{'label': 'LABEL_1', 'score': 0.9670489430427..."
3,"[{'label': 'LABEL_1', 'score': 0.9589074850082..."
4,"[{'label': 'LABEL_1', 'score': 0.8795316815376..."
...,...
1331663,"[{'label': 'LABEL_0', 'score': 0.5097414255142..."
1331664,"[{'label': 'LABEL_1', 'score': 0.9015001058578..."
1331665,"[{'label': 'LABEL_1', 'score': 0.9661732316017..."
1331666,"[{'label': 'LABEL_1', 'score': 0.6327111124992..."


In [16]:
import ast

def _kk08_first_prediction(score):
    """Return the first prediction dict from `score`, parsing it when it is a string."""
    if isinstance(score, str):
        # Scores read back from CSV are the text form of the stored list of
        # dicts; literal_eval rebuilds it without executing arbitrary code.
        score = ast.literal_eval(score)
    return score[0]

def kk_08_label(score):
    """Return the 'label' of the first prediction.

    Accepts either the string form stored in the score CSVs or an
    already-parsed list of prediction dicts.
    """
    return _kk08_first_prediction(score)['label']
def kk_08_score(score):
    """Return the 'score' of the first prediction.

    Accepts either the string form stored in the score CSVs or an
    already-parsed list of prediction dicts.
    """
    return _kk08_first_prediction(score)['score']

In [17]:
# Parse each stored prediction string into separate label and score columns.
df_scores['kk08_label']=df_scores['Scores_kk08'].apply(kk_08_label)
df_scores['kk08_score']=df_scores['Scores_kk08'].apply(kk_08_score)

In [18]:
df_scores

Unnamed: 0,Scores_kk08,kk08_label,kk08_score
0,"[{'label': 'LABEL_1', 'score': 0.8663870692253...",LABEL_1,0.866387
1,"[{'label': 'LABEL_1', 'score': 0.9671657681465...",LABEL_1,0.967166
2,"[{'label': 'LABEL_1', 'score': 0.9670489430427...",LABEL_1,0.967049
3,"[{'label': 'LABEL_1', 'score': 0.9589074850082...",LABEL_1,0.958907
4,"[{'label': 'LABEL_1', 'score': 0.8795316815376...",LABEL_1,0.879532
...,...,...,...
1331663,"[{'label': 'LABEL_0', 'score': 0.5097414255142...",LABEL_0,0.509741
1331664,"[{'label': 'LABEL_1', 'score': 0.9015001058578...",LABEL_1,0.901500
1331665,"[{'label': 'LABEL_1', 'score': 0.9661732316017...",LABEL_1,0.966173
1331666,"[{'label': 'LABEL_1', 'score': 0.6327111124992...",LABEL_1,0.632711


In [19]:
# Put every prediction on one scale: keep the score unless the predicted
# label is LABEL_0, in which case use its complement (1 - score).
# NOTE(review): reading LABEL_1 as "positive" follows the column name —
# confirm against the model's label mapping.
# Vectorised with Series.where instead of a row-wise apply, which is much
# slower on a frame of this size and yields the same values.
is_label_0 = df_scores['kk08_label'] == 'LABEL_0'
df_scores['Positive_score'] = df_scores['kk08_score'].where(~is_label_0, 1 - df_scores['kk08_score'])

In [20]:
df_scores[df_scores['kk08_label']=="LABEL_0"]

Unnamed: 0,Scores_kk08,kk08_label,kk08_score,Positive_score
6,"[{'label': 'LABEL_0', 'score': 0.9561374783515...",LABEL_0,0.956137,0.043863
9,"[{'label': 'LABEL_0', 'score': 0.9560947418212...",LABEL_0,0.956095,0.043905
20,"[{'label': 'LABEL_0', 'score': 0.9606007933616...",LABEL_0,0.960601,0.039399
23,"[{'label': 'LABEL_0', 'score': 0.8907170295715...",LABEL_0,0.890717,0.109283
25,"[{'label': 'LABEL_0', 'score': 0.9740657806396...",LABEL_0,0.974066,0.025934
...,...,...,...,...
1331654,"[{'label': 'LABEL_0', 'score': 0.7042182087898...",LABEL_0,0.704218,0.295782
1331656,"[{'label': 'LABEL_0', 'score': 0.9253163337707...",LABEL_0,0.925316,0.074684
1331660,"[{'label': 'LABEL_0', 'score': 0.9604796171188...",LABEL_0,0.960480,0.039520
1331661,"[{'label': 'LABEL_0', 'score': 0.6241708397865...",LABEL_0,0.624171,0.375829


In [21]:
# Attach the score column side by side. concat(axis=1) aligns rows on the
# index, so this relies on `df` and `df_scores` sharing the same row labels.
# NOTE(review): confirm `df` still holds the tweet frame in its original row order here.
current_df = pd.concat([df, df_scores["Positive_score"]], axis=1)

In [22]:
current_df

Unnamed: 0,datetime,date,username,text,Date_hourly,Date_min,Date_day,Date_30m,Date_15m,cleaned_text,Positive_score
0,2022-01-01 00:00:00+00:00,2022-01-01,CryptoNerdApp,Current Price of Bitcoin:\n$46320 (-1.85%)\n\n...,2022-01-01 01:00:00+00:00,2022-01-01 00:01:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00+00:00,2022-01-01 00:15:00+00:00,current price bitcoin btc bitcoin cryptocurren...,0.866387
1,2022-01-01 17:10:02+00:00,2022-01-01,bitcoinalerts,The year for Bitcoin — A 2021 roundup of the f...,2022-01-01 18:00:00+00:00,2022-01-01 17:11:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 17:30:00+00:00,2022-01-01 17:15:00+00:00,year bitcoin roundup flagship crypto,0.967166
2,2022-01-01 17:10:00+00:00,2022-01-01,oneandonlypd,#Bitcoin is all our savings. https://t.co/SGgy...,2022-01-01 18:00:00+00:00,2022-01-01 17:11:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 17:30:00+00:00,2022-01-01 17:15:00+00:00,bitcoin saving,0.967049
3,2022-01-01 17:09:59+00:00,2022-01-01,CHAIRFORCE_BTC,@gmekhail Isn't that usually shouted at the pe...,2022-01-01 18:00:00+00:00,2022-01-01 17:10:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 17:30:00+00:00,2022-01-01 17:15:00+00:00,isnt usually shouted people promoting going bi...,0.958907
4,2022-01-01 17:09:58+00:00,2022-01-01,crypto_squared,What are Decentralized Apps or DApps?\n#Crypto...,2022-01-01 18:00:00+00:00,2022-01-01 17:10:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 17:30:00+00:00,2022-01-01 17:15:00+00:00,decentralized apps dapps cryptonews bitcoin cr...,0.879532
...,...,...,...,...,...,...,...,...,...,...,...
1331663,2022-01-31 10:17:58+00:00,2022-01-31,cstross,@ignaziop1977 @MLK3030 @SHOKUNIN_STUDIO At the...,2022-01-31 11:00:00+00:00,2022-01-31 10:18:00+00:00,2022-02-01 00:00:00+00:00,2022-01-31 10:30:00+00:00,2022-01-31 10:30:00+00:00,point wrote book unaware existence blockchainb...,0.490259
1331664,2022-01-31 10:18:02+00:00,2022-01-31,S_Edalati,Cryptocurrency Prices On January 31 2021: Know...,2022-01-31 11:00:00+00:00,2022-01-31 10:19:00+00:00,2022-02-01 00:00:00+00:00,2022-01-31 10:30:00+00:00,2022-01-31 10:30:00+00:00,cryptocurrency price january know rate bitcoin...,0.901500
1331665,2022-01-31 10:18:05+00:00,2022-01-31,bitcoinpressuk,#Bitcoin #BTC #CRYPTO US Financial Advisors Ex...,2022-01-31 11:00:00+00:00,2022-01-31 10:19:00+00:00,2022-02-01 00:00:00+00:00,2022-01-31 10:30:00+00:00,2022-01-31 10:30:00+00:00,bitcoin btc crypto us financial advisor expect...,0.966173
1331666,2022-01-31 10:17:28+00:00,2022-01-31,AlfaHedge,How low can #Bitcoin go ???\n\nExperts weigh i...,2022-01-31 11:00:00+00:00,2022-01-31 10:18:00+00:00,2022-02-01 00:00:00+00:00,2022-01-31 10:30:00+00:00,2022-01-31 10:30:00+00:00,low bitcoin go expert weigh cryptonews cryptoc...,0.632711


In [23]:
# Order the tweets by their `datetime` value and renumber the rows from 0.
current_df = current_df.sort_values("datetime", ignore_index=True)

In [24]:
current_df

Unnamed: 0,datetime,date,username,text,Date_hourly,Date_min,Date_day,Date_30m,Date_15m,cleaned_text,Positive_score
0,2022-01-01 00:00:00+00:00,2022-01-01,CryptoNerdApp,Current Price of Bitcoin:\n$46320 (-1.85%)\n\n...,2022-01-01 01:00:00+00:00,2022-01-01 00:01:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00+00:00,2022-01-01 00:15:00+00:00,current price bitcoin btc bitcoin cryptocurren...,0.866387
1,2022-01-01 00:00:00+00:00,2022-01-01,TrendSpider,$BTC Continues to bounce off the YTD anchored ...,2022-01-01 01:00:00+00:00,2022-01-01 00:01:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00+00:00,2022-01-01 00:15:00+00:00,btc continues bounce ytd anchored vwap bitcoin,0.960633
2,2022-01-01 00:00:00+00:00,2022-01-01,MadStudentScie1,Generating misunderstanding for Bitcoin!,2022-01-01 01:00:00+00:00,2022-01-01 00:01:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00+00:00,2022-01-01 00:15:00+00:00,generating misunderstanding bitcoin,0.512859
3,2022-01-01 00:00:01+00:00,2022-01-01,WIRTUALapp,HAPPY NEW YEAR 2022! Experience a new way to e...,2022-01-01 01:00:00+00:00,2022-01-01 00:01:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00+00:00,2022-01-01 00:15:00+00:00,happy new year experience new way exercise wir...,0.962380
4,2022-01-01 00:00:01+00:00,2022-01-01,takeoff_tech,【$TKO token #1】\n\nTKO token will be a utility...,2022-01-01 01:00:00+00:00,2022-01-01 00:01:00+00:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00+00:00,2022-01-01 00:15:00+00:00,tko token tko token utility token service fee ...,0.897967
...,...,...,...,...,...,...,...,...,...,...,...
1331663,2022-01-31 23:59:46+00:00,2022-01-31,deyonte_btc,One of the largest asset managers on the plane...,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00+00:00,one largest asset manager planet trillion aum ...,0.027183
1331664,2022-01-31 23:59:52+00:00,2022-01-31,Baripondiss,"Lol... Oh, you just realized, you're still ear...",2022-02-01 00:00:00+00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00+00:00,lol oh realized youre still early welcome bitcoin,0.963527
1331665,2022-01-31 23:59:52+00:00,2022-01-31,galaxy_orion,@julesapril8 @jimcramer I know. \n\n- He instr...,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00+00:00,know instruct agency ban bitcoin russia get wo...,0.022304
1331666,2022-01-31 23:59:55+00:00,2022-01-31,BitcoinFeesCash,Updated Bitcoin transaction fees: \n \nBCH ...,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00+00:00,updated bitcoin transaction fee bch next block...,0.938034


In [25]:
# Parse each timestamp column and drop its timezone information.
# NOTE(review): `Date_day` is not converted here and keeps its original
# values — confirm that is intended.
for column in ('datetime', 'Date_hourly', 'Date_min', 'Date_15m', 'Date_30m'):
    current_df[column] = pd.to_datetime(current_df[column]).dt.tz_localize(None)

In [26]:
current_df

Unnamed: 0,datetime,date,username,text,Date_hourly,Date_min,Date_day,Date_30m,Date_15m,cleaned_text,Positive_score
0,2022-01-01 00:00:00,2022-01-01,CryptoNerdApp,Current Price of Bitcoin:\n$46320 (-1.85%)\n\n...,2022-01-01 01:00:00,2022-01-01 00:01:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00,2022-01-01 00:15:00,current price bitcoin btc bitcoin cryptocurren...,0.866387
1,2022-01-01 00:00:00,2022-01-01,TrendSpider,$BTC Continues to bounce off the YTD anchored ...,2022-01-01 01:00:00,2022-01-01 00:01:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00,2022-01-01 00:15:00,btc continues bounce ytd anchored vwap bitcoin,0.960633
2,2022-01-01 00:00:00,2022-01-01,MadStudentScie1,Generating misunderstanding for Bitcoin!,2022-01-01 01:00:00,2022-01-01 00:01:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00,2022-01-01 00:15:00,generating misunderstanding bitcoin,0.512859
3,2022-01-01 00:00:01,2022-01-01,WIRTUALapp,HAPPY NEW YEAR 2022! Experience a new way to e...,2022-01-01 01:00:00,2022-01-01 00:01:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00,2022-01-01 00:15:00,happy new year experience new way exercise wir...,0.962380
4,2022-01-01 00:00:01,2022-01-01,takeoff_tech,【$TKO token #1】\n\nTKO token will be a utility...,2022-01-01 01:00:00,2022-01-01 00:01:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00,2022-01-01 00:15:00,tko token tko token utility token service fee ...,0.897967
...,...,...,...,...,...,...,...,...,...,...,...
1331663,2022-01-31 23:59:46,2022-01-31,deyonte_btc,One of the largest asset managers on the plane...,2022-02-01 00:00:00,2022-02-01 00:00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00,2022-02-01 00:00:00,one largest asset manager planet trillion aum ...,0.027183
1331664,2022-01-31 23:59:52,2022-01-31,Baripondiss,"Lol... Oh, you just realized, you're still ear...",2022-02-01 00:00:00,2022-02-01 00:00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00,2022-02-01 00:00:00,lol oh realized youre still early welcome bitcoin,0.963527
1331665,2022-01-31 23:59:52,2022-01-31,galaxy_orion,@julesapril8 @jimcramer I know. \n\n- He instr...,2022-02-01 00:00:00,2022-02-01 00:00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00,2022-02-01 00:00:00,know instruct agency ban bitcoin russia get wo...,0.022304
1331666,2022-01-31 23:59:55,2022-01-31,BitcoinFeesCash,Updated Bitcoin transaction fees: \n \nBCH ...,2022-02-01 00:00:00,2022-02-01 00:00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00,2022-02-01 00:00:00,updated bitcoin transaction fee bch next block...,0.938034


In [27]:
# Persist the tweets together with `Positive_score`, without the index column.
current_df.to_csv("Tweets_with_sentiment.csv", index = False)

In [28]:
# Load the hourly BTCUSDT price file from the working directory.
fin_df = pd.read_csv("Binance_BTCUSDT_1h.csv")

In [29]:
def sign(x):
  """Map a number to a binary direction flag.

  Returns 1 when ``x`` is greater than or equal to zero and 0 otherwise.
  This is an up-or-flat versus down indicator rather than a mathematical
  signum: zero maps to 1, and NaN (which compares false) maps to 0.
  """
  if x >= 0:
    return 1
  return 0

In [30]:
# List the columns available in the candle data before deriving new features
fin_df.columns

Index(['Unix', 'Date', 'Symbol', 'Open', 'High', 'Low', 'Close', 'Volume BTC',
       'Volume USDT', 'tradecount'],
      dtype='object')

In [31]:
# Open-to-close move of each candle, expressed in percent of the open price
fin_df["Return"] = (fin_df["Close"] - fin_df["Open"]).div(fin_df["Open"]).mul(100)
# Binary direction label derived from the return: 1 if non-negative, 0 if negative
fin_df["Return_sign"] = fin_df["Return"].map(sign)

In [32]:
# Inspect the frame with the new Return and Return_sign columns (rows are ordered newest first)
fin_df

Unnamed: 0,Unix,Date,Symbol,Open,High,Low,Close,Volume BTC,Volume USDT,tradecount,Return,Return_sign
0,1730156400000,2024-10-28 23:00:00,BTCUSDT,69961.64,70145.50,69733.30,69962.21,928.304390,6.492362e+07,145948,0.000815,1
1,1730152800000,2024-10-28 22:00:00,BTCUSDT,69791.00,70270.00,69791.00,69961.65,3417.361800,2.391833e+08,313475,0.244516,1
2,1730149200000,2024-10-28 21:00:00,BTCUSDT,69644.57,69797.99,69518.35,69791.00,637.094310,4.439927e+07,82357,0.210253,1
3,1730145600000,2024-10-28 20:00:00,BTCUSDT,69629.99,69719.59,69427.79,69644.56,838.752480,5.832339e+07,115659,0.020925,1
4,1730142000000,2024-10-28 19:00:00,BTCUSDT,69212.12,69900.00,69150.01,69629.98,4352.023460,3.029711e+08,366750,0.603738,1
...,...,...,...,...,...,...,...,...,...,...,...,...
62982,1502956800000,2017-08-17 08:00:00,BTCUSDT,4333.32,4377.85,4333.32,4360.69,0.972807,4.239504e+03,28,0.631617,1
62983,1502953200000,2017-08-17 07:00:00,BTCUSDT,4316.62,4349.99,4287.41,4349.99,4.443249,1.924106e+04,25,0.773059,1
62984,1502949600000,2017-08-17 06:00:00,BTCUSDT,4330.29,4345.45,4309.37,4324.35,7.229691,3.128231e+04,36,-0.137173,0
62985,1502946000000,2017-08-17 05:00:00,BTCUSDT,4308.83,4328.69,4291.37,4315.32,23.234916,1.003048e+05,102,0.150621,1


In [33]:
# Spot-check the first hourly bucket of the tweet data; it parses to a timezone-naive timestamp
pd.to_datetime(current_df["Date_hourly"][0])
# NOTE(review): taken to be in the same timezone as fin_df["Date"]. The displayed Date_day
# values carry a +00:00 offset and the fin_df Date strings match the Unix column read as
# UTC, so both appear to be UTC — confirm, since neither column is timezone-aware here.

Timestamp('2022-01-01 01:00:00')

In [35]:
# Look at one of the early rows whose Date string carries fractional seconds
fin_df["Date"][58755]

'2018-02-11 03:28:14.789'

In [None]:
# Some early observations use a different date format (fractional seconds, e.g.
# '2018-02-11 03:28:14.789'), so parsing the whole column with one inferred format fails.
# Show the error without aborting a "Restart & Run All", and deliberately leave
# fin_df["Date"] as strings: the following cells select January 2022 with
# .str.contains and only then convert that uniformly formatted subset.
try:
    pd.to_datetime(fin_df["Date"])
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: unconverted data remains when parsing with format "%Y-%m-%d %H:%M:%S": ".789", at position 58755. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [36]:
# Preview the January 2022 candles by matching on the raw Date strings
fin_df[fin_df["Date"].str.contains("2022-01")]

Unnamed: 0,Unix,Date,Symbol,Open,High,Low,Close,Volume BTC,Volume USDT,tradecount,Return,Return_sign
24021,1643670000000,2022-01-31 23:00:00,BTCUSDT,38412.45,38510.28,38336.04,38466.90,574.60408,2.208489e+07,20525,0.141751,1
24022,1643666400000,2022-01-31 22:00:00,BTCUSDT,38410.09,38737.99,38398.58,38412.46,927.84680,3.576082e+07,30965,0.006170,1
24023,1643662800000,2022-01-31 21:00:00,BTCUSDT,38450.62,38531.53,38335.00,38410.09,923.75238,3.548381e+07,23243,-0.105408,0
24024,1643659200000,2022-01-31 20:00:00,BTCUSDT,38415.79,38563.37,38236.69,38450.62,1639.69627,6.300914e+07,35193,0.090666,1
24025,1643655600000,2022-01-31 19:00:00,BTCUSDT,38468.29,38744.00,38383.91,38415.79,1944.84957,7.494555e+07,46497,-0.136476,0
...,...,...,...,...,...,...,...,...,...,...,...,...
24760,1641009600000,2022-01-01 04:00:00,BTCUSDT,46813.21,46887.33,46591.23,46711.05,861.88389,4.027204e+07,23357,-0.218229,0
24761,1641006000000,2022-01-01 03:00:00,BTCUSDT,46811.77,46916.63,46760.12,46813.20,562.88971,2.636326e+07,19882,0.003055,1
24762,1641002400000,2022-01-01 02:00:00,BTCUSDT,46778.14,46928.94,46721.96,46811.77,485.16860,2.272067e+07,24364,0.071893,1
24763,1640998800000,2022-01-01 01:00:00,BTCUSDT,46656.14,46949.99,46574.06,46778.14,943.81539,4.412715e+07,31872,0.261488,1


In [37]:

# Keep only the candles whose Date string contains "2022-01" and renumber the rows from 0
fin_df_filtered = fin_df.loc[lambda frame: frame["Date"].str.contains("2022-01")].reset_index(drop=True)

In [38]:
# Convert the Date strings of the filtered subset to pandas timestamps
fin_df_filtered = fin_df_filtered.assign(Date=pd.to_datetime(fin_df_filtered["Date"]))

In [39]:
# Order the candles chronologically (oldest first) with a fresh 0..n-1 index
fin_df_filtered = fin_df_filtered.sort_values("Date", ignore_index=True)

In [None]:
# Double-check on the parsed timestamps: enforce the half-open window
# [2022-01-01, 2022-02-01), then re-sort chronologically and renumber the rows
fin_df_filtered = (
    fin_df_filtered
    .loc[lambda frame: frame["Date"].between(
        pd.to_datetime("2022-01-01"), pd.to_datetime("2022-02-01"), inclusive="left"
    )]
    .sort_values("Date", ignore_index=True)
)

In [41]:
# Inspect the January 2022 candles, now sorted oldest first with parsed dates
fin_df_filtered

Unnamed: 0,Unix,Date,Symbol,Open,High,Low,Close,Volume BTC,Volume USDT,tradecount,Return,Return_sign
0,1640995200000,2022-01-01 00:00:00,BTCUSDT,46216.93,46731.39,46208.37,46656.13,1503.33095,6.987999e+07,38608,0.950301,1
1,1640998800000,2022-01-01 01:00:00,BTCUSDT,46656.14,46949.99,46574.06,46778.14,943.81539,4.412715e+07,31872,0.261488,1
2,1641002400000,2022-01-01 02:00:00,BTCUSDT,46778.14,46928.94,46721.96,46811.77,485.16860,2.272067e+07,24364,0.071893,1
3,1641006000000,2022-01-01 03:00:00,BTCUSDT,46811.77,46916.63,46760.12,46813.20,562.88971,2.636326e+07,19882,0.003055,1
4,1641009600000,2022-01-01 04:00:00,BTCUSDT,46813.21,46887.33,46591.23,46711.05,861.88389,4.027204e+07,23357,-0.218229,0
...,...,...,...,...,...,...,...,...,...,...,...,...
739,1643655600000,2022-01-31 19:00:00,BTCUSDT,38468.29,38744.00,38383.91,38415.79,1944.84957,7.494555e+07,46497,-0.136476,0
740,1643659200000,2022-01-31 20:00:00,BTCUSDT,38415.79,38563.37,38236.69,38450.62,1639.69627,6.300914e+07,35193,0.090666,1
741,1643662800000,2022-01-31 21:00:00,BTCUSDT,38450.62,38531.53,38335.00,38410.09,923.75238,3.548381e+07,23243,-0.105408,0
742,1643666400000,2022-01-31 22:00:00,BTCUSDT,38410.09,38737.99,38398.58,38412.46,927.84680,3.576082e+07,30965,0.006170,1


In [None]:
# Share of hours labelled 1 (non-negative return); a value near 0.5 means the
# return-direction labels are almost balanced
fin_df_filtered["Return_sign"].mean()

0.5134408602150538

In [43]:
# Inspect the tweet-level frame (one row per tweet, with its time buckets and Positive_score)
current_df

Unnamed: 0,datetime,date,username,text,Date_hourly,Date_min,Date_day,Date_30m,Date_15m,cleaned_text,Positive_score
0,2022-01-01 00:00:00,2022-01-01,CryptoNerdApp,Current Price of Bitcoin:\n$46320 (-1.85%)\n\n...,2022-01-01 01:00:00,2022-01-01 00:01:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00,2022-01-01 00:15:00,current price bitcoin btc bitcoin cryptocurren...,0.866387
1,2022-01-01 00:00:00,2022-01-01,TrendSpider,$BTC Continues to bounce off the YTD anchored ...,2022-01-01 01:00:00,2022-01-01 00:01:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00,2022-01-01 00:15:00,btc continues bounce ytd anchored vwap bitcoin,0.960633
2,2022-01-01 00:00:00,2022-01-01,MadStudentScie1,Generating misunderstanding for Bitcoin!,2022-01-01 01:00:00,2022-01-01 00:01:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00,2022-01-01 00:15:00,generating misunderstanding bitcoin,0.512859
3,2022-01-01 00:00:01,2022-01-01,WIRTUALapp,HAPPY NEW YEAR 2022! Experience a new way to e...,2022-01-01 01:00:00,2022-01-01 00:01:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00,2022-01-01 00:15:00,happy new year experience new way exercise wir...,0.962380
4,2022-01-01 00:00:01,2022-01-01,takeoff_tech,【$TKO token #1】\n\nTKO token will be a utility...,2022-01-01 01:00:00,2022-01-01 00:01:00,2022-01-02 00:00:00+00:00,2022-01-01 00:30:00,2022-01-01 00:15:00,tko token tko token utility token service fee ...,0.897967
...,...,...,...,...,...,...,...,...,...,...,...
1331663,2022-01-31 23:59:46,2022-01-31,deyonte_btc,One of the largest asset managers on the plane...,2022-02-01 00:00:00,2022-02-01 00:00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00,2022-02-01 00:00:00,one largest asset manager planet trillion aum ...,0.027183
1331664,2022-01-31 23:59:52,2022-01-31,Baripondiss,"Lol... Oh, you just realized, you're still ear...",2022-02-01 00:00:00,2022-02-01 00:00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00,2022-02-01 00:00:00,lol oh realized youre still early welcome bitcoin,0.963527
1331665,2022-01-31 23:59:52,2022-01-31,galaxy_orion,@julesapril8 @jimcramer I know. \n\n- He instr...,2022-02-01 00:00:00,2022-02-01 00:00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00,2022-02-01 00:00:00,know instruct agency ban bitcoin russia get wo...,0.022304
1331666,2022-01-31 23:59:55,2022-01-31,BitcoinFeesCash,Updated Bitcoin transaction fees: \n \nBCH ...,2022-02-01 00:00:00,2022-02-01 00:00:00,2022-02-01 00:00:00+00:00,2022-02-01 00:00:00,2022-02-01 00:00:00,updated bitcoin transaction fee bch next block...,0.938034


In [44]:
# Collapse the tweets into one row per Date_hourly bucket: collect the timestamps in a list,
# join texts and usernames with newlines, and average the Positive_score
df_current_grouped = (
    current_df
    .groupby("Date_hourly")
    .agg({
        "datetime": list,
        "text": "\n".join,
        "username": "\n".join,
        "Positive_score": "mean",
    })
    .reset_index()
)

In [45]:
# Inspect the hourly aggregation (one row per Date_hourly bucket)
df_current_grouped

Unnamed: 0,Date_hourly,datetime,text,username,Positive_score
0,2022-01-01 01:00:00,"[2022-01-01 00:00:00, 2022-01-01 00:00:00, 202...",Current Price of Bitcoin:\n$46320 (-1.85%)\n\n...,CryptoNerdApp\nTrendSpider\nMadStudentScie1\nW...,0.755837
1,2022-01-01 02:00:00,"[2022-01-01 01:00:00, 2022-01-01 01:00:00, 202...",Today's Cryptocurrency Fear And Greed Index: 2...,FearAndGreedBot\nOfficialOverbit\nBTC_AI_bot\n...,0.775810
2,2022-01-01 03:00:00,"[2022-01-01 02:00:00, 2022-01-01 02:00:00, 202...",Huobi Singapore would like to wish you a Happy...,HuobiSg\nAmazonCash4Cars\nBTC_AI_bot\nCalvinAy...,0.773046
3,2022-01-01 04:00:00,"[2022-01-01 03:00:00, 2022-01-01 03:00:00, 202...",1 ARS is 10.26 satoshis right now.\n\n1 satosh...,SatoshiARS_bot\nbitcoinpartyall\ncryptopediaK7...,0.760946
4,2022-01-01 05:00:00,"[2022-01-01 04:00:00, 2022-01-01 04:00:00, 202...",Buy Bitcoin $BTC @ 46813.2\nAn online trader h...,BTC_AI_bot\nnewscomauHQ\ncryptozone007\nCarbon...,0.771097
...,...,...,...,...,...
739,2022-01-31 20:00:00,"[2022-01-31 19:00:00, 2022-01-31 19:00:00, 202...",What's your favourite coins to stake and where...,CheekyCrypto\naburke626\nQuantumDelta\nBTC_AI_...,0.755899
740,2022-01-31 21:00:00,"[2022-01-31 20:00:00, 2022-01-31 20:00:00, 202...",Buy Bitcoin $BTC @ 38415.79\nLIVE: #Bitcoin be...,BTC_AI_bot\nCoinDesk\nbitcoinrate247\nSquatche...,0.760233
741,2022-01-31 22:00:00,"[2022-01-31 21:00:00, 2022-01-31 21:00:00, 202...",1 ARS is 12.17 satoshis right now.\n\n1 satosh...,SatoshiARS_bot\nCryptoTopCharts\nIBKR\nCryptoN...,0.765209
742,2022-01-31 23:00:00,"[2022-01-31 22:00:00, 2022-01-31 22:00:00, 202...","BTC Latest Block Info: Block 721243 holds 2,80...",btc_blockbot\nM1Seahawk\nnialljburke\nmurray_r...,0.789636


In [46]:
# Pair every hourly tweet bucket with the candle whose Date equals the bucket timestamp;
# the inner join keeps only hours present on both sides. Unix and Symbol are not carried over.
df_merged = df_current_grouped[["datetime", "username", "text", "Date_hourly", "Positive_score"]].merge(
    fin_df_filtered.drop(columns=["Unix", "Symbol"]),
    left_on="Date_hourly",
    right_on="Date",
    how="inner",
)


In [None]:
df_merged
# Length 743: the text features of each row come from the previous time interval, so the
# 2022-01-01 00:00 candle has no matching tweet bucket, and the 2022-02-01 00:00 tweet
# bucket has no candle in the January-only data; the inner join drops both.

Unnamed: 0,datetime,username,text,Date_hourly,Positive_score,Date,Open,High,Low,Close,Volume BTC,Volume USDT,tradecount,Return,Return_sign
0,"[2022-01-01 00:00:00, 2022-01-01 00:00:00, 202...",CryptoNerdApp\nTrendSpider\nMadStudentScie1\nW...,Current Price of Bitcoin:\n$46320 (-1.85%)\n\n...,2022-01-01 01:00:00,0.755837,2022-01-01 01:00:00,46656.14,46949.99,46574.06,46778.14,943.81539,4.412715e+07,31872,0.261488,1
1,"[2022-01-01 01:00:00, 2022-01-01 01:00:00, 202...",FearAndGreedBot\nOfficialOverbit\nBTC_AI_bot\n...,Today's Cryptocurrency Fear And Greed Index: 2...,2022-01-01 02:00:00,0.775810,2022-01-01 02:00:00,46778.14,46928.94,46721.96,46811.77,485.16860,2.272067e+07,24364,0.071893,1
2,"[2022-01-01 02:00:00, 2022-01-01 02:00:00, 202...",HuobiSg\nAmazonCash4Cars\nBTC_AI_bot\nCalvinAy...,Huobi Singapore would like to wish you a Happy...,2022-01-01 03:00:00,0.773046,2022-01-01 03:00:00,46811.77,46916.63,46760.12,46813.20,562.88971,2.636326e+07,19882,0.003055,1
3,"[2022-01-01 03:00:00, 2022-01-01 03:00:00, 202...",SatoshiARS_bot\nbitcoinpartyall\ncryptopediaK7...,1 ARS is 10.26 satoshis right now.\n\n1 satosh...,2022-01-01 04:00:00,0.760946,2022-01-01 04:00:00,46813.21,46887.33,46591.23,46711.05,861.88389,4.027204e+07,23357,-0.218229,0
4,"[2022-01-01 04:00:00, 2022-01-01 04:00:00, 202...",BTC_AI_bot\nnewscomauHQ\ncryptozone007\nCarbon...,Buy Bitcoin $BTC @ 46813.2\nAn online trader h...,2022-01-01 05:00:00,0.771097,2022-01-01 05:00:00,46711.05,47555.55,46673.94,47192.55,1400.73642,6.618831e+07,41431,1.030805,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
738,"[2022-01-31 18:00:00, 2022-01-31 18:00:00, 202...",_bit4all\nCoinCrop\nHayess5178__\nBrian_V_Marv...,Funds are safu.\n\n#EmbraceTheFuture #GeraçaoB...,2022-01-31 19:00:00,0.780515,2022-01-31 19:00:00,38468.29,38744.00,38383.91,38415.79,1944.84957,7.494555e+07,46497,-0.136476,0
739,"[2022-01-31 19:00:00, 2022-01-31 19:00:00, 202...",CheekyCrypto\naburke626\nQuantumDelta\nBTC_AI_...,What's your favourite coins to stake and where...,2022-01-31 20:00:00,0.755899,2022-01-31 20:00:00,38415.79,38563.37,38236.69,38450.62,1639.69627,6.300914e+07,35193,0.090666,1
740,"[2022-01-31 20:00:00, 2022-01-31 20:00:00, 202...",BTC_AI_bot\nCoinDesk\nbitcoinrate247\nSquatche...,Buy Bitcoin $BTC @ 38415.79\nLIVE: #Bitcoin be...,2022-01-31 21:00:00,0.760233,2022-01-31 21:00:00,38450.62,38531.53,38335.00,38410.09,923.75238,3.548381e+07,23243,-0.105408,0
741,"[2022-01-31 21:00:00, 2022-01-31 21:00:00, 202...",SatoshiARS_bot\nCryptoTopCharts\nIBKR\nCryptoN...,1 ARS is 12.17 satoshis right now.\n\n1 satosh...,2022-01-31 22:00:00,0.765209,2022-01-31 22:00:00,38410.09,38737.99,38398.58,38412.46,927.84680,3.576082e+07,30965,0.006170,1


In [50]:
# Save the merged hourly dataset without the index column.
# NOTE(review): the list-valued "datetime" column is presumably written as plain text in the
# CSV and would need re-parsing after loading — confirm if it is used downstream.
df_merged.to_csv("Hourly_merged_data.csv", index=False)