# Fetching data from [cryptocompare.com](https://www.cryptocompare.com)

In [1]:
import requests

class CryptoApi:
    """Small client for the CryptoCompare ``min-api`` data endpoints.

    The API key is read from the first line of ``token.txt`` (looked up in the
    current working directory) and sent as the ``Authorization`` header with
    every request.
    """

    def __init__(self) -> None:
        self.mother_endpoint = "https://min-api.cryptocompare.com/data"

        with open('token.txt', 'r') as f:
            # readline() keeps the trailing newline; a header value must not
            # contain line breaks, so strip surrounding whitespace.
            self.TOKEN = f.readline().strip()

        self.HEADER = {'Authorization': self.TOKEN,
                       "Content-Type": 'application/json'}


    def get_data(self, crypto:str, currency:str, period:str, period_count:int, allData=0):
        '''Returns crypto summary for a given period in specified currency.

        Args:
            crypto: str
                BTC/ETH/DOGE etc.
            currency: str
                USD/EUR/UAH etc.
            period: str
                day/hour/minute.
            period_count: int
                last n of a period (n days)
                n = 1 returns the previous day/hour/minute + the current one
            allData: int
                pass an int, not a bool (original author's note: bool doesn't work)
                1 - get all records;
                0 - get specified amount of period_count.
        Returns:
            dict: json containing request's response.
        '''

        endpoint = f'{self.mother_endpoint}/v2/histo{period}'
        # Let requests build (and URL-encode) the whole query string instead of
        # splicing part of it into the URL by hand.
        query = {'fsym': crypto,
                 'tsym': currency,
                 'limit': period_count,
                 'allData': allData}
        response = requests.get(endpoint, params=query, headers=self.HEADER)

        return response.json()

    def execute_custom_getrequest(self, endpoint:str):
        """Send an authenticated GET to an arbitrary URL and return the raw response object."""
        return requests.get(endpoint, headers=self.HEADER)
    
# Shared API client used by the price-history cells further down.
# Constructing it reads the API key from token.txt in the working directory.
data = CryptoApi()
#data.get_data('btc','usd','day',29)

In [2]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed


# Daily boundaries of the scraping window (both ends included).
date_range = pd.date_range(start='2023-04-11', end='2023-04-13', freq='d')

# One search query per pair of consecutive days: since day_i, until day_i+1.
day_stamps = date_range.strftime('%Y-%m-%d')
queries = [
    f"from:elonmusk since:{since} until:{until}"
    for since, until in zip(day_stamps[:-1], day_stamps[1:])
]


def parallel_download_tweets():
    """Run every search in the module-level ``queries`` list on a thread pool.

    Each query is scraped by its own task (at most 8 worker threads) and the
    per-query results are merged in completion order, not in query order.

    Returns:
        list: every item yielded by the scrapers, flattened into one list.
    """
    def _scrape_one(search):
        # Exhaust the scraper's generator for a single query.
        return [item for item in sntwitter.TwitterSearchScraper(search).get_items()]

    collected = []
    with ThreadPoolExecutor(max_workers=8) as pool:
        pending = [pool.submit(_scrape_one, search) for search in queries]

        # Merge each batch as soon as its worker finishes.
        for finished in as_completed(pending):
            collected.extend(finished.result())

    return collected


#tweets_df = pd.DataFrame(parallel_download_tweets())
#tweets_df.to_csv('elon_twt.csv')

In [None]:
from datetime import datetime

def _history_frame(raw):
    """Turn a ``get_data`` response into a DataFrame with a calendar-date ``time`` column."""
    frame = pd.DataFrame(raw['Data']['Data'])
    # 'time' is treated as epoch seconds. Convert in UTC so the resulting date
    # does not depend on the local timezone of the machine running the notebook
    # (datetime.fromtimestamp would use local time).
    frame['time'] = pd.to_datetime(frame['time'], unit='s', utc=True).dt.date
    return frame


raw_data = data.get_data('btc', 'usd', 'day', 30)
df = _history_frame(raw_data)

# limit=2000 daily rows is roughly 5.5 years of history, not 10.
raw_btc = data.get_data('btc', 'usd', 'day', 2000)
btc_df = _history_frame(raw_btc)

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so nothing here
# runs and `filtered_df` is NOT created; the cell only echoes the string.
'''import matplotlib.pyplot as plt

filtered_df = btc_df[btc_df['time']>=datetime.strptime('2022-05-15','%Y-%m-%d').date()].copy()
filtered_df.index = filtered_df['time']


plt.figure(figsize=(12,8))
plt.plot(filtered_df['close'])
plt.plot(filtered_df['close'].rolling(28).mean())'''

"import matplotlib.pyplot as plt\n\nfiltered_df = btc_df[btc_df['time']>=datetime.strptime('2022-05-15','%Y-%m-%d').date()].copy()\nfiltered_df.index = filtered_df['time']\n\n\nplt.figure(figsize=(12,8))\nplt.plot(filtered_df['close'])\nplt.plot(filtered_df['close'].rolling(28).mean())"

In [None]:
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import numpy as np


def plotMovingAverage(series, window, plot_intervals=False, scale=1.96, plot_anomalies=False):
    """
        Plot the rolling mean of a time series together with the actual values.

        series - dataframe with timeseries
        window - rolling window size
        plot_intervals - draw upper/lower bands around the rolling mean
        scale - multiplier for the residual standard deviation in the band width
        plot_anomalies - mark points outside the bands (only used with plot_intervals)
    """
    smoothed = series.rolling(window=window).mean()

    plt.figure(figsize=(18,10))
    plt.title("Moving average\n window size = {}".format(window))
    plt.plot(smoothed, "g", label="Rolling mean trend")

    if plot_intervals:
        # Band half-width = MAE + scale * std of the residuals, both measured
        # on the rows after the first `window` entries.
        actual_tail = series[window:]
        smoothed_tail = smoothed[window:]
        mae = mean_absolute_error(actual_tail, smoothed_tail)
        deviation = np.std(actual_tail - smoothed_tail)
        half_width = mae + scale * deviation
        lower_bound = smoothed - half_width
        upper_bound = smoothed + half_width
        plt.plot(upper_bound, "r--", label="Upper Bond / Lower Bond")
        plt.plot(lower_bound, "r--")

        if plot_anomalies:
            # Keep only the observations that fall outside the band; every
            # other cell of the frame stays empty.
            below = series < lower_bound
            above = series > upper_bound
            anomalies = pd.DataFrame(index=series.index, columns=series.columns)
            anomalies[below] = series[below]
            anomalies[above] = series[above]
            plt.plot(anomalies, "ro", markersize=10, label='Anomalies')

    plt.plot(series[window:], label="Actual values")
    plt.legend(loc="upper left")
    plt.grid(True)
    
#plotMovingAverage(filtered_df[['close']], 30, plot_intervals=True, plot_anomalies=True)

In [None]:
def exponential_smoothing(series, alpha):
    """
        Simple exponential smoothing.

        series - sequence of observations (list, array or pandas Series)
        alpha - float [0.0, 1.0], smoothing parameter

        Returns a list of the same length as `series`; empty input gives [].
    """
    # Work on the plain values: `series[n]` on a pandas Series is a label
    # lookup, which fails or picks the wrong row when the index is not 0..n-1
    # (e.g. a date index or a filtered frame).
    values = list(series)
    if not values:
        return []

    result = [values[0]]  # first value is same as series
    for value in values[1:]:
        result.append(alpha * value + (1 - alpha) * result[-1])
    return result
    
def plotExponentialSmoothing(series, alphas):
    """
        Plots exponential smoothing with different alphas

        series - dataset with timestamps (needs a `.values` attribute, e.g. a pandas Series)
        alphas - list of floats, smoothing parameters

    """
    # matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
    # later releases dropped the old names; use whichever this install has.
    style_name = ('seaborn-v0_8-white'
                  if 'seaborn-v0_8-white' in plt.style.available
                  else 'seaborn-white')
    with plt.style.context(style_name):
        plt.figure(figsize=(15, 7))
        for alpha in alphas:
            plt.plot(exponential_smoothing(series, alpha), label=f"Alpha {alpha}")

        plt.plot(series.values, "c", label = "Actual")
        plt.legend(loc="best")
        plt.axis('tight')
        plt.title("Exponential Smoothing")
        plt.grid(True)
        
#plotExponentialSmoothing(currency.GEMS_GEMS_SPENT, [0.3, 0.05])
#plotExponentialSmoothing(filtered_df['close'], [0.05, 0.2])

In [None]:
def double_exponential_smoothing(series, alpha, beta):
    """
        Double exponential smoothing (level + trend).

        series - sequence of observations (list, array or pandas Series)
        alpha - float [0.0, 1.0], smoothing parameter for level
        beta - float [0.0, 1.0], smoothing parameter for trend

        Returns a list with len(series) + 1 entries: the smoothed values plus
        one extra step in which the last result is fed back in as the
        observation. Empty input gives [].
    """
    # Use positional values: `series[n]` on a pandas Series is a label lookup
    # and breaks when the index is not 0..n-1.
    values = list(series)
    if not values:
        return []

    # first value is same as series
    result = [values[0]]
    level = values[0]
    # With a single observation no trend can be estimated; start it at zero.
    trend = values[1] - values[0] if len(values) > 1 else 0.0

    for n in range(1, len(values) + 1):
        if n >= len(values): # forecasting
            value = result[-1]
        else:
            value = values[n]
        last_level, level = level, alpha*value + (1-alpha)*(level+trend)
        trend = beta*(level-last_level) + (1-beta)*trend
        result.append(level+trend)
    return result

def plotDoubleExponentialSmoothing(series, alphas, betas):
    """
        Plots double exponential smoothing with different alphas and betas

        series - dataset with timestamps (needs a `.values` attribute, e.g. a pandas Series)
        alphas - list of floats, smoothing parameters for level
        betas - list of floats, smoothing parameters for trend
    """
    # matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
    # later releases dropped the old names; use whichever this install has.
    style_name = ('seaborn-v0_8-white'
                  if 'seaborn-v0_8-white' in plt.style.available
                  else 'seaborn-white')
    with plt.style.context(style_name):
        plt.figure(figsize=(20, 8))
        # One curve per (alpha, beta) combination.
        for alpha in alphas:
            for beta in betas:
                plt.plot(double_exponential_smoothing(series, alpha, beta), label="Alpha {}, beta {}".format(alpha, beta))
        plt.plot(series.values, label = "Actual")
        plt.legend(loc="best")
        plt.axis('tight')
        plt.title("Double Exponential Smoothing")
        plt.grid(True)
        
#plotDoubleExponentialSmoothing(filtered_df['close'], alphas=[0.9, 0.02], betas=[0.9, 0.02])

## Analysis of tweets

In [None]:
import pandas as pd

# Load the saved tweets (first CSV column is the index) and parse the
# 'date' column into datetimes.
tweets_df = (
    pd.read_csv('elon_tweets.csv', index_col=0)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

def select_notna_cols(df: pd.DataFrame, threshold: float):
    """
    Names of the columns whose share of non-null values reaches a threshold.

    Args:
        df (pandas.DataFrame): frame whose columns are examined.
        threshold (float): minimum fraction of non-null entries (inclusive)
            a column needs in order to be kept.

    Returns:
        List[str]: the qualifying column names, in the frame's column order.
    """
    # Mean of the boolean not-null mask = fraction of populated rows per column.
    non_null_share = df.notna().mean()
    keep = non_null_share.ge(threshold)

    return non_null_share.index[keep].tolist()

# Keep only the columns that are at least 40% populated.
notna_cols = select_notna_cols(tweets_df, threshold=0.4)

# English tweets only; 'lang' is dropped after it has been used as the filter,
# together with the identifier / link columns.
dropped_cols = ['id', 'url', 'source', 'sourceUrl',
                'lang', 'inReplyToTweetId', 'conversationId']

mod_tweets_df = (
    tweets_df[notna_cols]
    .loc[lambda frame: frame['lang'] == 'en']
    .drop(columns=dropped_cols)
    .reset_index(drop=True)
)

In [None]:
'''
TODO: 
0) find a way to deal with multiple tweets for a day
1) merge the 2 datasets into one
2) impute missing data, maybe try interpolation or expectation maximization
    2.1) compare with mean, median imputation methods
3) investigate relationships within the data, maybe correlation matrix etc
'''

#data.get_data('btc', 'usd', 2000)

'\nTODO: \n0) find a way to deal with multiple tweets for a day\n1) merge the 2 datasets into one\n2) impute missing data, maybe try interpolation or expectation maximization\n    2.1) compare with mean, median imputation methods\n3) investigate relationships within the data, maybe correlation matrix etc\n'

In [None]:
# Show the last tweet whose raw text differs from its rendered text.
content_cols = ['rawContent', 'renderedContent']
content_differs = mod_tweets_df['rawContent'].ne(mod_tweets_df['renderedContent'])
mod_tweets_df.loc[content_differs, content_cols].to_numpy()[-1]


array(['@dogeofficialceo Passing butter\nhttps://t.co/BQodkxl2BZ',
       '@dogeofficialceo Passing butter\nm.youtube.com/watch?v=3ht-Zy…'],
      dtype=object)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode the source label as an ordinal code. fit_transform returns an
# (n, 1) array, so flatten it before storing it as a single column.
encoder = OrdinalEncoder()
mod_tweets_df['sourceLabel_encoded'] = encoder.fit_transform(
    mod_tweets_df['sourceLabel'].values.reshape(-1, 1)
).ravel()

# 1 when the field is populated, 0 when it is missing. notna() checks for
# missing values directly instead of inferring them from `type(x) == float`.
mod_tweets_df['isReplied']   = mod_tweets_df['inReplyToUser'].notna().astype(int)
mod_tweets_df['isMentioned'] = mod_tweets_df['mentionedUsers'].notna().astype(int)

# The source columns are no longer needed once the derived features exist.
mod_tweets_df = mod_tweets_df.drop(['sourceLabel','inReplyToUser','mentionedUsers'], axis=1)


def extract_dict(line: str, prepare_to_df: bool = False):
    """Extracts key/value pairs from a dict represented as a string.

    Parsing is regex-based, so every value comes back as a string (numbers and
    booleans are not converted); nested ``{...}`` and ``datetime.datetime(...)``
    values are captured as raw text.

    Args:
        line (str): string form of a dict, e.g. one cell of a Series/DataFrame.
        prepare_to_df (bool): when true, empty-string values are replaced by
            ``[None]``; all other values are left untouched. Defaults to False.

    Returns:
        dict: mapping of key name to the extracted value.
    """
    import re

    pairs = re.findall(r"'(\w+)': '?({.*}|datetime.datetime\(.*\)|[\w\d/:\. ]*)'?", line)
    extracted_content = dict(pairs)

    # Mark empty fields as missing; populated fields keep their string value.
    if prepare_to_df:
        extracted_content = {
            key: ([None] if value == '' else value)
            for key, value in extracted_content.items()
        }

    return extracted_content


# Fields parsed out of the 'user' dict that are not carried into the result.
unused_user_fields = ['username','id','displayname','verified','created','location',
                      'protected','link','profileImageUrl','profileBannerUrl','label']

# Parse each 'user' string into a dict, build a frame from the records and
# keep only the remaining fields.
user_records = [extract_dict(user_repr, True) for user_repr in mod_tweets_df['user']]
extracted_df = pd.DataFrame(user_records).drop(columns=unused_user_fields)

# Tweet columns and parsed user columns side by side (aligned on the row index).
new_df = pd.concat([mod_tweets_df.copy(), extracted_df], axis=1)

In [None]:
# Display the combined frame: tweet columns plus the fields parsed from 'user'.
new_df

Unnamed: 0,date,rawContent,renderedContent,user,replyCount,retweetCount,likeCount,quoteCount,sourceLabel_encoded,isReplied,isMentioned,rawDescription,renderedDescription,descriptionLinks,followersCount,friendsCount,statusesCount,favouritesCount,listedCount,mediaCount
0,2010-06-04 18:31:57+00:00,"Please ignore prior tweets, as that was someon...","Please ignore prior tweets, as that was someon...","{'username': 'elonmusk', 'id': 44196397, 'disp...",1198,658,6359,326,5.0,0,0,[None],[None],,134458744,193,24445,20702,119975,1492
1,2011-12-01 10:29:04+00:00,I made the volume on the Model S http://t.co/w...,I made the volume on the Model S ow.ly/i/mtD7 ...,"{'username': 'elonmusk', 'id': 44196397, 'disp...",28,12,80,8,5.0,0,0,[None],[None],,134458941,193,24445,20702,119975,1492
2,2011-12-01 09:55:11+00:00,Went to Iceland on Sat to ride bumper cars on ...,Went to Iceland on Sat to ride bumper cars on ...,"{'username': 'elonmusk', 'id': 44196397, 'disp...",42,27,198,7,5.0,0,0,[None],[None],,134458941,193,24445,20702,119975,1492
3,2011-12-03 08:22:07+00:00,That was a total non sequitur btw,That was a total non sequitur btw,"{'username': 'elonmusk', 'id': 44196397, 'disp...",42,17,144,8,7.0,0,0,[None],[None],,134458911,193,24445,20702,119975,1492
4,2011-12-03 08:20:28+00:00,"Great Voltaire quote, arguably better than Twa...","Great Voltaire quote, arguably better than Twa...","{'username': 'elonmusk', 'id': 44196397, 'disp...",35,34,86,5,7.0,0,0,[None],[None],,134458911,193,24445,20702,119975,1492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18311,2023-04-09 01:03:23+00:00,@DimaZeniuk @SpaceX @SirineAti @captainarve @a...,@DimaZeniuk @SpaceX @SirineAti @captainarve @a...,"{'username': 'elonmusk', 'id': 44196397, 'disp...",202,249,3511,30,7.0,1,1,[None],[None],,134460421,193,24445,20702,119978,1492
18312,2023-04-09 01:01:50+00:00,@DimaZeniuk @SpaceX @SirineAti @captainarve @a...,@DimaZeniuk @SpaceX @SirineAti @captainarve @a...,"{'username': 'elonmusk', 'id': 44196397, 'disp...",1299,1216,20873,147,7.0,1,1,[None],[None],,134460421,193,24445,20702,119978,1492
18313,2023-04-09 00:40:39+00:00,@teslaownersSV Got to break a few eggs to make...,@teslaownersSV Got to break a few eggs to make...,"{'username': 'elonmusk', 'id': 44196397, 'disp...",4826,4900,113346,391,7.0,1,1,[None],[None],,134460421,193,24445,20702,119978,1492
18314,2023-04-09 00:33:35+00:00,@Jason @DeanPreston @GrowSF Good question,@Jason @DeanPreston @GrowSF Good question,"{'username': 'elonmusk', 'id': 44196397, 'disp...",166,179,2663,14,7.0,1,1,[None],[None],,134460421,193,24445,20702,119978,1492


In [None]:
def str_to_datetime(input_str):
    """Parse the first ``YYYY, M, D, h, m[, s]`` group in a string into a UTC datetime.

    The seconds component is optional because the repr of a datetime omits it
    when it is zero, e.g. ``datetime.datetime(2010, 1, 5, 8, 30)``.

    Args:
        input_str (str): text containing comma-separated datetime components.

    Returns:
        datetime: timezone-aware (UTC) datetime; it is also printed.

    Raises:
        ValueError: if no datetime components are found in ``input_str``.
    """
    from datetime import datetime, timezone
    import re

    # use regular expressions to extract the date and time components
    match = re.search(
        r'(\d{4}),\s(\d{1,2}),\s(\d{1,2}),\s(\d{1,2}),\s(\d{1,2})(?:,\s(\d{1,2}))?',
        input_str,
    )
    if match is None:
        # Previously this fell through and failed later on unbound locals.
        raise ValueError(f"no datetime components found in {input_str!r}")

    year, month, day, hour, minute = (int(part) for part in match.groups()[:5])
    second = int(match.group(6)) if match.group(6) is not None else 0

    # create a datetime object using the extracted components, in UTC
    dt = datetime(year, month, day, hour, minute, second, tzinfo=timezone.utc)

    # print the resulting datetime object (kept for the existing cell output)
    print(dt)
    return dt
    
# `input_str` was not defined in any cell, so this call failed on a fresh
# kernel. The sample below is reconstructed from the recorded output of this
# cell; presumably it came from the 'created' field of a user record — TODO confirm.
input_str = "datetime.datetime(2009, 6, 2, 20, 12, 29, tzinfo=datetime.timezone.utc)"
str_to_datetime(input_str)

2009-06-02 20:12:29+00:00
