# Fetching crypto and tweets data.

In [1]:
import requests

class CryptoApi:
    """Small client for the CryptoCompare ``min-api`` REST endpoints.

    The API token is read from the first line of ``token.txt`` in the current
    working directory and is sent as the ``Authorization`` header of every
    request made through this class.
    """

    # Seconds to wait for the server before a request is abandoned; without
    # a timeout a stalled connection would block the notebook indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self) -> None:
        self.mother_endpoint = "https://min-api.cryptocompare.com/data"

        with open('token.txt', 'r') as f:
            # readline() keeps the line terminator; strip it so the newline
            # does not end up inside the Authorization header value.
            self.TOKEN = f.readline().strip()

        self.HEADER = {'Authorization': self.TOKEN,
                       "Content-Type": 'application/json'}

    def get_data(self, crypto: str, currency: str, period: str, period_count: int, allData=0):
        '''Returns crypto summary for a given period in specified currency.

        Args:
            crypto: str
                BTC/ETH/DOGE etc.
            currency: str
                USD/EUR/UAH etc.
            period: str
                day/hour/minute.
            period_count: int
                last n of a period (n days)
                n = 1 returns previous day/hour/minute + current
            allData: int
                passed as an int (original author's note: a bool doesn't work)
                1 - get all records;
                0 - get specified amount of period_count.
        Returns:
            dict: json containing request's response.
        '''
        endpoint = f'{self.mother_endpoint}/v2/histo{period}'
        # Let requests build and URL-encode the query string instead of
        # interpolating the values into the URL by hand.
        query = {'fsym': crypto,
                 'tsym': currency,
                 'limit': period_count,
                 'allData': allData}
        response = requests.get(endpoint, params=query, headers=self.HEADER,
                                timeout=self.REQUEST_TIMEOUT)

        return response.json()

    def execute_custom_getrequest(self, endpoint: str):
        '''Sends a GET request to an arbitrary endpoint with the API headers.

        Args:
            endpoint: str
                full URL to request.
        Returns:
            the response object returned by ``requests.get`` (not decoded).
        '''
        return requests.get(endpoint, headers=self.HEADER,
                            timeout=self.REQUEST_TIMEOUT)
    
# Shared API client; later cells call data.get_data(...) on this instance.
data = CryptoApi()
#data.get_data('btc','usd','day',29)

In [2]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed


# Daily timestamps spanning the scraping window.
date_range = pd.date_range(start='2023-04-11', end='2023-04-13', freq='d')

# One search query per pair of consecutive days in date_range: the earlier
# day fills the `since:` operator and the following day fills `until:`.
queries = [
    f"from:elonmusk since:{since.strftime('%Y-%m-%d')} until:{until.strftime('%Y-%m-%d')}"
    for since, until in zip(date_range[:-1], date_range[1:])
]


def parallel_download_tweets(search_queries=None, max_workers=8):
    """Download the tweets matching several search queries using a thread pool.

    Args:
        search_queries: iterable of query strings to scrape. When omitted,
            the module-level ``queries`` list is used.
        max_workers: maximum number of threads in the pool.

    Returns:
        list: every item yielded by the scraper, concatenated in the order of
        the queries (not in the order the threads happen to finish), so the
        result is the same from run to run.
    """
    if search_queries is None:
        search_queries = queries

    def sequent_download_tweets(query):
        # Exhaust the scraper's generator for one query.
        return list(sntwitter.TwitterSearchScraper(query).get_items())

    tweets_list = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in submission order and re-raises any
        # exception from a worker when its result is reached.
        for batch in executor.map(sequent_download_tweets, search_queries):
            tweets_list.extend(batch)

    return tweets_list


#tweets_df = pd.DataFrame(parallel_download_tweets())
#tweets_df.to_csv('elon_twt.csv')

In [3]:
from datetime import datetime

def _history_frame(response):
    """Build a DataFrame of price records from a ``get_data`` response.

    The ``time`` column holds Unix timestamps in seconds. They are converted
    to calendar dates in UTC, so the resulting dates do not depend on the
    time zone of the machine running the notebook (``datetime.fromtimestamp``
    converts in local time and can shift a record to the neighbouring day).
    """
    frame = pd.DataFrame(response['Data']['Data'])
    frame['time'] = pd.to_datetime(frame['time'], unit='s').dt.date
    return frame


# Daily BTC/USD history requested with limit=30.
raw_data = data.get_data('btc', 'usd', 'day', 30)
df = _history_frame(raw_data)

# Daily BTC/USD history requested with limit=2000 (about 5.5 years of days).
raw_btc = data.get_data('btc', 'usd', 'day', 2000)
btc_df = _history_frame(raw_btc)

In [4]:
'''import matplotlib.pyplot as plt

filtered_df = btc_df[btc_df['time']>=datetime.strptime('2022-05-15','%Y-%m-%d').date()].copy()
filtered_df.index = filtered_df['time']


plt.figure(figsize=(12,8))
plt.plot(filtered_df['close'])
plt.plot(filtered_df['close'].rolling(28).mean())'''

"import matplotlib.pyplot as plt\n\nfiltered_df = btc_df[btc_df['time']>=datetime.strptime('2022-05-15','%Y-%m-%d').date()].copy()\nfiltered_df.index = filtered_df['time']\n\n\nplt.figure(figsize=(12,8))\nplt.plot(filtered_df['close'])\nplt.plot(filtered_df['close'].rolling(28).mean())"

# Analysis of tweets

## Diving into sparse columns

In [34]:
import pandas as pd

'''
TODO: GENERAL DATA TASKS:
0) find a way to deal with multiple tweets for a day
1) merge the 2 datasets into one
2) impute missing data, maybe try interpolation or expectation maximization
    2.1) compare with mean and median imputation methods
3) investigate relationships within the data, maybe a correlation matrix etc.
'''

# Load the saved tweets (first CSV column is the index) and parse 'date'
# into datetimes.
tweets_df = (
    pd.read_csv('elon_tweets.csv', index_col=0)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

# Names of the columns that contain at least one missing value.
sparse_cols = tweets_df.columns[tweets_df.isnull().any()].values.copy()


In [36]:
# Print non-null counts and dtypes for the columns listed in sparse_cols.
tweets_df[sparse_cols].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21775 entries, 0 to 21774
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   links             1425 non-null   object 
 1   media             1370 non-null   object 
 2   retweetedTweet    0 non-null      float64
 3   quotedTweet       609 non-null    object 
 4   inReplyToTweetId  16915 non-null  float64
 5   inReplyToUser     16915 non-null  object 
 6   mentionedUsers    16386 non-null  object 
 7   coordinates       0 non-null      float64
 8   place             0 non-null      float64
 9   hashtags          50 non-null     object 
 10  cashtags          1 non-null      object 
 11  card              853 non-null    object 
 12  viewCount         2934 non-null   float64
 13  vibe              18 non-null     object 
dtypes: float64(5), object(9)
memory usage: 2.5+ MB


In [None]:
# TODO: uncomment this part after dealing with sus columns

'''mod_tweets_df = tweets_df[notna_cols].copy()

mod_tweets_df = (mod_tweets_df[mod_tweets_df['lang']=='en']
                 .drop(['id','url','source','sourceUrl'], axis=1)                 
                 .reset_index(drop=True)
                 .copy())


#mod_tweets_df = mod_tweets_df.drop(['lang','inReplyToTweetId','conversationId'], axis=1)
mod_tweets_df = mod_tweets_df.drop(['lang'], axis=1)'''

In [None]:
'''print(len(mod_tweets_df))
print(len(mod_tweets_df['conversationId'].unique()))
print(len(mod_tweets_df['inReplyToTweetId'].unique()))
print(mod_tweets_df['isReplied'].sum())
print(mod_tweets_df['isReplied'].where(mod_tweets_df['isReplied'] == 0).dropna().shape[0])'''

## Data cleaning and preprocessing

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode each distinct source label as a number.
encoder = OrdinalEncoder()
mod_tweets_df['sourceLabel_encoded'] = encoder.fit_transform(mod_tweets_df['sourceLabel'].values.reshape(-1, 1))

# Binary flags: 1 when the tweet has a reply target / mentioned users, 0 when
# the cell is missing. notna() tests for missing values directly instead of
# inferring them from the Python type of the cell, so None counts as missing
# and a genuine float value would not.
mod_tweets_df['isReplied']   = mod_tweets_df['inReplyToUser'].notna().astype('int64')
mod_tweets_df['isMentioned'] = mod_tweets_df['mentionedUsers'].notna().astype('int64')

#mod_tweets_df = mod_tweets_df.drop(['sourceLabel','inReplyToUser','mentionedUsers'], axis=1)


def extract_dict(line: str, prepare_to_df: bool = False):
    """Extracts the ``'key': value`` pairs from a dict represented as a string.

    Values are not evaluated: each one is returned as the text captured by
    the regular expression, with the quotes around quoted values left out.

    Args:
        line (str): row of a Series/DataFrame to be preprocessed. Anything
            that is not a string (for example NaN in a missing cell) yields
            an empty dict.
        prepare_to_df (bool): when true, empty values ('') are replaced by
            ``[None]``; every other value is left unchanged.

    Returns:
        dict: extracted key -> value-text mapping.
    """
    import re

    # A missing cell is not a string and cannot be searched.
    if not isinstance(line, str):
        return {}

    extracted_content = dict(re.findall(r"'(\w+)': '?({.*}|datetime.datetime\(.*\)|[\w\d/:\. ]*)'?", line))

    if prepare_to_df:
        # Build a new dict rather than reassigning keys while iterating.
        extracted_content = {
            key: [None] if value == '' else value
            for key, value in extracted_content.items()
        }

    return extracted_content


# Work on a copy of the preprocessed tweets.
new_df = mod_tweets_df.copy()

# Parse every 'user' cell into a dict; each dict becomes one row.
_user_records = mod_tweets_df['user'].apply(lambda raw_user: extract_dict(raw_user, True))
extracted_df = pd.DataFrame(list(_user_records))

# Put the extracted user fields next to the tweet columns, then discard the
# raw 'user' column together with the user fields listed below.
_dropped_columns = [
    'user', 'username', 'id', 'displayname', 'verified', 'created',
    'location', 'protected', 'profileImageUrl', 'profileBannerUrl',
    'rawDescription', 'renderedDescription', 'favouritesCount',
    'friendsCount', 'mediaCount', 'statusesCount',
]
new_df = pd.concat([new_df, extracted_df], axis=1).drop(columns=_dropped_columns)
'''.drop(['user','username','id','displayname','verified','created',
                 'location','protected','link','profileImageUrl','profileBannerUrl',
                 'label', 'rawDescription','renderedDescription','mediaCount','favouritesCount',
                 'descriptionLinks','statusesCount','friendsCount'], axis=1))'''

In [None]:
# Cast every column whose name contains 'Count' to the nullable integer
# dtype after the extraction step.
for count_col in [name for name in new_df.columns if 'Count' in name]:
    new_df[count_col] = new_df[count_col].astype('Int64')

In [None]:
'''
TODO:
1) Check whether @tags are OK for text analysis and
   how to treat them.
2) Make a column with the number of tags.
'''
# pyplot is imported here because no earlier live cell brings `plt` into scope.
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10,7))
# Correlate only numeric/boolean columns: newer pandas versions raise on
# text columns instead of silently skipping them.
sns.heatmap(new_df.select_dtypes(include=['number', 'bool']).corr(),
            vmin=-1, vmax=1, annot=True, linecolor='black', linewidths=1)

In [None]:
# Rows whose text contains '@', shown with the reply/mention flags. The
# result is not assigned, and this is not the last expression of the cell.
new_df[['rawContent','isReplied','isMentioned']].query("rawContent.str.contains('@')")

# Count the matches of '@' followed by one or more word characters per tweet,
# then display the text next to the count.
new_df['mentionsCount'] = new_df['rawContent'].str.count(r'@[\w\d]+')
new_df[['rawContent','mentionsCount']]

In [None]:
# Display the current state of new_df.
new_df

In [None]:
import re

'''

'''

# Collect every '@' followed by non-whitespace characters in each tweet.
new_df['mentions'] = new_df['rawContent'].apply(lambda text: re.findall(r'(@[^\s]+)', text))

# Count the rows where the stored mentionsCount equals the number of
# extracted mentions, then print whether that holds for every row.
count = sum(
    1
    for expected, found in new_df[['mentionsCount', 'mentions']].values
    if expected == len(found)
)
print(count == len(new_df))

In [None]:
# Columns of tweets_df that do not appear in new_df.
tweets_df.columns[~tweets_df.columns.isin(new_df.columns)]

In [None]:
def plot_binary(col_names, plot_size=(16,6)):
    """Plot side-by-side relative-frequency histograms of columns of ``new_df``.

    Each column is drawn with two bins whose centres are labelled 0 and 1.

    Args:
        col_names: names of the ``new_df`` columns to plot, one subplot each.
        plot_size: figure size in inches as (width, height).
    """
    # Imported here so the function does not depend on an earlier cell having
    # brought these names into scope.
    import matplotlib.pyplot as plt
    import numpy as np

    # squeeze=False always returns a 2-D array of axes, so a single column
    # works the same way as several.
    fig, axes = plt.subplots(ncols=len(col_names), squeeze=False)
    fig.set_size_inches(plot_size)

    # Equal weights summing to 1 turn the bar heights into fractions of rows.
    row_weights = np.ones(len(new_df)) / len(new_df)

    for axis, name in zip(axes[0], col_names):
        freqs, bins, _ = axis.hist(new_df[name].values, weights=row_weights, bins=2, edgecolor='black')
        axis.set_title(name)

        # Put the 0/1 tick labels at the centre of each bin and mark the
        # bar heights on the y axis.
        bin_centers = 0.5 * (bins[:-1] + bins[1:])
        axis.set_xticks(bin_centers, [0,1])
        axis.set_yticks(freqs)
        axis.set_ylim([0,1])


plot_binary(['isReplied','isMentioned'])

In [None]:
# Length of each tweet's text in characters.
new_df['charCount'] = new_df['rawContent'].apply(len)

In [None]:
# Frequency of each distinct value in 'links'; the column is looked up among
# the tweets_df columns that are absent from new_df.
links = tweets_df[tweets_df.columns[tweets_df.columns.isin(new_df.columns)==False]]['links'].value_counts().copy()

# First 'quotedTweet' value among the tweets whose text is exactly 'True'.
tweets_df[tweets_df['rawContent']=='True']['quotedTweet'].iloc[0]

In [None]:
# Number of entries returned by unique() for the 'quotedTweet' column.
tweets_df['quotedTweet'].unique().shape[0]

In [None]:
# Remove the 'descriptionLinks' column.
new_df = new_df.drop(columns='descriptionLinks')

In [None]:
# Display the 'quotedTweet' column of new_df.
new_df['quotedTweet']