# Fetching crypto and tweets data.

In [24]:
from sklearn.preprocessing import OrdinalEncoder
from crypto_api import CryptoApi
import pandas as pd
import numpy as np
import re


#btc_df = pd.read_csv(r'Data\btc_data.csv',usecols=lambda x: x != "Unnamed: 0")

# Analysis of tweets

In [25]:
'''
TODO: GENERAL DATA TASKS:
0) find a way to deal with multiple tweets for a day
1) merge the 2 datasets into one
2) impute missing data, maybe try interpolation or expectation maximization
    2.1) compare with mean and median imputation methods
3) investigate relationships within the data, maybe a correlation matrix etc.
'''

# Load the tweets (first CSV column is the index), parse the timestamps and
# discard columns that carry no usable information.
tweets_df = (
    pd.read_csv(r'Data/elon_tweets.csv', index_col=0)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .dropna(axis='columns', how='all')
    # 'vibe' and 'cashtags' hold only 1 and 18 non-null values respectively.
    .drop(columns=['vibe', 'cashtags'])
)

## Dealing with sparse columns

In [26]:
# Names of the columns that contain at least one missing value.
has_missing_values = tweets_df.notnull().mean().lt(1.0)
sparse_cols = tweets_df.columns[has_missing_values].values.copy()

# Keep English tweets only, then drop identifier/URL columns and the
# now-constant 'lang' column.
english_only = tweets_df['lang'].eq('en')
mod_tweets_df = (
    tweets_df.loc[english_only]
    .drop(columns=['id', 'url', 'source', 'sourceUrl', 'lang'])
    .reset_index(drop=True)
)

## Data cleaning and preprocessing

In [27]:
# Encode the 'sourceLabel' categories as ordinal codes.
encoder = OrdinalEncoder()
mod_tweets_df['sourceLabel_encoded'] = encoder.fit_transform(mod_tweets_df['sourceLabel'].values.reshape(-1, 1))

# Binary flags: 1 when the field is present, 0 when it is missing.
# `notna()` replaces the former `type(value) == float` check, which only worked
# because missing cells happen to be float NaN (it would also have flagged any
# genuine float value as missing and treated None as present).
mod_tweets_df['isReplied']   = mod_tweets_df['inReplyToUser'].notna().astype('int64')
mod_tweets_df['isMentioned'] = mod_tweets_df['mentionedUsers'].notna().astype('int64')

#mod_tweets_df = mod_tweets_df.drop(['sourceLabel','inReplyToUser','mentionedUsers'], axis=1)


def extract_dict(line: str, prepare_to_df: bool = False) -> dict:
    """Extracts key/value pairs from a dict represented as a string.

    The pairs are found with a regular expression, so every extracted value is
    returned as a string (e.g. ``'44196397'``, ``'True'``), not as its original
    Python type.

    ## Parameters:
        line (str): row of a Series/DataFrame to be preprocessed.
        prepare_to_df (bool): when True, empty values (``''``) are replaced with
            ``[None]``; all other values are returned unchanged. Defaults to False.

    ## Returns:
        dict: mapping of extracted keys to their (string) values.
    """

    extracted_content = dict(re.findall(r"'(\w+)': '?({.*}|datetime.datetime\(.*\)|[\w\d/:\. ]*)'?", line))

    if prepare_to_df:
        # NOTE(review): only empty values are replaced, and with a one-element
        # list rather than a bare None; non-empty values are NOT wrapped in
        # lists. Kept as-is because callers may depend on it - confirm intent.
        extracted_content = {
            key: ([None] if value == '' else value)
            for key, value in extracted_content.items()
        }

    return extracted_content


# Parse every stringified 'user' record into a dict; each dict becomes one row.
extracted_df = pd.DataFrame(
    [extract_dict(raw_user, True) for raw_user in mod_tweets_df['user']]
)

# Put the parsed user fields next to the tweet columns, then drop the raw
# 'user' column together with the user fields that are not needed further on.
new_df = (
    pd.concat([mod_tweets_df.copy(), extracted_df], axis=1)
    .drop(columns=['user','username','id','displayname','verified','created',
                   'location','protected','profileImageUrl','profileBannerUrl',
                   'rawDescription','renderedDescription','favouritesCount',
                   'friendsCount','mediaCount','statusesCount'])
)

In [28]:
# Cast every column whose name contains 'Count' to pandas' nullable Int64 dtype.
count_columns = [name for name in new_df.columns if 'Count' in name]
for count_column in count_columns:
    new_df[count_column] = new_df[count_column].astype('Int64')

In [29]:
# Number of @-mentions and the mentioned handles found in each tweet's text.
new_df['mentionsCount'] = new_df['rawContent'].str.count(r'@[\w\d]+')
new_df['mentions'] = new_df['rawContent'].apply(lambda x : re.findall(r'(@[^\s]+)', x))

# Sanity check: the two patterns above differ, so verify that they find the
# same number of mentions in every tweet (prints True when they all agree).
count = int((new_df['mentionsCount'] == new_df['mentions'].str.len()).sum())
print(count==len(new_df))

new_df['charCount'] = new_df['rawContent'].str.len()
# errors='ignore' keeps the cell re-runnable once the column is already gone.
new_df = new_df.drop('descriptionLinks', axis=1, errors='ignore')

True


In [30]:
#links = tweets_df[tweets_df.columns[tweets_df.columns.isin(new_df.columns)==False]]['links'].value_counts().copy()

# First quoted tweet among the rows whose text is literally the string 'True'.
tweets_df.loc[tweets_df['rawContent'] == 'True', 'quotedTweet'].iloc[0]

'{\'url\': \'https://twitter.com/ggreenwald/status/1625871270737809408\', \'date\': datetime.datetime(2023, 2, 15, 14, 54, 52, tzinfo=datetime.timezone.utc), \'rawContent\': "The corporate media\'s ability to -- overnight -- turn anyone who dissents in anyway into some sort of fascist or even Hitler-like figure, and then have millions of their followers go around mindlessly repeating it, is both impressive and chilling:", \'renderedContent\': "The corporate media\'s ability to -- overnight -- turn anyone who dissents in anyway into some sort of fascist or even Hitler-like figure, and then have millions of their followers go around mindlessly repeating it, is both impressive and chilling:", \'id\': 1625871270737809408, \'user\': {\'username\': \'ggreenwald\', \'id\': 16076032, \'displayname\': \'Glenn Greenwald\', \'rawDescription\': \'Journalist; Author; Host, @SystemUpdate_; Columnist, @Folha; Co-Founder: The Intercept, @TheInterceptBr; @abrigo_hope, @FreedomofPress, @ongcriadefavela.

# Model Building

## Training pipeline 

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, matutils, utils, models
from gensim.models import ldamodel, ldamulticore
import spacy


def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatizes texts, keeping only informative, non-URL tokens.

    Loads spaCy's ``en_core_web_sm`` model (parser and NER disabled) and adds an
    entity ruler that labels URL-like tokens as ``URL``. A token is kept when it
    is not labelled ``URL``, is not a stop word and its part-of-speech tag is in
    ``allowed_postags``; the lemmas of the kept tokens are joined with spaces.

    ## Parameters:
        texts (str | iterable of str): a single text or a collection of texts.
            Only a bare string is wrapped into a one-element list; any other
            iterable (list, tuple, Series, ndarray, ...) is processed item by item.
        allowed_postags (list): part-of-speech tags to keep. The default list is
            only read, never modified.

    ## Returns:
        list[str]: one space-joined string of lemmas per input text.
    """
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    url_pattern = [{"label": "URL",
                    "pattern": [{"LIKE_URL": True}]}]

    ruler = nlp.add_pipe('entity_ruler', before='ner')
    ruler.add_patterns(url_pattern)

    # The previous `type(texts) != list` check wrapped every non-list iterable
    # into a single "text", which then failed inside spaCy.
    if isinstance(texts, str):
        texts = [texts]

    texts_out = []
    # nlp.pipe processes the texts as a stream, which is faster than calling
    # nlp() once per text.
    for doc in nlp.pipe(texts):
        kept_lemmas = [
            token.lemma_
            for token in doc
            if token.ent_type_ != 'URL' and not token.is_stop and token.pos_ in allowed_postags
        ]
        texts_out.append(' '.join(kept_lemmas))

    return texts_out





# lemmatized_texts = lemmatization(nlp, new_df['rawContent'])
# Read one lemmatized text per line, without the trailing newline
# (presumably the saved output of an earlier lemmatization run - TODO confirm).
with open('lemmatized_texts.txt', 'r', encoding="utf-8") as lemma_file:
    lemmatized_texts = [raw_line.replace('\n','') for raw_line in lemma_file]

In [32]:
def create_ngrams(texts):
    """Tokenizes texts and merges frequent word sequences into n-gram tokens.

    Each text is tokenized with gensim's ``simple_preprocess``. A bigram phrase
    model is fitted on the tokens, and a trigram phrase model on the
    bigram-transformed tokens; both are then applied to every document.

    ## Parameters:
        texts (iterable of str): texts to tokenize.

    ## Returns:
        list: one list of tokens per text, as produced by the phrase models.
    """
    tokenized_docs = [utils.simple_preprocess(doc) for doc in texts]

    bigram_model = models.Phrases(tokenized_docs, min_count=3, threshold=50)
    trigram_model = models.Phrases(bigram_model[tokenized_docs], threshold=50)

    bigram_phraser = models.phrases.Phraser(bigram_model)
    trigram_phraser = models.phrases.Phraser(trigram_model)

    docs_with_bigrams = [bigram_phraser[doc] for doc in tokenized_docs]
    # NOTE(review): the bigram phraser is applied a second time here, on
    # documents it has already transformed - confirm this is intended.
    return [trigram_phraser[bigram_phraser[doc]] for doc in docs_with_bigrams]


# Tokenize the lemmatized texts and apply the bigram/trigram phrase models.
data_bigrams_trigrams = create_ngrams(lemmatized_texts)

In [33]:
def vectorize_texts(texts_ngrams):
    """Builds a gensim dictionary from tokenized texts and their bag-of-words corpus.

    ## Parameters:
        texts_ngrams: tokenized documents (one sequence of tokens per document).
            The argument is traversed twice, so it must be re-iterable.

    ## Returns:
        tuple: ``(dictionary, corpus)`` - the ``corpora.Dictionary`` built from
        the documents and the ``doc2bow`` result for each document.
    """
    dictionary = corpora.Dictionary(texts_ngrams)
    bow_corpus = list(map(dictionary.doc2bow, texts_ngrams))

    return dictionary, bow_corpus

# Dictionary and bag-of-words corpus of the n-gram documents; both are used
# below to fit and visualise the LDA model.
id2word, corpus = vectorize_texts(data_bigrams_trigrams)

#tfidf = models.TfidfModel(corpus, id2word=id2word)
# tfidf_vectorizer = TfidfVectorizer(max_df=0.6,
#                                    min_df=5,
#                                    ngram_range=(1,3))
# tfidf_matrix = tfidf_vectorizer.fit_transform(lemmatized_texts)

# id2word = dict((v, k) for k, v in tfidf_vectorizer.vocabulary_.items())
# corpus = matutils.Sparse2Corpus(tfidf_matrix.T)

In [34]:
# low_value = 0.03
# words = []
# words_missing_in_tfidf = []

# for i in range(0, len(corpus)):
#     bow = corpus[i]
#     low_value_words = []
#     tfidf_ids = [id for id,_ in tfidf[bow]]
#     bow_idf = [id for id,_ in bow]
#     low_value_words = [id for id, value in tfidf[bow] if value < low_value]
#     drops = low_value_words+words_missing_in_tfidf
    
#     for item in drops:
#         words.append(id2word[item])
    
#     # words with tfidf score of 0 will be missing
#     words_missing_in_tfidf = [id for id in bow_idf if id not in tfidf_ids]
#     new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
#     corpus[i] = new_bow

In [35]:
# id2word = corpora.Dictionary(data_words)
# corpus = [id2word.doc2bow(text) for text in data_words]

In [36]:
from gensim.models import CoherenceModel, LdaMulticore


def bayesian_tuning(model, params_grid: dict, texts, verbose=False,
                    topic_range=range(95, 170, 5)):
    """Fits one topic model per candidate topic count and scores its coherence.

    Despite the name this is a plain sweep over ``topic_range``; Bayesian
    optimisation is not implemented yet (see TODO below).

    ## Parameters:
        model: topic-model class/callable accepting the keyword arguments
            ``corpus``, ``num_topics``, ``id2word``, ``random_state``, ``passes``
            and ``per_word_topics``.
        params_grid (dict): only the ``'corpus'`` and ``'id2word'`` entries are
            read; every other key is ignored. ``random_state=1``, ``passes=10``
            and ``per_word_topics=True`` are fixed inside the function.
        texts: tokenized texts passed to ``CoherenceModel``.
        verbose (bool): print the coherence score of every candidate.
        topic_range (iterable of int): topic counts to evaluate. Defaults to
            ``range(95, 170, 5)``, the previously hard-coded sweep.

    ## Returns:
        dict: maps each fitted model object to its coherence score.
    """
    # TODO: implement bayesian tuning
    models_scores = {}
    for num_topics in topic_range:
        lda_model = model(corpus=params_grid['corpus'],
                          num_topics=num_topics,
                          id2word=params_grid['id2word'],
                          random_state=1,
                          passes=10,
                          per_word_topics=True)

        coherence_model_lda = CoherenceModel(model=lda_model,
                                             texts=texts,
                                             corpus=params_grid['corpus'],
                                             dictionary=params_grid['id2word']
                                             )
        coherence_score = coherence_model_lda.get_coherence()

        models_scores[lda_model] = coherence_score

        if verbose:
            print(f'Topics {num_topics:<3}: {coherence_score}')

    return models_scores


# Settings handed to bayesian_tuning(); that function currently reads only the
# 'corpus' and 'id2word' entries.
params_grid = dict(
    corpus=corpus,
    num_topics=10,
    id2word=id2word,
    random_state=1,
    update_every=1,
    chunksize=3000,
    passes=2,
)

# lda_models_scores = bayesian_tuning(LdaMulticore, params_grid, texts, verbose=True)

In [248]:
import sklearn
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis


# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Fit a 30-topic LDA model on the bag-of-words corpus (fixed seed).
lda_kwargs = dict(
    corpus=corpus,
    num_topics=30,
    id2word=id2word,
    random_state=1,
    passes=10,
    per_word_topics=True,
)
lda_model = LdaMulticore(**lda_kwargs)

# Prepare the visualisation; the bare name on the last line displays it.
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

In [249]:
# Sample sentence used below to try out the preprocessing pipeline.
puppy_text = 'Rocket launch is tommorow, hope for good landing, satelite awaits for us.'

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

# Chain the three preprocessing functions, each wrapped in a FunctionTransformer
# so that it can act as a Pipeline step.
preprocessing_pipeline = Pipeline(steps=[
    (step_name, FunctionTransformer(step_func))
    for step_name, step_func in (
        ('lemmatization', lemmatization),
        ('trigrams', create_ngrams),
        ('vectorization', vectorize_texts),
    )
])

preprocessing_pipeline.transform(puppy_text)

(<gensim.corpora.dictionary.Dictionary at 0x27d8c74d310>,
 [[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]])

In [250]:
from sklearn.pipeline import Pipeline

def make_custom_pipeline(steps):
    """Builds a Pipeline whose steps are plain functions.

    ## Parameters:
        steps (list): ``(name, function)`` pairs in execution order.

    ## Returns:
        Pipeline: pipeline in which every function is wrapped in a
        ``FunctionTransformer`` under the given step name.
    """
    # Build a new list instead of inserting into / removing from `steps` while
    # iterating over it: the caller's list is left untouched, so calling this
    # again with the same list does not double-wrap the functions.
    wrapped_steps = [(step[0], FunctionTransformer(step[1])) for step in steps]

    return Pipeline(wrapped_steps)


# (name, function) pairs in execution order; make_custom_pipeline wraps each
# function in a FunctionTransformer.
steps = [('lemmatization', lemmatization),
         ('trigrams', create_ngrams),
         ('vectorization', vectorize_texts)]

preprocessing_pipeline = make_custom_pipeline(steps)
# The last step (vectorize_texts) returns a (dictionary, corpus) pair, which is
# unpacked here.
id2word_n, corpus_n = preprocessing_pipeline.transform(puppy_text)

In [253]:
# Convert every tweet to bag-of-words with `id2word`, the dictionary that
# `lda_model` was fitted with. Running the full preprocessing pipeline here
# would build a brand-new Dictionary, whose token ids are not guaranteed to
# match the ones the model was trained on.
tweet_ngrams = create_ngrams(lemmatization(new_df['rawContent'].values.tolist()))
new_df['vectorized'] = [id2word.doc2bow(tokens) for tokens in tweet_ngrams]

# Per-tweet topic distribution as {topic_id: probability};
# minimum_probability=0 asks the model not to filter out low-probability topics.
new_df['TopicsProbs'] = new_df['vectorized'].apply(lambda x: dict(lda_model.get_document_topics(x, minimum_probability=0)))

In [291]:
# Show each tweet's text next to its inferred topic distribution.
new_df[['rawContent','TopicsProbs']]

Unnamed: 0,rawContent,TopicsProbs
0,"Please ignore prior tweets, as that was someon...","{0: 0.005557781, 1: 0.48198515, 2: 0.005557781..."
1,I made the volume on the Model S http://t.co/w...,"{0: 0.0066692233, 1: 0.0066692233, 2: 0.006669..."
2,Went to Iceland on Sat to ride bumper cars on ...,"{0: 0.22591753, 1: 0.0030327349, 2: 0.00303273..."
3,That was a total non sequitur btw,"{0: 0.008346841, 1: 0.008346841, 2: 0.00834684..."
4,"Great Voltaire quote, arguably better than Twa...","{0: 0.003336929, 1: 0.2654933, 2: 0.003336929,..."
...,...,...
18311,@DimaZeniuk @SpaceX @SirineAti @captainarve @a...,"{0: 0.003333522, 1: 0.003333522, 2: 0.00333352..."
18312,@DimaZeniuk @SpaceX @SirineAti @captainarve @a...,"{0: 0.003333626, 1: 0.003333626, 2: 0.00333362..."
18313,@teslaownersSV Got to break a few eggs to make...,"{0: 0.006667992, 1: 0.006667992, 2: 0.00666799..."
18314,@Jason @DeanPreston @GrowSF Good question,"{0: 0.011111161, 1: 0.011111161, 2: 0.01111116..."
