In [1]:
import pandas as pd
import numpy as np
import warnings
import string
import re
import spacy
import plotly.express as px
warnings.filterwarnings('ignore')

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, LatentDirichletAllocation as LDA

In [2]:
# Load the raw tweets; only the tweet text and its timestamp are used below.
btc_tweets = pd.read_csv('Bitcoin_tweets.csv')

# Work on an explicit copy so none of the edits below can write into
# (a view of) btc_tweets and no SettingWithCopyWarning is triggered.
btc_text = btc_tweets[['text', 'date']].copy()
btc_text['text'] = btc_text['text'].str.lower()
btc_text = btc_text.dropna()

# Keep only rows whose date field contains no letters
# (presumably malformed rows where another column leaked into `date` - TODO confirm).
btc_text = btc_text[~btc_text['date'].str.contains('[a-zA-Z]')].copy()

# Normalise the date to a plain 'YYYY-MM-DD' string so it can be used as a per-day key.
btc_text['date'] = pd.to_datetime(btc_text['date'], format='%Y-%m-%d')
btc_text['date'] = btc_text['date'].dt.strftime('%Y-%m-%d')

In [3]:
def sample_per_day(df, n, seed=132434):
    """Draw a random sample of rows for every distinct value of ``df['date']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column used as the grouping key.
    n : int
        Number of rows to draw per day. Days with fewer than ``n`` rows
        contribute all of their rows instead of raising ``ValueError``.
    seed : int
        ``random_state`` passed to every per-day ``sample`` call, so the
        result is reproducible.

    Returns
    -------
    list of pandas.DataFrame
        One sampled frame per day, in ``groupby`` order.
    """
    return [
        frame.sample(min(n, len(frame)), random_state=seed)
        for _, frame in df.groupby(by='date')
    ]


# Small corpus: up to 10 tweets per day.
df_list_small = sample_per_day(btc_text, 10)
btc_text_small = pd.concat(df_list_small)

# Big corpus: up to 100 tweets per day.
df_list_big = sample_per_day(btc_text, 100)
btc_text_big = pd.concat(df_list_big)

In [4]:
# Discard the row labels inherited from the full dataset and renumber from 0.
btc_text_small = btc_text_small.reset_index(drop=True)
btc_text_big = btc_text_big.reset_index(drop=True)

In [5]:
# Mapping from English contractions to their expanded form.
_CONTRACTIONS = {
    "ain't": "are not", "'s": " is", "aren't": "are not",
    "can't": "cannot", "can't've": "cannot have",
    "'cause": "because", "could've": "could have", "couldn't": "could not",
    "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not",
    "don't": "do not", "hadn't": "had not", "hadn't've": "had not have",
    "hasn't": "has not", "haven't": "have not", "he'd": "he would",
    "he'd've": "he would have", "he'll": "he will", "he'll've": "he will have",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
    "I'll've": "I will have", "I'm": "I am", "I've": "I have", "isn't": "is not",
    "it'd": "it would", "it'd've": "it would have", "it'll": "it will",
    "it'll've": "it will have", "let's": "let us", "ma'am": "madam",
    "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
    "mustn't've": "must not have", "needn't": "need not",
    "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not",
    "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
    "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
    "she'll": "she will", "she'll've": "she will have", "should've": "should have",
    "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
    "that'd": "that would", "that'd've": "that would have", "there'd": "there would",
    "there'd've": "there would have", "they'd": "they would",
    "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
    "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will",
    "what'll've": "what will have", "what're": "what are", "what've": "what have",
    "when've": "when have", "where'd": "where did", "where've": "where have",
    "who'll": "who will", "who'll've": "who will have", "who've": "who have",
    "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are",
    "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have",
    "you'll": "you will", "you'll've": "you will have", "you're": "you are",
    "you've": "you have",
}

# Lookup keyed by the lower-cased contraction, so matching can ignore case
# (the notebook lower-cases tweets before this step, which previously made
# keys such as "I'm" unreachable).
_CONTRACTIONS_LOWER = {key.lower(): value for key, value in _CONTRACTIONS.items()}

# Regex alternation is ordered: the first alternative that matches wins.
# Longest keys go first so "can't've" is not pre-empted by "can't".
_CONTRACTIONS_RE = re.compile(
    '(%s)' % '|'.join(
        re.escape(key) for key in sorted(_CONTRACTIONS_LOWER, key=len, reverse=True)
    ),
    flags=re.IGNORECASE,
)


def _expand_contractions(text):
    """Replace every known contraction in ``text`` with its expanded form."""
    return _CONTRACTIONS_RE.sub(
        lambda match: _CONTRACTIONS_LOWER[match.group(0).lower()], text
    )


def contractions(df):
    """Expand English contractions in ``df['text']``.

    The ``text`` column is overwritten in place and the same frame is
    returned, so callers holding a reference to ``df`` see the expanded text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``text``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``text`` expanded.
    """
    df['text'] = df['text'].apply(_expand_contractions)
    return df

In [6]:
def numbers_spaces_punctuation(df):
    """Create ``df['cleaned']`` from ``df['text']`` with noise removed.

    Per tweet, in this order:
      1. drop every token that contains a digit (e.g. ``50k``, ``user1``);
      2. drop all ASCII punctuation characters (``string.punctuation``);
      3. collapse runs of spaces into a single space.

    The frame is modified in place and returned.
    """
    # Raw strings: '\w' / '\d' in a normal literal are invalid escape sequences.
    digit_token_re = re.compile(r'\w*\d\w*')
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
    spaces_re = re.compile(r' +')

    def clean(text):
        text = digit_token_re.sub('', text)
        text = punctuation_re.sub('', text)
        return spaces_re.sub(' ', text)

    # One pass over the column instead of three separate apply() calls.
    df['cleaned'] = df['text'].apply(clean)
    return df

def lemmatization(df):
    """Add a ``lemmatized`` column built from ``df['cleaned']``.

    Each text is run through the spaCy ``en_core_web_sm`` pipeline (parser
    and NER disabled); tokens flagged as stop words are dropped and the
    lemmas of the remaining tokens are joined with single spaces.
    The frame is modified in place and returned.
    """
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    def lemmatize(sentence):
        kept_lemmas = []
        for token in nlp(sentence):
            if not token.is_stop:
                kept_lemmas.append(token.lemma_)
        return ' '.join(kept_lemmas)

    df['lemmatized'] = df['cleaned'].apply(lemmatize)
    return df

def remove_stop_words(df):
    """Remove NLTK English stop words from ``df['lemmatized']``.

    The text is split on whitespace, tokens that are in the stop-word list
    are dropped, and the rest are re-joined with single spaces. Working on
    tokens (instead of deleting ``' word '`` substrings) keeps neighbouring
    words separated and also catches stop words at the start or end of the
    text and several stop words in a row.

    The frame is modified in place and returned. Non-string cells are left
    untouched.
    """
    stop_words = set(stopwords.words('english'))

    def drop_stop_words(text):
        if not isinstance(text, str):
            return text
        return ' '.join(word for word in text.split() if word not in stop_words)

    df['lemmatized'] = df['lemmatized'].apply(drop_stop_words)
    return df

def data_preprocessing(df):
    """Run the full text-cleaning pipeline on ``df`` and return the result.

    Stages, applied in order: contraction expansion, removal of digits /
    punctuation / extra spaces, lemmatization, stop-word removal.
    """
    stages = (
        contractions,
        numbers_spaces_punctuation,
        lemmatization,
        remove_stop_words,
    )
    for stage in stages:
        df = stage(df)
    return df

In [7]:
# Needs the spaCy model loaded by lemmatization():
#   python -m spacy download en_core_web_sm
# The pipeline edits the sampled frames in place and returns them.
preprocessed_small, preprocessed_big = [
    data_preprocessing(sample) for sample in (btc_text_small, btc_text_big)
]

In [8]:
def create_tfidf(text):
    """Fit a TF-IDF model on ``text`` and return the dense weights.

    Parameters
    ----------
    text : pandas.Series
        One document per row.

    Returns
    -------
    (pandas.DataFrame, TfidfVectorizer)
        Dense document-term matrix (rows share ``text.index``, one column
        per vocabulary term) and the fitted vectorizer.
    """
    vectorizer = TfidfVectorizer()  # TODO: try other min_df values (e.g. 0.05)
    X = vectorizer.fit_transform(text)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old versions that lack it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    frequencies = pd.DataFrame(X.toarray(), columns=feature_names, index=text.index)
    return frequencies, vectorizer

# TF-IDF weights and fitted vectorizer for each sampled corpus.
(frequences_small, vector_model_small), (frequences_big, vector_model_big) = [
    create_tfidf(corpus['lemmatized'])
    for corpus in (preprocessed_small, preprocessed_big)
]

In [9]:
def perform_tsne(freq_matrix, n=2, random_state=0):
    """Embed ``freq_matrix`` into ``n`` dimensions with t-SNE.

    Parameters
    ----------
    freq_matrix : array-like
        Feature matrix, one row per document.
    n : int
        Number of output dimensions.
    random_state : int or None
        Seed for the t-SNE initialisation. Fixed by default so that
        re-running the notebook reproduces the same embedding; pass
        ``None`` for a different layout on every run.

    Returns
    -------
    The array returned by ``TSNE.fit_transform``.
    """
    return TSNE(n_components=n, random_state=random_state).fit_transform(freq_matrix)

# 2-D t-SNE embedding of the small corpus; hovering a point shows the tweet text.
tsne_vectors = perform_tsne(frequences_small)
tsne_x, tsne_y = tsne_vectors[:, 0], tsne_vectors[:, 1]
px.scatter(x=tsne_x, y=tsne_y, hover_name=btc_text_small['text'])

In [10]:
def perform_pca(freq_matrix, n=2, random_state=0):
    """Project ``freq_matrix`` onto its first ``n`` principal components.

    Parameters
    ----------
    freq_matrix : array-like
        Feature matrix, one row per document.
    n : int
        Number of principal components to keep.
    random_state : int or None
        Seed passed to ``PCA``. It matters when the solver chosen by
        scikit-learn is a randomized one; fixed by default so re-runs
        give the same projection.

    Returns
    -------
    The array returned by ``PCA.fit_transform``.
    """
    return PCA(n_components=n, random_state=random_state).fit_transform(freq_matrix)

# 2-D PCA projection of the big corpus; hovering a point shows the tweet text.
pca_vectors = perform_pca(frequences_big)
pca_x, pca_y = pca_vectors[:, 0], pca_vectors[:, 1]
px.scatter(x=pca_x, y=pca_y, hover_name=btc_text_big['text'])

По графикам, полученным после понижения размерности до двух компонент, сложно что-либо сказать о данных; t-SNE и PCA, вероятнее всего, стоит применять для улучшения показателей каких-либо предсказательных моделей.

In [11]:
def perform_lda(freq_matrix, vectorizer, n=2, n_top_topics=3, n_keywords=3):
    """Return the key words of the most discussed LDA topics.

    An LDA model with ``n`` topics is fitted on ``freq_matrix``. Topics are
    ranked by their total weight over all documents, and for each of the
    top ``n_top_topics`` topics the ``n_keywords`` highest-weighted
    vocabulary terms are collected.

    Parameters
    ----------
    freq_matrix : array-like
        Document-term matrix, one row per document, columns in the
        vectorizer's vocabulary order.
    vectorizer : fitted vectorizer
        Used only to map column positions back to terms.
    n : int
        Number of LDA topics to fit.
    n_top_topics : int
        How many of the heaviest topics to report; at most ``n`` topics are
        returned (the default ``n=2`` therefore yields two topics).
    n_keywords : int
        Key words reported per topic.

    Returns
    -------
    list of str
        Flat list of key words: the first ``n_keywords`` entries belong to
        the most discussed topic, the next ``n_keywords`` to the second
        one, and so on.
    """
    lda = LDA(n_components=n, random_state=0)
    # Rows are documents, columns are topics.
    doc_topic = np.asarray(lda.fit_transform(freq_matrix))

    # Rank topics by how much weight they receive across the whole corpus.
    topic_weights = doc_topic.sum(axis=0)
    top_topics = np.argsort(topic_weights)[::-1][:n_top_topics]

    # get_feature_names() was removed in scikit-learn 1.2.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    keywords = []
    for topic_idx in top_topics:
        word_weights = lda.components_[topic_idx]
        top_words = np.argsort(word_weights)[::-1][:n_keywords]
        keywords.extend(str(vocabulary[word_idx]) for word_idx in top_words)
    return keywords

#perform_lda(frequences_small, vector_model_small)