In [2]:
import pandas as pd
import html
#nltk.download() #only for the first time running it
from nltk.corpus import stopwords
import nltk
import matplotlib.pyplot as plt
import json
import regex as re
from textblob import TextBlob
import numpy
from wordcloud import WordCloud, STOPWORDS

import igraph
import networkx as nx
import tqdm
import pickle

In [None]:
#create dataframe on media posts USE FOR ALL DATA - Don't run now
def createDataFrame(path="datain/nft_search_tweets_sample.jsonl"):
    """Yield one tuple per English, non-referencing tweet that has hashtags and media.

    Parameters
    ----------
    path : str
        JSON Lines file to read (one tweet object per line). Defaults to the
        sample file this notebook has always read.

    Yields
    ------
    tuple
        (id, text, hashtags, retweet_count, quote_count, reply_count,
        like_count, media_keys)

    A record is skipped when its ``lang`` is not ``'en'``, when it has a
    non-empty ``referenced_tweets`` list, or when any of the fields above is
    missing.
    """
    with open(path, encoding='utf8') as f:
        for line in f:
            if not line.strip():
                # Blank lines are not JSON; skip them instead of crashing in json.loads.
                continue
            row = json.loads(line)
            if row.get("lang") != 'en' or row.get('referenced_tweets'):
                continue
            try:
                metrics = row['public_metrics']
                record = (row['id'],
                          row['text'],
                          row['entities']['hashtags'],
                          metrics['retweet_count'],
                          metrics['quote_count'],
                          metrics['reply_count'],
                          metrics['like_count'],
                          row['attachments']['media_keys'])
            except KeyError:
                # A required field is absent (e.g. no hashtags or no media): skip the tweet.
                continue
            yield record

# Build the media-post frame, persist it, then keep the 100 most-interacted-with tweets.
# Column names are passed at construction so an empty generator still yields a
# correctly-labelled (empty) frame instead of failing on the `.columns` assignment.
df = pd.DataFrame(
    createDataFrame(),
    columns=['id', 'text', 'hashtags', 'retweet_count', 'quote_count', 'reply_count', 'like_count', 'media_keys'],
)
# orient='records' never writes the index, so no `index` argument is passed
# (index=True together with orient='records' raises ValueError on pandas >= 2.1).
df.to_json('datain/nft_tweets.jsonl', orient='records', lines=True)
output = pd.read_json("datain/nft_tweets.jsonl", lines=True)
output['total'] = output[['retweet_count', 'quote_count', 'reply_count', 'like_count']].sum(axis=1)
# Sort by highest total and keep the top 100.
output = output.sort_values(by='total', ascending=False).head(100)
output.to_json('datain/nft_top_100_tweets.jsonl', orient='records', lines=True)

In [None]:
# Load the top-100 tweets file (JSON Lines: one record per line) into a DataFrame.
data = pd.read_json("datain/nft_top_100_tweets.jsonl", lines=True)

In [None]:
# Load the raw tweet sample (JSON Lines); the cleaning step further down overwrites its 'text' column.
sample_data = pd.read_json("datain/nft_search_tweets_sample.jsonl", lines=True)

In [None]:
#prep stop words
# NLTK's English stop words, extended with the two tokens 'rt' and 'nft'.
stop_words = [*stopwords.words('english'), 'rt', 'nft']
# stop_words.append('#nft')

#function for cleaning a tweet (remove mentions, hashtags, links, html entities, stop words. And make sure it's only letters)
def clean_tweet(tweet, stopword_set=None):
    '''
    Normalise one tweet to lower-case words separated by single spaces.

    Steps, in order: lower-case; remove @mentions and #hashtags; remove links
    (anything starting with "http"); remove HTML entity names ("&amp" etc.);
    replace every run of characters other than letters, '#' and space with a
    space; drop stop words.

    Parameters:
        tweet: the raw tweet text (str).
        stopword_set: optional iterable of words to drop. When omitted, the
            module-level `stop_words` list is used.

    Returns:
        The cleaned text (str); empty string if nothing survives.
    '''
    if stopword_set is None:
        stopword_set = stop_words
    # Set membership is O(1) per word; scanning a list is O(len(list)) per word.
    excluded = frozenset(stopword_set)

    tweet = str.lower(tweet)
    # Patterns are raw strings so sequences like \S and \w reach the regex engine
    # untouched (in a normal string they are invalid escape sequences, which
    # newer Python versions warn about).
    tweet = ' '.join(re.sub(r"(@[A-Za-z0-9_]+)|(#[A-Za-z0-9_]+)", " ", tweet).split())  # mentions and hashtags
    tweet = re.sub(r"(http\S+|http)", "", tweet, flags=re.MULTILINE)  # links
    tweet = re.sub(r'&\w+', "", tweet)  # html entities
    tweet = re.sub(r'[^a-zA-Z# ]+', ' ', tweet)  # keep only letters, '#' and spaces
    return ' '.join(word for word in tweet.split() if word not in excluded)

#clean data
# Replace the whole column in one assignment. Element-wise writes of the form
# frame["text"][i] = value are chained indexing: pandas warns about them and,
# with copy-on-write enabled, they do not modify the frame at all.
# No html.unescape is applied afterwards: clean_tweet leaves only letters, '#'
# and spaces, so there are no entities left to unescape.
sample_data["text"] = sample_data["text"].map(clean_tweet)

In [None]:
# orient='records' never serialises the index, so no `index` argument is passed
# (index=True together with orient='records' raises ValueError on pandas >= 2.1).
sample_data.to_json('datain/nft_search_tweets_sample_cleaned.jsonl', orient='records', lines=True)

In [None]:
# Unigram frequency table over the whole 'text' column, exported as an HTML table.
tweet_text = sample_data['text'].str.cat(sep=' ')
tokens = nltk.word_tokenize(tweet_text)

# One row per token (1-grams), then count identical rows.
unigram_rows = pd.DataFrame(nltk.ngrams(tokens, 1))
most_common = unigram_rows.value_counts().to_frame()

most_common.to_html('dataout/term_freq.html')

In [None]:
#nft_top_100_sentiment <- Has interactions and sentiment totals
# NOTE(review): this module-level `positivity` is not read by getSentiment() below,
# which binds its own local value for every row.
positivity = ''
# NOTE(review): no cell visible in this notebook writes this file; presumably it is
# produced elsewhere — confirm before a Restart & Run All.
output = pd.read_json("datain/nft_top_100_cleaned_interactions.jsonl", lines = True)
def getSentiment(frame=None, text_column='text'):
    """Yield (polarity, subjectivity, positivity) for every row of `frame`, in row order.

    Parameters
    ----------
    frame : DataFrame, optional
        Frame holding the texts. Defaults to the module-level `output`.
    text_column : str
        Name of the column containing the text to score.

    Yields
    ------
    tuple
        TextBlob polarity, TextBlob subjectivity, and a label:
        'mostly_positive' (>= 0.7), 'positive' (0.4 .. 0.7 exclusive),
        'mostly_negative' (<= -0.7), 'negative' (-0.7 .. -0.4 exclusive),
        otherwise 'nuetral'.

    NOTE(review): a later cell defines another getSentiment() (scoring
    all_data['text_y']); once that cell runs it replaces this definition.
    """
    if frame is None:
        frame = output
    # Iterate over the values themselves: feeding index labels to .iloc only works
    # when the index happens to be 0..n-1.
    for text in frame[text_column]:
        sentiment = TextBlob(text).sentiment
        polarity = sentiment.polarity
        if polarity >= 0.7:
            positivity = 'mostly_positive'
        elif polarity <= -0.7:
            positivity = 'mostly_negative'
        elif polarity < -0.4:
            positivity = 'negative'
        elif polarity > 0.4:
            positivity = 'positive'
        else:
            # Spelling kept as-is: this label is written to the output file and may be matched downstream.
            positivity = 'nuetral'
        yield polarity, sentiment.subjectivity, positivity
        
# Build the sentiment columns on the same index as `output` so the assignments below
# line up row-for-row however `output` is indexed.
df = pd.DataFrame(getSentiment(), columns=['polarity', 'subjectivity', 'positivity'], index=output.index)
output['polarity'] = df['polarity']
output['subjectivity'] = df['subjectivity']
output['positivity'] = df['positivity']
# orient='records' never writes the index (index=True with it raises ValueError on pandas >= 2.1).
output.to_json('datain/nft_top_100_cleaned_interactions_sentiment.jsonl', orient='records', lines=True)

In [None]:
# Word cloud of the top-100 NFT tweets
top_100_word_cloud = pd.read_json("datain/nft_top_100_cleaned_interactions_sentiment.jsonl", lines=True)
# The word-cloud stop words get their own name. Rebinding `stopwords` here would shadow
# the `nltk.corpus.stopwords` import that the stop-word prep cell calls `.words()` on,
# so that cell would fail if re-run after this one.
cloud_stopwords = set(STOPWORDS)
cloud_stopwords.update(("t", "co", "https", "amp", "RT"))
# Generate a word cloud image
wordcloud = WordCloud(
    stopwords=cloud_stopwords,
    background_color='white',
    max_words=1000,
    contour_color='#023075',
    contour_width=3,
    colormap='rainbow',
).generate(' '.join(top_100_word_cloud['text']))
# create image as cloud
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
# store to file (before show, while the figure is still current)
plt.savefig("cloud.png", format="png")
plt.show()

In [None]:
# Comment sentiment
#### Read from nft_search_tweets_sample - need replied to with text and author
#### Original Poster
def createOriginalDataFrame(path="datain/nft_search_tweets_sample.jsonl"):
    """Yield one tuple per English tweet that references no other tweet and has media.

    Parameters
    ----------
    path : str
        JSON Lines file to read (one tweet object per line).

    Yields
    ------
    tuple
        (id, text, author_id, retweet_count, quote_count, reply_count,
        like_count, media_keys, created_at)

    A record is skipped when its ``lang`` is not ``'en'``, when it has a
    non-empty ``referenced_tweets`` list, or when any field above is missing.
    """
    with open(path, encoding='utf8') as f:
        for line in f:
            if not line.strip():
                # Blank lines are not JSON; skip them instead of crashing in json.loads.
                continue
            row = json.loads(line)
            if row.get("lang") != 'en' or row.get('referenced_tweets'):
                continue
            try:
                metrics = row['public_metrics']
                record = (row['id'],
                          row['text'],
                          row['author_id'],
                          metrics['retweet_count'],
                          metrics['quote_count'],
                          metrics['reply_count'],
                          metrics['like_count'],
                          row['attachments']['media_keys'],
                          row['created_at'])
            except KeyError:
                # A required field is absent: skip the tweet.
                continue
            yield record

def createRepliesDataFrame(path="datain/nft_search_tweets_sample.jsonl"):
    """Yield one tuple per English tweet that references another tweet, is a reply, and has media.

    Parameters
    ----------
    path : str
        JSON Lines file to read (one tweet object per line).

    Yields
    ------
    tuple
        (id, text, author_id, in_reply_to_user_id, retweet_count, quote_count,
        reply_count, like_count, media_keys, created_at)

    A record is skipped when its ``lang`` is not ``'en'``, when
    ``referenced_tweets`` is missing or empty, or when any field above is
    missing (so referencing tweets without ``in_reply_to_user_id`` or without
    attachments are dropped).
    """
    with open(path, encoding='utf8') as f:
        for line in f:
            if not line.strip():
                # Blank lines are not JSON; skip them instead of crashing in json.loads.
                continue
            row = json.loads(line)
            if row.get("lang") != 'en' or not row.get('referenced_tweets'):
                continue
            try:
                metrics = row['public_metrics']
                record = (row['id'],
                          row['text'],
                          row['author_id'],
                          row['in_reply_to_user_id'],
                          metrics['retweet_count'],
                          metrics['quote_count'],
                          metrics['reply_count'],
                          metrics['like_count'],
                          row['attachments']['media_keys'],
                          row['created_at'])
            except KeyError:
                # A required field is absent: skip the tweet.
                continue
            yield record
#create csv
# Column names are passed at construction so an empty generator still yields a labelled
# (empty) frame instead of failing on a later `.columns` assignment.
# NOTE: the third value yielded by createOriginalDataFrame() is the poster's author_id.
# It is labelled 'in_reply_to_user_id' here, presumably so that the later merge on that
# column pairs each original tweet with the replies addressed to its author.
original_data = pd.DataFrame(
    createOriginalDataFrame(),
    columns=['id', 'text', 'in_reply_to_user_id', 'retweet_count', 'quote_count', 'reply_count', 'like_count', 'media_keys', 'created_at'],
)
# orient='records' never writes the index (index=True with it raises ValueError on pandas >= 2.1).
original_data.to_json('datain/nft_original_tweets.jsonl', orient='records', lines=True)
# 'total' is added after the export, so it is not part of the file written above.
original_data['total'] = original_data[['retweet_count', 'quote_count', 'reply_count', 'like_count']].sum(axis=1)

replies_data = pd.DataFrame(
    createRepliesDataFrame(),
    columns=['id', 'text', 'reply_author_id', 'in_reply_to_user_id', 'retweet_count', 'quote_count', 'reply_count', 'like_count', 'media_keys', 'created_at'],
)
replies_data.to_json('datain/nft_replies_tweets.jsonl', orient='records', lines=True)
replies_data['total'] = replies_data[['retweet_count', 'quote_count', 'reply_count', 'like_count']].sum(axis=1)

In [None]:
# Join originals and replies on the shared 'in_reply_to_user_id' column (pd.merge's default
# inner join; overlapping column names get the _x / _y suffixes), drop rows with missing
# values, and renumber the result 0..n-1. The renumbering matters: dropna() leaves gaps in
# the index, and later cells index this frame by position and assign columns by label.
# (The bare original_data.reindex() / replies_data.reindex() calls were removed: with no
# arguments and the result discarded they had no effect.)
all_data = (
    pd.merge(original_data, replies_data, on='in_reply_to_user_id')
    .dropna()
    .reset_index(drop=True)
)

# replies_data['matched_id'] = pd.Series((original_data.original_author_id.isin(replies_data.reply_author_id)))

In [None]:
def getSentiment(frame=None, text_column='text_y'):
    """Yield (polarity, subjectivity, positivity) for every row of `frame`, in row order.

    Parameters
    ----------
    frame : DataFrame, optional
        Frame holding the texts. Defaults to the module-level `all_data`.
    text_column : str
        Name of the column containing the text to score ('text_y' by default).

    Yields
    ------
    tuple
        TextBlob polarity, TextBlob subjectivity, and a label:
        'mostly_positive' (>= 0.7), 'positive' (0.4 .. 0.7 exclusive),
        'mostly_negative' (<= -0.7), 'negative' (-0.7 .. -0.4 exclusive),
        otherwise 'nuetral'.
    """
    if frame is None:
        frame = all_data
    # Iterate over the values themselves. Feeding index labels to .iloc is only correct
    # for a 0..n-1 index; on a frame whose index has gaps it picks the wrong rows or
    # raises IndexError.
    for text in frame[text_column]:
        sentiment = TextBlob(text).sentiment
        polarity = sentiment.polarity
        if polarity >= 0.7:
            positivity = 'mostly_positive'
        elif polarity <= -0.7:
            positivity = 'mostly_negative'
        elif polarity < -0.4:
            positivity = 'negative'
        elif polarity > 0.4:
            positivity = 'positive'
        else:
            # Spelling kept as-is: this label is written to the output file and may be matched downstream.
            positivity = 'nuetral'
        yield polarity, sentiment.subjectivity, positivity

# Build the sentiment columns on the same index as `all_data`. Column assignment aligns on
# index labels, so a default 0..n-1 index here would leave NaN / shifted values wherever
# `all_data`'s index is not exactly 0..n-1.
df = pd.DataFrame(getSentiment(), columns=['polarity', 'subjectivity', 'positivity'], index=all_data.index)
all_data['polarity'] = df['polarity']
all_data['subjectivity'] = df['subjectivity']
all_data['positivity'] = df['positivity']
# orient='records' never writes the index (index=True with it raises ValueError on pandas >= 2.1).
all_data.to_json('datain/matched_author_replied_sentiment.jsonl', orient='records', lines=True)

In [None]:
def createTagDataFrame(path="datain/nft_search_tweets_sample.jsonl"):
    """Yield (id, text, hashtags) for every English tweet that carries hashtag entities.

    Parameters
    ----------
    path : str
        JSON Lines file to read (one tweet object per line).

    Yields
    ------
    tuple
        (id, text, entities.hashtags)

    A record is skipped when its ``lang`` is not ``'en'`` or when ``id``,
    ``text`` or ``entities.hashtags`` is missing.
    """
    with open(path, encoding='utf8') as f:
        for line in f:
            if not line.strip():
                # Blank lines are not JSON; skip them instead of crashing in json.loads.
                continue
            row = json.loads(line)
            if row.get("lang") != 'en':
                continue
            try:
                record = (row['id'],
                          row['text'],
                          row['entities']['hashtags'])
            except KeyError:
                # No hashtags (or another required field) on this tweet: skip it.
                continue
            yield record
#create csv
# Column names are passed at construction so an empty generator still yields a labelled frame.
tag_data = pd.DataFrame(createTagDataFrame(), columns=['id', 'text', 'hashtags'])
# One row per hashtag entity across all tweets. json_normalize already returns a
# DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
new_df = pd.concat([pd.json_normalize(x) for x in tag_data['hashtags']], ignore_index=True)

# tag -> number of occurrences, as a dict and as a one-column frame indexed by tag.
tags_count = new_df['tag'].value_counts().to_dict()
counts = pd.DataFrame.from_dict(tags_count, orient='index')


In [None]:
# `tags_count` is a plain dict and has no .to_html(); the DataFrame built from it is `counts`.
counts.to_html("dataout/tags_count.html")


In [None]:
interactions_sentiment_df = pd.read_json('datain/matched_author_replied_sentiment.jsonl', lines=True)
# Parse both timestamp columns explicitly rather than relying on how an astype cast
# interprets strings. utc=True normalises any offset-bearing values to UTC;
# tz_localize(None) then drops the timezone so the columns are naive datetimes, as the
# astype('datetime64[ns]') cast produced.
for _ts_col in ('created_at_x', 'created_at_y'):
    interactions_sentiment_df[_ts_col] = (
        pd.to_datetime(interactions_sentiment_df[_ts_col], utc=True).dt.tz_localize(None)
    )

In [None]:
#interactions vs time
# (A sort_values call used to precede the plot; its result was discarded, so it had no
# effect and was removed.)
# NOTE(review): several rows share the same hour, so their bars are drawn on top of each
# other and only the tallest bar per hour is visible. Aggregate per hour first if a
# per-hour total is what is wanted.
plt.bar(interactions_sentiment_df['created_at_x'].dt.hour, interactions_sentiment_df['total_x'])
plt.xlabel('hour of created_at_x')
plt.ylabel('total_x (interactions)')

In [None]:
# Nodes are tweet ids: each row of all_data contributes one edge between a reply (id_y)
# and the original tweet it was matched with (id_x).
G = nx.from_pandas_edgelist(all_data, 'id_y', 'id_x') #Turn df into graph
# Fixed seed: spring_layout starts from random positions, so without one the picture
# changes on every run.
pos = nx.spring_layout(G, seed=42) #specify layout for visual

f, ax = plt.subplots(figsize=(50, 50))
plt.style.use('ggplot')
# Draw explicitly on the axes created above rather than on whatever axes happen to be current.
nodes = nx.draw_networkx_nodes(G, pos,
                               alpha=0.8, ax=ax)
nodes.set_edgecolor('k')
nx.draw_networkx_labels(G, pos, font_size=8, ax=ax)
nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.2, ax=ax)


In [None]:
# Display how many edges the graph G contains.
G.number_of_edges()

In [4]:
#total vs polarity corr
sentiment_vs_interactions = pd.read_json('datain/nft_top_100_cleaned_interactions_sentiment.jsonl', lines=True)

# Cross-correlation of polarity against total interactions for lags -9..9.
lines = plt.xcorr(sentiment_vs_interactions['polarity'], sentiment_vs_interactions['total'], maxlags=9, usevlines=True)

plt.title('Sentiment vs Total Interactions')

# xcorr plots the correlation coefficient (y) at each lag (x), so the axes are labelled
# with those quantities rather than with the names of the two input series.
plt.xlabel('Lag')

plt.ylabel('Cross-correlation')

plt.grid(True)

plt.axhline(0, color='red', lw=2)

plt.show()

#Does this mean no correlation?

AttributeError: module 'matplotlib.pyplot' has no attribute 'ycorr'

In [None]:
# adds the title
plt.title('Correlation between Polarity and Interactions')

interactions = sentiment_vs_interactions['total']
polarity_values = sentiment_vs_interactions['polarity']

# plot the data
plt.scatter(interactions, polarity_values)

# Least-squares best-fit line through the points. (plt.xcorr does not fit a line: it draws
# lag/correlation stems, which are in different units from this scatter plot.)
slope, intercept = numpy.polyfit(interactions, polarity_values, 1)
fit_x = numpy.array([interactions.min(), interactions.max()])
plt.plot(fit_x, slope * fit_x + intercept, color='red')

# Labelling axes
plt.xlabel('interactions')
plt.ylabel('polarity')

# Correlation coefficient between the two columns (shown as the cell's result).
sentiment_vs_interactions["total"].corr(sentiment_vs_interactions["polarity"])

In [48]:
# Expand each row's 'hashtags' value into positional columns (0, 1, 2, ...) with
# apply(pd.Series), append those columns to the frame, and drop the original 'hashtags' column.
df_expected = pd.concat([sentiment_vs_interactions, sentiment_vs_interactions['hashtags'].apply(pd.Series)], axis = 1).drop('hashtags', axis = 1)

# For each positional column i: expand its cell values into their own columns, append them,
# then drop column i together with the 0, 'start' and 'end' columns.
# NOTE(review): 13 is presumably the largest number of hashtags on any tweet in this file —
# confirm; a different dataset would need a different bound.
# NOTE(review): judging by the rendered output below, the columns that survive are all
# labelled 'tag' (one per hashtag position), i.e. the labels are duplicated — verify before
# selecting them by name.
for i in range(13):
    df_expected = pd.concat([df_expected, df_expected[i].apply(pd.Series)], axis = 1).drop([i, 0, 'start', 'end'], axis = 1)

# Display the expanded frame.
df_expected

Unnamed: 0,id,text,retweet_count,quote_count,reply_count,like_count,media_keys,total,polarity,subjectivity,...,tag,tag.1,tag.2,tag.3,tag.4,tag.5,tag.6,tag.7,tag.8,tag.9
0,1369983392155009024,celebrating lilmoon rockets reveal giveway wor...,917,186,1920,1076,[3_1369983389210710019],4099,0.311111,0.344444,...,BinanceSmartChain,BNB,,,,,,,,
1,1395797789662187520,fleek officially running allowing dapps run we...,820,22,57,2522,[3_1395783651007377411],3421,0.083333,0.583333,...,,,,,,,,,,
2,1393223147630075904,sft live ready trade mcap circ supply price ac...,1545,5,5,865,[3_1393223052079546377],2420,0.067677,0.533333,...,,,,,,,,,,
3,1375754272437116928,let knock nfts head want revenge please watch ...,762,48,26,1326,[7_1375754125414166528],2162,0.212500,0.550000,...,,,,,,,,,,
4,1380255678925742080,new drop coming tomorrow stay tuned,97,7,35,1836,[3_1380255253862379521],1975,0.136364,0.454545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1369447537111875584,play cards right audio experience available et...,36,4,15,101,[7_1369447396887891972],156,0.228571,0.645238,...,,,,,,,,,,
96,1374851744157605888,know dropping one tomorrow single edition,7,1,16,130,[3_1374851735928377349],154,-0.071429,0.214286,...,Ethereum,bitcoin,,,,,,,,
97,1383371108137668608,rainbow part rainbow series eth,15,2,9,125,"[3_1383369061678010374, 3_1383370426395500556]",151,0.000000,0.000000,...,,,,,,,,,,
98,1378962295171117056,night show got,8,0,76,66,[16_1378962285079621635],150,0.000000,0.000000,...,,,,,,,,,,
