In [1]:
import pandas as pd
import numpy as np
import csv 
import dateutil.parser
import re

#text processing and cleaning
from textblob import TextBlob
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

#adapted from The AI & DS Channel (2022)

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Max\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Max\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
df = pd.read_csv('cryptocurrencyL.csv') #Reads CSV with tweets we want to add sentiment to

In [3]:
df.head() #Checks CSV

Unnamed: 0,author id,created_at,geo,id,lang,like_count,quote_count,reply_count,retweet_count,source,tweet
0,380085192,2021-01-07 23:59:53+00:00,,1347332160508846080,en,0,0,0,2,Twitter for iPhone,RT @chadandjt: Discussing #bitcoin and #crypto...
1,955234754462445568,2021-01-07 23:59:53+00:00,,1347332158931800065,en,2,0,0,1,Crypto Watch Bot,Best #cryptocurrency risk-adjusted returns in ...
2,3256899989,2021-01-07 23:59:49+00:00,,1347332140556558341,en,0,0,0,0,Twitter Web App,@kucoincom Bee Network is the fastest growing ...
3,3256899989,2021-01-07 23:59:44+00:00,,1347332118955831297,en,0,0,0,0,Twitter Web App,@GoTurkey Bee Network is the fastest growing c...
4,1686414942,2021-01-07 23:59:37+00:00,,1347332091797667841,en,0,0,1,0,Twitter Web App,"@GhisBernard Speaking of Bitcoin, you should ..."


In [4]:
df.columns #Checks columns

Index(['author id', 'created_at', 'geo', 'id', 'lang', 'like_count',
       'quote_count', 'reply_count', 'retweet_count', 'source', 'tweet'],
      dtype='object')

In [5]:
text_df = df.drop(['author id', 'geo', 'id','lang', 'like_count', 'quote_count', 'reply_count',
'retweet_count','source'], axis=1) #Drops specififed headers from tweet dataset
text_df.head() #Check dropped headers

Unnamed: 0,created_at,tweet
0,2021-01-07 23:59:53+00:00,RT @chadandjt: Discussing #bitcoin and #crypto...
1,2021-01-07 23:59:53+00:00,Best #cryptocurrency risk-adjusted returns in ...
2,2021-01-07 23:59:49+00:00,@kucoincom Bee Network is the fastest growing ...
3,2021-01-07 23:59:44+00:00,@GoTurkey Bee Network is the fastest growing c...
4,2021-01-07 23:59:37+00:00,"@GhisBernard Speaking of Bitcoin, you should ..."


In [6]:
text_df.head(None)

Unnamed: 0,created_at,tweet
0,2021-01-07 23:59:53+00:00,RT @chadandjt: Discussing #bitcoin and #crypto...
1,2021-01-07 23:59:53+00:00,Best #cryptocurrency risk-adjusted returns in ...
2,2021-01-07 23:59:49+00:00,@kucoincom Bee Network is the fastest growing ...
3,2021-01-07 23:59:44+00:00,@GoTurkey Bee Network is the fastest growing c...
4,2021-01-07 23:59:37+00:00,"@GhisBernard Speaking of Bitcoin, you should ..."
...,...,...
41252,2022-01-13 23:48:19+00:00,RT @senseishibtoken: The word of $SENSEI is tr...
41253,2022-01-13 23:48:19+00:00,RT @KittyLin2222: Harmony One Robot! 😍😍🥰\nhttp...
41254,2022-01-13 23:48:19+00:00,RT @elis_tech: In order to thank community mem...
41255,2022-01-13 23:48:18+00:00,RT @senseishibtoken: The word of $SENSEI is tr...


In [7]:
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41257 entries, 0 to 41256
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   created_at  41257 non-null  object
 1   tweet       41257 non-null  object
dtypes: object(2)
memory usage: 644.8+ KB


In [8]:
def proc(tweet): #Outlines characters to clean in collected tweet text
    tweet = re.sub(r"https\S+|www\S+https\S+", '',tweet, flags=re.MULTILINE)
    tweet = re.sub(r'\@w+|\#','',tweet)
    tweet = re.sub(r'[^\w\s]','',tweet)
    text_tokens = word_tokenize(tweet)
    filtered_text = [w for w in text_tokens if not w in stop_words]
    return " ".join(filtered_text)

In [9]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Max\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
text_df.tweet = text_df['tweet'].apply(proc) #Applies cleaning function to the dataframe

In [11]:
text_df = text_df.drop_duplicates('tweet') #Removes duplicate tweets

In [12]:
stemmer = PorterStemmer() #stemming function
def stemming(data):
    text = [stemmer.stem(word) for word in data]
    return data

In [13]:
text_df['tweet'] = text_df['tweet'].apply(lambda x: stemming(x)) #applies stemming function to dataframe

In [14]:
text_df.head() #Checks application of stemming function

Unnamed: 0,created_at,tweet
0,2021-01-07 23:59:53+00:00,RT chadandjt Discussing bitcoin cryptocurrency
1,2021-01-07 23:59:53+00:00,Best cryptocurrency riskadjusted returns past ...
2,2021-01-07 23:59:49+00:00,kucoincom Bee Network fastest growing cryptocu...
3,2021-01-07 23:59:44+00:00,GoTurkey Bee Network fastest growing cryptocur...
4,2021-01-07 23:59:37+00:00,GhisBernard Speaking Bitcoin join MEGA Pump Si...


In [15]:
def polarity(tweet): #Polarity function
    return TextBlob(tweet).sentiment.polarity

In [16]:
text_df['polarity'] = text_df['tweet'].apply(polarity) #Applies polarity function to the text dataframe

In [17]:
text_df.head(20) #Check polarity application

Unnamed: 0,created_at,tweet,polarity
0,2021-01-07 23:59:53+00:00,RT chadandjt Discussing bitcoin cryptocurrency,0.0
1,2021-01-07 23:59:53+00:00,Best cryptocurrency riskadjusted returns past ...,0.375
2,2021-01-07 23:59:49+00:00,kucoincom Bee Network fastest growing cryptocu...,0.0
3,2021-01-07 23:59:44+00:00,GoTurkey Bee Network fastest growing cryptocur...,0.0
4,2021-01-07 23:59:37+00:00,GhisBernard Speaking Bitcoin join MEGA Pump Si...,0.5
5,2021-01-07 23:59:37+00:00,jsblokland Bee Network fastest growing cryptoc...,0.0
6,2021-01-07 23:59:36+00:00,RT gemsays Why whole banking system scam Filme...,0.233333
8,2021-01-07 23:59:27+00:00,TheCryptoCandy HGerbal Bee Network fastest gro...,0.0
9,2021-01-07 23:59:23+00:00,YSEC getting closer release Cant wait farm div...,0.0
10,2021-01-07 23:59:22+00:00,The next 14 months going amazing Bitcoin Ether...,0.3


In [18]:
def sentiment(label): #Sentiment function
    if label<0:
        return "Negative"
    elif label ==0:
        return "Neutral"
    elif label>0:
        return "Positive"

In [19]:
text_df['sentiment'] = text_df['polarity'].apply(sentiment) #Applies sentiment function to data frame

In [20]:
text_df.head() #Check sentiment application

Unnamed: 0,created_at,tweet,polarity,sentiment
0,2021-01-07 23:59:53+00:00,RT chadandjt Discussing bitcoin cryptocurrency,0.0,Neutral
1,2021-01-07 23:59:53+00:00,Best cryptocurrency riskadjusted returns past ...,0.375,Positive
2,2021-01-07 23:59:49+00:00,kucoincom Bee Network fastest growing cryptocu...,0.0,Neutral
3,2021-01-07 23:59:44+00:00,GoTurkey Bee Network fastest growing cryptocur...,0.0,Neutral
4,2021-01-07 23:59:37+00:00,GhisBernard Speaking Bitcoin join MEGA Pump Si...,0.5,Positive


In [21]:
text_df.to_csv("cryptoL1.csv") #Creates CSV with cleaned tweet text, sentiment labbels and removed personal information