In [None]:
import pandas as pd
import numpy as nnp
import matplotlib.pyplot as plt
import textblob
import os
import seaborn as sns

In [None]:
# Switch into the project folder and list what it contains.
project_dir = "/content/drive/MyDrive/Sentiment analysis of crypto through twitter/"
os.chdir(project_dir)
os.listdir()

['__MACOSX',
 'twitter_data_clean.csv',
 'bitcoin.ipynb',
 'bitcoin-usd.csv',
 'config.ini',
 'twitter_data.csv',
 'twitter.ipynb',
 '.ipynb_checkpoints']

In [None]:
# Read the raw tweet export into a DataFrame and preview its first rows.
tweets_csv = "/content/drive/MyDrive/Sentiment analysis of crypto through twitter/twitter_data.csv"
df = pd.read_csv(tweets_csv)
df.head()

Unnamed: 0,tweets,likes,time
0,RT @MatthewHyland_: #Bitcoin has broken out of...,0,2022-03-28 01:05:39
1,"Cryptocurrency prices today: Bitcoin, Ether, C...",0,2022-03-28 01:05:39
2,@ChinaPumpWXC Vitteey MEXC : Vitteey Token Is ...,0,2022-03-28 01:05:39
3,RT @CedYoungelman: #Bitcoin Facts https://t.co...,0,2022-03-28 01:05:39
4,"@BigCheds Bitcoin will make many millionaires,...",0,2022-03-28 01:05:39


In [None]:
# Preview the last rows of the raw data.
df.tail()

Unnamed: 0,tweets,likes,time
1995,#Bitcoin 45k SELL WALL has completely been bro...,0,2022-03-28 00:56:46
1996,RT @zerohedge: *BITCOIN'S WEEKEND RALLY ERASES...,0,2022-03-28 00:56:46
1997,RT @APompliano: Bitcoin miners were paid $42.4...,0,2022-03-28 00:56:45
1998,@instagram wuold probably become the new OpenS...,1,2022-03-28 00:56:45
1999,buckle the fuck up 🚀 👀 #bitcoin #crypto $btc h...,9,2022-03-28 00:56:44


In [None]:
# NOTE: .size counts every cell (rows x columns), not rows:
# 6000 here is 2000 rows x 3 columns.
df.size

6000

In [None]:
# Summarise the columns: names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweets  2000 non-null   object
 1   likes   2000 non-null   int64 
 2   time    2000 non-null   object
dtypes: int64(1), object(2)
memory usage: 47.0+ KB


`tweets` and `time` are stored as `object` (text) columns, while `likes` is a numeric `int64` column.

# Clean the data


In [None]:
# Count the missing values in each column.
df.isna().sum()

tweets    0
likes     0
time      0
dtype: int64

As a running example, we will look at one specific tweet: the one at row index 150.

# Clean the text for sentiment 

In [None]:
# The CSV column is named "tweets" (lowercase, see the df.info() output above),
# so reading df["Tweets"] raised a KeyError. Store the cleaned text in a new
# "Tweets" column: the raw text is preserved and every later cell that reads
# df["Tweets"] works.
# NOTE: cleanTxt is defined in the next cell - run that cell first.
df["Tweets"]=df["tweets"].apply(cleanTxt)
df.head()

In [None]:
from typing import TextIO
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


def cleanTxt(text, stop_words=None):
  """Normalise a raw tweet for sentiment analysis.

  Steps, in order: drop URLs, drop the retweet marker "RT", replace every
  non-letter character with a space, lowercase, and remove stopwords.

  Args:
    text: Raw tweet text. Non-string values (e.g. NaN) are treated as empty.
    stop_words: Optional collection of lowercase words to drop. When omitted,
      NLTK's English stopword list is used (loaded once, then cached).

  Returns:
    The cleaned tweet as one lowercase, single-space-separated string.
  """
  if not isinstance(text, str):
    return ""
  if stop_words is None:
    # Load the NLTK list once instead of on every call.
    # NOTE(review): the default list likely contains negations such as "not";
    # pass a custom stop_words collection if those should be kept.
    if not hasattr(cleanTxt, "_stop_words"):
      cleanTxt._stop_words = frozenset(stopwords.words("english"))
    stop_words = cleanTxt._stop_words
  # URLs and the RT marker must be removed BEFORE punctuation is stripped;
  # otherwise "https://t.co/x" has already been split into plain letter runs
  # and the URL pattern can no longer match it.
  text=re.sub(r"https?://\S+|www\.\S+"," ",text)
  text=re.sub(r"\bRT\b"," ",text)
  text=re.sub(r"[^a-zA-Z]"," ",text)
  words=text.lower().split()
  # Filter whole words (the old loop iterated characters and removed nothing).
  return " ".join(word for word in words if word not in stop_words)
  
  


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Preview the last rows again, now that the cleaning cell has been applied.
df.tail()

# Apply Sentiment polarity for the Tweets

Let's extract one specific tweet: the one at row index 150.

In [None]:
# Cleaned text of the tweet at row label 150.
df.loc[150, 'Tweets']

We use the **TextBlob** library to analyse the polarity and the subjectivity of the tweets' sentiment.


*   Polarity ranges from -1 to 1.
*   Tweets with a negative polarity express negative sentiment.
*   Tweets with a positive polarity express positive sentiment.
*   Subjectivity ranges from 0 to 1.
*   It quantifies how much of the text is personal opinion rather than factual information.
*   The higher the subjectivity, the more the tweet reflects the author's personal opinion.
*   A value near zero means the tweet consists mostly of factual information rather than personal opinion.



In [None]:
from textblob import TextBlob
# Score the example tweet (row label 150) and print its sentiment.
text_blob_object = TextBlob(df.loc[150, 'Tweets'])
print(text_blob_object.sentiment)

The tweet at row 150 has a positive sentiment, because its polarity is positive; its subjectivity is above 0.5, so it is more personal opinion than factual information.

Let's add a sentiment polarity column to the dataset. As mentioned above, a positive polarity indicates positive sentiment and a negative polarity indicates negative sentiment.

We will create **functions** that compute the polarity and subjectivity of each tweet and store them as new columns at the end of the table.

---



In [None]:
def find_pol(Tweets):
  """Return the TextBlob polarity score of the given text."""
  sentiment = TextBlob(Tweets).sentiment
  return sentiment.polarity

def find_subjectivity(Tweets):
  """Return the TextBlob subjectivity score of the given text."""
  sentiment = TextBlob(Tweets).sentiment
  return sentiment.subjectivity


# Score every cleaned tweet and store each metric in its own column.
df["Subjectivity"] = df["Tweets"].map(find_subjectivity)

df['Sentiment_Polarity'] = df['Tweets'].map(find_pol)
df.head()


In [None]:
# Plot the polarity and subjectivity
plt.figure(figsize=(8,6))
# One vectorised call draws every tweet; calling plt.scatter once per row
# is needlessly slow.
plt.scatter(df['Sentiment_Polarity'],df["Subjectivity"],color="Blue")

plt.title("Sentiment Analysis")
plt.xlabel("Polarity")
plt.ylabel("Subjectivity")
# The original `plt.show` had no parentheses, so the function was never called.
plt.show()

In [None]:
def getAnalysis(score):
  """Turn a polarity score into a sentiment label.

  A score below zero is "Negative", a score equal to zero is "Neutral" and
  every other score is 'Positive'.
  """
  if score<0:
    label="Negative"
  else:
    label="Neutral" if score==0 else 'Positive'
  return label

# Label every tweet from its polarity score.
df["Analysis"] = df["Sentiment_Polarity"].map(getAnalysis)
df.head()

In [None]:
# Share of tweets labelled "Positive", in percent (one decimal place)
ptweets = df.loc[df["Analysis"] == "Positive", "Tweets"]

round(len(ptweets) / len(df) * 100, 1)

In [None]:
# Share of tweets labelled "Negative", in percent (one decimal place)
ntweets = df.loc[df["Analysis"] == "Negative", "Tweets"]

round(len(ntweets) / len(df) * 100, 1)

In [None]:
# Share of tweets labelled "Neutral", in percent (one decimal place)
nutweets = df.loc[df["Analysis"] == "Neutral", "Tweets"]

round(len(nutweets) / len(df) * 100, 1)

In [None]:
# Show how many tweets carry each sentiment label
df["Analysis"].value_counts()

In [None]:
# Distribution of the polarity scores over all tweets
sns.displot(df['Sentiment_Polarity'])

This graph shows that most tweets have a neutral polarity; of the rest, more are positive than negative.

Now we will pull the tweets with negative polarity out of our data and look at only the first five of them.


In [None]:
# First five tweets whose polarity is below zero (in row order, not ranked)
negative_sentiments = df.loc[df["Sentiment_Polarity"] < 0, "Tweets"].head()
print(negative_sentiments)


These are the first five tweets with negative polarity (not necessarily the most negative ones). A polarity below zero indicates negative sentiment.

In [None]:
# Text of the tweet at row label 32.
df.loc[32, "Tweets"]

Now we will pull out the tweets with positive polarity and look at only the first five of them.

In [None]:
# First five tweets whose polarity is above zero (in row order, not ranked)
positive_sentiments = df.loc[df["Sentiment_Polarity"] > 0, "Tweets"].head()
print(positive_sentiments)

A polarity greater than zero indicates positive sentiment.

In [None]:
# Text of the tweet at row label 2.
df.loc[2, "Tweets"]

In [None]:
#Show the value counts
# A bare expression in the middle of a cell displays nothing, so print the
# counts explicitly and reuse them for the chart.
sentiment_counts=df["Analysis"].value_counts()
print(sentiment_counts)

#plot and visualize the counts
# Label the axes AFTER plotting, on the Axes pandas returns: labels set
# beforehand can be overwritten by the bar plot's own axis labelling.
ax=sentiment_counts.plot(kind="bar")
ax.set_title("Sentiment Analysis")
ax.set_xlabel("Sentiments")
ax.set_ylabel("counts")
plt.show()


# TF-IDF Vectorizing

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report


In [None]:
# Turn the cleaned tweets into a dense TF-IDF matrix with at most 5000 features;
# the sentiment labels are the prediction target.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so test-set vocabulary leaks into the features - consider fitting on
# the training rows only.
tfidf = TfidfVectorizer(max_features=5000)
y = df["Analysis"]

X = tfidf.fit_transform(df["Tweets"]).toarray()



TF-IDF is often preferred over a plain count vectorizer because it considers not only how frequently a word occurs in the corpus but also how important that word is. We can then remove the words that are less important for the analysis, which reduces the input dimensions and makes model building less complex.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state=0 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Fix the seed so repeated runs give the same model; LinearSVC accepts a
# random_state for its internal shuffling. 0 matches the seed used for the
# train/test split.
classifier=LinearSVC(random_state=0)
classifier.fit(X_train,y_train)

In [None]:
# from sklearn.naive_bayes import GaussianNB
# classifier=GaussianNB()

# classifier.fit(X_train,y_train)

In [None]:
# from sklearn.naive_bayes import MultinomialNB
# classifier=MultinomialNB()
# classifier.fit(X_train,y_train)

In [None]:
# from sklearn.svm import SVC
# classifier=SVC()
# classifier.fit(X_train,y_train)

In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# classifier=KNeighborsClassifier(n_neighbors=5)
# classifier.fit(X_train,y_train)

In [None]:
# from sklearn.linear_model import LogisticRegression
# classifier=LogisticRegression()
# classifier.fit(X_train,y_train)

In [None]:
# Predict a sentiment label for every row of the held-out test features.
y_pred=classifier.predict(X_test)

In [None]:
# Print the classification report comparing the true and predicted test labels.
print(classification_report(y_test,y_pred))

In [None]:
# Build the confusion matrix from the true and predicted test labels and print it
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,y_pred)
print(cm)

In [None]:
from sklearn import metrics
# Print the accuracy score of the predictions on the test set.
print(metrics.accuracy_score(y_test,y_pred))