<a href="https://www.kaggle.com/code/jeliusheneriko/sentiment-analysis?scriptVersionId=182053484" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra


import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
## Additional libraries for plotting, text processing and modelling

import matplotlib.pyplot as plt
import seaborn as sns
import re
from matplotlib import style
style.use('ggplot')
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,ConfusionMatrixDisplay


In [None]:
# Locating the dataset files
import os

# Print the full path of every file under the Pfizer vaccine tweets directory.
directory_path = '/kaggle/input/pfizer-vaccine-tweets'
for folder, _, file_names in os.walk(directory_path):
    for file_name in file_names:
        print(os.path.join(folder, file_name))


In [None]:
# Load the vaccination tweets CSV into a DataFrame.
df = pd.read_csv('/kaggle/input/pfizer-vaccine-tweets/vaccination_tweets.csv')
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Checking data info: summary of the raw DataFrame as reported by DataFrame.info()
df.info()

In [None]:
# Count the missing values in each column of the raw data.
df.isnull().sum()

In [None]:
# Show the column labels of the raw DataFrame (used to decide what to drop next).
df.columns

In [None]:
# Remove the listed metadata columns by name; the later cells work on the
# 'text' column of the resulting frame.
text_df = df.drop(
    columns=[
        'id', 'user_name', 'user_location', 'user_description', 'user_created',
        'user_followers', 'user_friends', 'user_favourites', 'user_verified',
        'date', 'hashtags', 'source', 'retweets', 'favorites',
        'is_retweet',
    ],
)
text_df.head()

In [None]:
# Print the first few tweets in full, each followed by a blank separator.
num_of_rows_to_print = 5

# head() stops at the end of the frame, so this also works when the dataset
# holds fewer rows than requested (positional .iloc[i] over range(n) would
# raise IndexError in that case).
for tweet in text_df['text'].head(num_of_rows_to_print):
    print(tweet, '\n')

In [None]:
# Summary of the reduced DataFrame before any text cleaning is applied.
text_df.info()

In [None]:
# # Data processing
# def data_processing(text):
#     text = text.lower()
#     text = re.sub(r"https\S+|www\S+", '', text, flags=re.MULTILINE)
#     text = re.sub(r"\@\w+|\#", '', text)
#     text = re.sub(r"[^\w\s]", '', text)
#     text_tokens = word_tokenize(text)
#     stop_words = set(stopwords.words('english'))
#     filtered_text = [w for w in text_tokens if not w in stop_words]
#     return " ".join(filtered_text)  # Join tokens with spaces


In [None]:
# Define the data processing function
def data_processing(text):
    """Normalise a raw tweet into a space-separated string of content words.

    Steps, in order: lowercase the text, strip URLs, strip @mentions and
    '#' signs (the hashtag word itself is kept), strip the remaining
    punctuation, tokenise with NLTK and drop English stop words.

    Args:
        text: The raw tweet text (a ``str``).

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        every token is filtered out.
    """
    text = text.lower()
    # "http\S+" covers both http:// and https:// links; the earlier
    # "https\S+" pattern left plain-http URLs in the text.
    text = re.sub(r"http\S+|www\S+", '', text)
    text = re.sub(r"@\w+|#", '', text)
    text = re.sub(r"[^\w\s]", '', text)
    tokens = word_tokenize(text)
    # Uses the module-level `stop_words` set created next to the NLTK imports
    # instead of rebuilding the same set on every call.
    return " ".join(token for token in tokens if token not in stop_words)

# Stemming function
stemmer = PorterStemmer()


def stemming(text):
    """Return ``text`` with each whitespace-separated word replaced by its Porter stem.

    The stemmed words are re-joined with single spaces.
    """
    return " ".join(stemmer.stem(token) for token in text.split())


In [None]:
# Clean every tweet with data_processing, then stem the cleaned words;
# the 'text' column is overwritten with the result.
text_df['text'] = text_df['text'].apply(data_processing).apply(stemming)

# Print the first few rows to check the changes
print(text_df.head())

In [None]:
# Summary of the DataFrame after the text column has been cleaned and stemmed.
text_df.info()

In [None]:
# Polarity score of a piece of text
def polarity(text):
    """Return the sentiment polarity that TextBlob reports for ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [None]:
# Score every tweet with polarity() and store the result in a new 'polarity' column.
# NOTE(review): at this point 'text' holds cleaned *and stemmed* words; TextBlob
# presumably scores against unstemmed vocabulary, so stems may go unrecognised —
# confirm whether polarity should be computed before stemming.
text_df['polarity'] = text_df['text'].apply(polarity)

In [None]:
# Preview the first ten rows, now including the polarity column.
text_df.head(10)

In [None]:
# Adding the sentiment column to a dataframe 
def sentiment(label):
    """Translate a polarity score into a sentiment class name.

    Returns "positive" for a score above zero, "neutral" for a score equal
    to zero and "negative" for a score below zero. A value for which none of
    the three comparisons holds (for example NaN) yields None.
    """
    if label > 0:
        return "positive"
    if label == 0:
        return "neutral"
    if label < 0:
        return "negative"
    return None
# Label every tweet from its polarity score and store it in a new 'sentiment' column.
text_df["sentiment"] = text_df["polarity"].apply(sentiment)
# Preview the frame with the new column.
text_df.head()

In [None]:
# Bar chart of the number of tweets in each sentiment class.
fig = plt.figure(figsize=(5, 5))
sns.countplot(data=text_df, x="sentiment")


In [None]:
# Pie chart of the share of each sentiment class.
fig = plt.figure(figsize=(7, 7))
tags = text_df["sentiment"].value_counts()

# value_counts() orders the classes by frequency, so colours are looked up by
# label; a positional colour tuple would paint whichever class happens to be
# the most common in green.
sentiment_colors = {"positive": "yellowgreen", "neutral": "gold", "negative": "red"}
colors = [sentiment_colors.get(label, "lightgrey") for label in tags.index]
wp = {"linewidth": 2, "edgecolor": "black"}
# One offset per wedge actually present: a fixed three-element tuple no longer
# matches the data when a sentiment class is missing.
explode = [0.1] * len(tags)

tags.plot(kind="pie", autopct="%1.1f%%", shadow=True, colors=colors,
          startangle=90, wedgeprops=wp, explode=explode, label='')
plt.title("Distribution of Sentiments")

In [None]:
# Positive tweets only, ordered from the highest polarity downwards.
post_tweets = text_df.loc[text_df["sentiment"] == 'positive'].sort_values(
    by='polarity', ascending=False
)
post_tweets.head()

In [None]:
# Creating a word cloud of the positive tweets

# Join the tweets with a space: joining on '' glued the last word of each
# tweet to the first word of the next one, producing words that never
# occurred in the data.
text = ' '.join(post_tweets['text'])
plt.figure(figsize=(20,15),facecolor='None')
wordcloud = WordCloud(max_words=500,width= 1600 ,height=800).generate(text)
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis("off")
plt.title("Most frequent words in positive tweets: ",fontsize=19)
plt.show()


In [None]:
# Negative tweets only. The filter previously selected 'positive' (copied from
# the positive-tweets cell), so every "negative" result was in fact positive.
neg_tweets = text_df[text_df.sentiment == 'negative']
# Ascending order puts the most negative polarity first, mirroring the
# positive cell, which lists its strongest tweets first.
neg_tweets = neg_tweets.sort_values(['polarity'], ascending=True)
neg_tweets.head()

In [None]:
# Creating a word cloud of the tweets held in neg_tweets

# Join the tweets with a space: joining on '' glued the last word of each
# tweet to the first word of the next one, producing words that never
# occurred in the data.
text = ' '.join(neg_tweets['text'])
plt.figure(figsize=(20,15),facecolor='None')
wordcloud = WordCloud(max_words=500,width= 1600 ,height=800).generate(text)
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis("off")
plt.title("Most frequent words in negative tweets: ",fontsize=19)
plt.show()


In [None]:
# Bag-of-words vectorisation of the tweets. ngram_range=(1, 2) builds the
# vocabulary from single words (unigrams) and pairs of adjacent words (bigrams).
# NOTE(review): the vocabulary is fitted on every tweet, before the train/test
# split made further down — consider fitting on the training split only.
vect = CountVectorizer(ngram_range=(1, 2))
vect.fit(text_df["text"])

# Extracting feature names
feature_names = vect.get_feature_names_out()
print("Number of features: {}\n".format(len(feature_names)))
print("First twenty features: {}\n".format(feature_names[:20]))



In [None]:
# Target: the sentiment label of each tweet.
y = text_df["sentiment"]

# Features: the n-gram count matrix produced by the fitted vectoriser.
x = vect.transform(text_df["text"])

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Showing the size of each of the four splits
print("The size of x_train:", x_train.shape)
print("The size of x_test:", x_test.shape)
print("The size of y_train:", y_train.shape)
print("The size of y_test:", y_test.shape)

In [None]:
# arr_data = ['x_train','x_test','y_train','y_test']
# for name, data in zip(arr_data, arr_data):
#     print(f"Size of {name}: {data.shape if hasattr(data, 'shape') else len(data)}")

In [None]:
# Fitting the models
# Logistic regression model with scikit-learn's default settings

Logreg = LogisticRegression()
Logreg.fit(x_train, y_train)
Logreg_pred = Logreg.predict(x_test)

# Calculating the accuracy on the held-out test split.
# accuracy_score is declared as (y_true, y_pred); the arguments were passed
# the other way round, which is easy to copy into metrics where order matters.
Logreg_acc = accuracy_score(y_test, Logreg_pred)
print("The accuracy score: {:.2f}%".format(Logreg_acc*100))

In [None]:
# Silence all warnings raised from this point on
import warnings
warnings.simplefilter('ignore')

In [None]:
# Confusion matrix, a blank gap, then the classification report for the
# logistic regression predictions.
print(
    confusion_matrix(y_test, Logreg_pred),
    '\n',
    classification_report(y_test, Logreg_pred),
    sep='\n',
)

In [None]:
# Plot the confusion matrix with scikit-learn's ConfusionMatrixDisplay
style.use('classic')
cm = confusion_matrix(y_test,Logreg_pred,labels=Logreg.classes_)
# Deliberately not named `display`: in a notebook that name is IPython's
# display() helper, and rebinding it breaks later display(...) calls.
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=Logreg.classes_)
cm_display.plot()

In [None]:
# Performing hyper-parameter tuning
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate values for the logistic regression parameter C
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}

# Search the candidates with GridSearchCV's default cross-validation settings.
grid = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid)
grid.fit(x_train, y_train)


In [None]:
# Printing the parameter combination selected by the grid search
print("The best parameters: ",grid.best_params_)

In [None]:
# Predict the test split with the grid search's selected model
y_pred1 = grid.predict(x_test)

# Logistic regression accuracy after tuning.
# accuracy_score is declared as (y_true, y_pred); the arguments were passed
# the other way round.
log_reg_acc = accuracy_score(y_test, y_pred1)
print("Test accuracy is : {:.2f}%".format(log_reg_acc*100))

In [None]:
# Confusion matrix, a blank gap, then the classification report for the
# grid-search predictions.
print(
    confusion_matrix(y_test, y_pred1),
    '\n',
    classification_report(y_test, y_pred1),
    sep='\n',
)

In [None]:
# Plot the confusion matrix of the grid-search predictions
style.use('classic')
# Take the label order from the model being evaluated (the fitted grid search)
# rather than from the separately trained Logreg estimator.
cm = confusion_matrix(y_test,y_pred1,labels=grid.classes_)
# Deliberately not named `display`: in a notebook that name is IPython's
# display() helper, and rebinding it breaks later display(...) calls.
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=grid.classes_)
cm_display.plot()

In [None]:
# Train a linear support vector classifier with default settings on the
# training split.
from sklearn.svm import LinearSVC
svc_model = LinearSVC()
svc_model.fit(x_train,y_train)

In [None]:
# Accuracy of the linear SVC on the held-out test split
svc_pred = svc_model.predict(x_test)
svc_acc = accuracy_score(y_test, svc_pred)
# Report the score that was just computed. This cell used to print
# grid.best_params_, which belongs to the logistic regression grid search and
# says nothing about svc_model, while svc_acc was never shown.
print("SVC test accuracy is : {:.2f}%".format(svc_acc*100))

In [None]:
# Confusion matrix, a blank gap, then the classification report for the
# linear SVC predictions.
print(
    confusion_matrix(y_test, svc_pred),
    '\n',
    classification_report(y_test, svc_pred),
    sep='\n',
)

In [None]:
# Plot the confusion matrix of the linear SVC predictions
style.use('classic')
# Take the label order from the model being evaluated (svc_model) rather than
# from the separately trained Logreg estimator.
cm = confusion_matrix(y_test,svc_pred,labels=svc_model.classes_)
# Deliberately not named `display`: in a notebook that name is IPython's
# display() helper, and rebinding it breaks later display(...) calls.
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=svc_model.classes_)
cm_display.plot()

In [None]:
# Hyper-parameter search for a kernel SVM.
# The kernel / degree / gamma settings below are parameters of sklearn.svm.SVC;
# the LinearSVC held in svc_model does not accept them, so the search uses SVC.
from sklearn.svm import SVC

# A list of grids pairs each kernel only with the parameters it reads
# (degree matters only for 'poly'; 'linear' uses neither degree nor gamma),
# which avoids refitting identical models.
svc_param_grid = [
    {'kernel': ['linear'], 'C': [0.01, 0.1, 1, 10]},
    {'kernel': ['rbf', 'sigmoid'], 'C': [0.01, 0.1, 1, 10], 'gamma': [0.01, 1]},
    {'kernel': ['poly'], 'C': [0.01, 0.1, 1, 10], 'degree': [1, 3, 5, 7],
     'gamma': [0.01, 1]},
]

# Previously the grid dict was bound to `grid` and immediately overwritten,
# while GridSearchCV received `param_grid` (the logistic regression C values),
# so none of the settings defined in this cell were ever searched.
# NOTE(review): this is many SVC fits per cross-validation fold and may be slow
# on the full n-gram matrix — trim the grid if runtime is a concern.
grid = GridSearchCV(SVC(), svc_param_grid)
grid.fit(x_train,y_train)