In [66]:
# Install Libraries
!pip install textblob
!pip install tweepy




In [67]:
# utilities
import re
import numpy as np
import pandas as pd
import re
import string
# plotting
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from matplotlib import ticker
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
from sqlalchemy import create_engine
import psycopg2
from config import db_password
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
# nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
nltk.download('stopwords')
nltk.download('vader_lexicon')
# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
# Global Parameters
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prave\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\prave\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [68]:
# Create engine to connect and store in SQL database
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/tweet_project"
engine = create_engine(db_string)
session = Session(engine)


In [69]:
tweets_df= pd.read_sql_query('''SELECT * FROM tweets;''', engine)
tweets_df.head()

Unnamed: 0,user_name,user_location,user_verified,date,text,hashtags,is_retweet
0,MyNewsNE,Assam,False,2020-08-18 12:55:00,Australia to Manufacture Covid-19 Vaccine and ...,['CovidVaccine'],False
1,Ann-Maree O’Connor,"Adelaide, South Australia",False,2020-08-18 12:45:00,@michellegrattan @ConversationEDU This is what...,,False
2,Rajesh Tadepalli,"Hyderabad, India",False,2020-08-18 12:34:00,@PrivilRodrigues @yatish57 @deepkaranahuja @sh...,,False
3,AKisASocialisolationist wash yer damn hands,The Great Pacific Northwest,False,2020-08-18 12:30:00,"@MSNBC Well, let’s qualify that: would anyone ...",['CovidVaccine'],False
4,Dr. Joseph Santoro,"Washington, DC 20009",False,2020-08-18 12:15:00,"Most countries, without the ability to make #V...",['Vaccines'],False


In [70]:
def clean_tweet(temp):
    temp = str(temp).lower()
    temp = re.sub("'", "", str(temp)) # to avoid removing contractions in english
    temp = re.sub("@[A-Za-z0-9_]+","", str(temp))
    temp = re.sub("#[A-Za-z0-9_]+","", str(temp))
    temp = re.sub(r"www.\S+", "", temp)
    temp = re.sub(r"http\S+", "", temp)
    temp = re.sub('[()!?]', ' ', temp)
    temp = re.sub('\[.*?\]',' ', temp)
    temp = re.sub("[^a-z0-9]"," ", temp)
    temp = temp.split()
    stopwords =  stop_words
    temp = [w for w in temp if not w in stopwords]
    temp = " ".join(word for word in temp)
    return temp
tweets_df['text'] = tweets_df['text'].map(lambda x: clean_tweet(x))
tweets_df.head()

Unnamed: 0,user_name,user_location,user_verified,date,text,hashtags,is_retweet
0,MyNewsNE,Assam,False,2020-08-18 12:55:00,australia manufacture covid 19 vaccine give ci...,['CovidVaccine'],False
1,Ann-Maree O’Connor,"Adelaide, South Australia",False,2020-08-18 12:45:00,passes leadership country voucher something w,,False
2,Rajesh Tadepalli,"Hyderabad, India",False,2020-08-18 12:34:00,,,False
3,AKisASocialisolationist wash yer damn hands,The Great Pacific Northwest,False,2020-08-18 12:30:00,well let qualify would anyone party get vaccin...,['CovidVaccine'],False
4,Dr. Joseph Santoro,"Washington, DC 20009",False,2020-08-18 12:15:00,countries without ability make locally forced ...,['Vaccines'],False


In [71]:
tokenized_tweets = tweets_df['text'].apply(lambda x: x.split())
tokenized_tweets.head()

0    [australia, manufacture, covid, 19, vaccine, g...
1    [passes, leadership, country, voucher, somethi...
2                                                   []
3    [well, let, qualify, would, anyone, party, get...
4    [countries, without, ability, make, locally, f...
Name: text, dtype: object

In [72]:
#Stemming is a rule-based process of stripping the suffixes (“ing”, “ly”, “es”, “s” etc) from a word. 
stemmer = PorterStemmer()
tokenized_tweets = tokenized_tweets.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
tweets_df['tokenized']= tokenized_tweets
tweets_df.head()

Unnamed: 0,user_name,user_location,user_verified,date,text,hashtags,is_retweet,tokenized
0,MyNewsNE,Assam,False,2020-08-18 12:55:00,australia manufacture covid 19 vaccine give ci...,['CovidVaccine'],False,"[australia, manufactur, covid, 19, vaccin, giv..."
1,Ann-Maree O’Connor,"Adelaide, South Australia",False,2020-08-18 12:45:00,passes leadership country voucher something w,,False,"[pass, leadership, countri, voucher, someth, w]"
2,Rajesh Tadepalli,"Hyderabad, India",False,2020-08-18 12:34:00,,,False,[]
3,AKisASocialisolationist wash yer damn hands,The Great Pacific Northwest,False,2020-08-18 12:30:00,well let qualify would anyone party get vaccin...,['CovidVaccine'],False,"[well, let, qualifi, would, anyon, parti, get,..."
4,Dr. Joseph Santoro,"Washington, DC 20009",False,2020-08-18 12:15:00,countries without ability make locally forced ...,['Vaccines'],False,"[countri, without, abil, make, local, forc, re..."


In [73]:
from textblob import TextBlob

def get_subjectivity(text):
    return TextBlob(text).sentiment.subjectivity

def get_polarity(text):
    return TextBlob(text).sentiment.polarity

def get_sentiment(score):
    if score > 0:
        return 'Positive'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Negative'

In [74]:
tweets_df['subjectivity'] = tweets_df['text'].apply(get_subjectivity)
tweets_df['polarity'] = tweets_df['text'].apply(get_polarity)
tweets_df['label'] = tweets_df['polarity'].apply(get_sentiment)
tweets_df.head(10)

Unnamed: 0,user_name,user_location,user_verified,date,text,hashtags,is_retweet,tokenized,subjectivity,polarity,label
0,MyNewsNE,Assam,False,2020-08-18 12:55:00,australia manufacture covid 19 vaccine give ci...,['CovidVaccine'],False,"[australia, manufactur, covid, 19, vaccin, giv...",0.8,0.4,Positive
1,Ann-Maree O’Connor,"Adelaide, South Australia",False,2020-08-18 12:45:00,passes leadership country voucher something w,,False,"[pass, leadership, countri, voucher, someth, w]",0.0,0.0,Neutral
2,Rajesh Tadepalli,"Hyderabad, India",False,2020-08-18 12:34:00,,,False,[],0.0,0.0,Neutral
3,AKisASocialisolationist wash yer damn hands,The Great Pacific Northwest,False,2020-08-18 12:30:00,well let qualify would anyone party get vaccin...,['CovidVaccine'],False,"[well, let, qualifi, would, anyon, parti, get,...",0.6,-0.1,Negative
4,Dr. Joseph Santoro,"Washington, DC 20009",False,2020-08-18 12:15:00,countries without ability make locally forced ...,['Vaccines'],False,"[countri, without, abil, make, local, forc, re...",0.2,-0.3,Negative
5,VUMC OAP,"Nashville, TN",False,2020-08-18 11:57:00,zooms charts 1st week hear episode,"['DNA', 'vaccines', 'pandemic', 'COVID19', 'Co...",False,"[zoom, chart, 1st, week, hear, episod]",0.0,0.0,Neutral
6,HrNxt.com,India,False,2020-08-18 11:12:00,biocon executive chairperson kiran mazumdar sh...,,False,"[biocon, execut, chairperson, kiran, mazumdar,...",0.0,0.0,Neutral
7,Mohammadali Naseri,TEHRAN,False,2020-08-18 11:04:00,,"['Covid19Millionares', 'covid19', 'corona', 'C...",False,[],0.0,0.0,Neutral
8,LabTwin - Voice & AI-powered digital lab assis...,"Berlin, Germany",False,2020-08-18 11:02:00,great news vaccine entered phase 3 trial read,['Pharmaceutical'],False,"[great, news, vaccin, enter, phase, 3, trial, ...",0.75,0.8,Positive
9,BioDrivers,"Surat, Gujarat",False,2020-08-18 10:46:00,dangerous yet come d614,"['CovidVaccine', 'Corona', 'Immunization', 'Co...",False,"[danger, yet, come, d614]",0.9,-0.6,Negative


In [79]:
tweets_chart = tweets_df[['user_name', 'date', 'label']].groupby(['date', 'label']).count().reset_index()
tweets_chart.columns = ['date', 'label', 'counts']
tweets_chart.head()

Unnamed: 0,date,label,counts
0,2020-01-09 00:09:00,Positive,1
1,2020-01-09 00:49:00,Neutral,1
2,2020-01-09 01:04:00,Neutral,1
3,2020-01-09 01:09:00,Neutral,1
4,2020-01-09 01:28:00,Neutral,1


In [82]:
tweets_df.head()

Unnamed: 0,user_name,user_location,user_verified,date,text,hashtags,is_retweet,tokenized,subjectivity,polarity,label
0,MyNewsNE,Assam,False,2020-08-18 12:55:00,australia manufacture covid 19 vaccine give ci...,['CovidVaccine'],False,"[australia, manufactur, covid, 19, vaccin, giv...",0.8,0.4,Positive
1,Ann-Maree O’Connor,"Adelaide, South Australia",False,2020-08-18 12:45:00,passes leadership country voucher something w,,False,"[pass, leadership, countri, voucher, someth, w]",0.0,0.0,Neutral
2,Rajesh Tadepalli,"Hyderabad, India",False,2020-08-18 12:34:00,,,False,[],0.0,0.0,Neutral
3,AKisASocialisolationist wash yer damn hands,The Great Pacific Northwest,False,2020-08-18 12:30:00,well let qualify would anyone party get vaccin...,['CovidVaccine'],False,"[well, let, qualifi, would, anyon, parti, get,...",0.6,-0.1,Negative
4,Dr. Joseph Santoro,"Washington, DC 20009",False,2020-08-18 12:15:00,countries without ability make locally forced ...,['Vaccines'],False,"[countri, without, abil, make, local, forc, re...",0.2,-0.3,Negative


In [83]:
col = ['user_name','user_location','label']
tweets_new_df= tweets_df[col]
tweets_new_df.head()

Unnamed: 0,user_name,user_location,label
0,MyNewsNE,Assam,Positive
1,Ann-Maree O’Connor,"Adelaide, South Australia",Neutral
2,Rajesh Tadepalli,"Hyderabad, India",Neutral
3,AKisASocialisolationist wash yer damn hands,The Great Pacific Northwest,Negative
4,Dr. Joseph Santoro,"Washington, DC 20009",Negative


In [84]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
tweets_new_df['label'] = le.fit_transform(tweets_new_df['label'])
tweets_new_df.head()

Unnamed: 0,user_name,user_location,label
0,MyNewsNE,Assam,2
1,Ann-Maree O’Connor,"Adelaide, South Australia",1
2,Rajesh Tadepalli,"Hyderabad, India",1
3,AKisASocialisolationist wash yer damn hands,The Great Pacific Northwest,0
4,Dr. Joseph Santoro,"Washington, DC 20009",0


In [93]:
# Create our features
X = tweets_df
X = X.drop("label", axis=1)

y = tweets_df["label"]

In [94]:
tweets_df.dtypes


user_name                object
user_location            object
user_verified            object
date             datetime64[ns]
text                     object
hashtags                 object
is_retweet               object
tokenized                object
subjectivity            float64
polarity                float64
label                    object
dtype: object

In [95]:
y.dtypes


dtype('O')

In [96]:
SentimentText = tweets_df['text']
sentiment_text_list = SentimentText
textfile = open("C:/Users/prave/Module-20/covidvaccine.txt", "w")
for element in sentiment_text_list:
        textfile.write(element + "\n")
textfile.close()

In [97]:
#Vectorizer 1: Finding the unigram representation
from sklearn.feature_extraction.text import CountVectorizer
vectorizer=CountVectorizer()

In [98]:
# fitting the vectorizer
X=vectorizer.fit_transform(SentimentText)

In [99]:
# getting the target values i.e wheather the tweets are positive or negative
train_data = tweets_df
y = train_data['label']

In [100]:
X.shape

(437326, 93082)

In [101]:
y.shape

(437326,)

In [102]:
col = ['user_name','user_location','label']
tweet_new_df= tweets_df[col]
tweet_new_df.head()

Unnamed: 0,user_name,user_location,label
0,MyNewsNE,Assam,Positive
1,Ann-Maree O’Connor,"Adelaide, South Australia",Neutral
2,Rajesh Tadepalli,"Hyderabad, India",Neutral
3,AKisASocialisolationist wash yer damn hands,The Great Pacific Northwest,Negative
4,Dr. Joseph Santoro,"Washington, DC 20009",Negative


In [103]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [104]:
# Resample the training data with the BalancedRandomForestClassifier
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'Positive': 148246, 'Neutral': 148246, 'Negative': 148246})

In [105]:
# Create our features
X = tweets_df
X = X.drop("label", axis=1)
y = tweets_df["label"]

In [106]:
X.dtypes

user_name                object
user_location            object
user_verified            object
date             datetime64[ns]
text                     object
hashtags                 object
is_retweet               object
tokenized                object
subjectivity            float64
polarity                float64
dtype: object

In [107]:
y.dtypes

dtype('O')

In [108]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y, random_state=1, stratify=y)

In [109]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='lbfgs',
     random_state=1)


In [110]:
clf.fit(X_train, y_train)

ValueError: could not convert string to float: 'BatMuzzy'

In [54]:
y_pred = clf.predict(X_test)

NotFittedError: This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [66]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9823107598873158


In [67]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
   intercept_scaling=1, max_iter=100, multi_class='warn', penalty='12',
   random_state=1, solver='lbfgs', warm_start=False)

LogisticRegression(multi_class='warn', penalty='12', random_state=1)

In [46]:
# Calculate the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score

y_pred = clf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9718407282995601

In [47]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[13039,   294,   564],
       [  168, 48691,   229],
       [  427,   252, 45668]], dtype=int64)

In [48]:
# Print the imbalanced classification report

from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

   Negative       0.96      0.94      0.99      0.95      0.97      0.93     13897
    Neutral       0.99      0.99      0.99      0.99      0.99      0.98     49088
   Positive       0.98      0.99      0.99      0.98      0.99      0.97     46347

avg / total       0.98      0.98      0.99      0.98      0.99      0.97    109332



In [18]:
X=tweets_df
y = tweets_df["label"].values
y[:5]

array(['Positive', 'Neutral', 'Neutral', 'Negative', 'Negative'],
      dtype=object)

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=1)

# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

ValueError: could not convert string to float: 'Your Kensington/ChinaTown Officer'

In [74]:
X=tweets_new_df.drop(columns="label")

In [75]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y,  random_state=1, stratify=y)
X_train.shape

(327994, 1)

In [84]:
from sklearn.svm import SVC
tweets_new_df = SVC(kernel='linear')

In [88]:
tweets_new_df.fit(X_train, y_train)

AttributeError: 'DataFrame' object has no attribute 'fit'