# Machine Learning Model

### Libraries

In [None]:
# Import all dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import re
import nltk
import nltk.corpus
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer
from sqlalchemy import create_engine
import psycopg2
from config import db_password
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

# Text Polarity
from textblob import TextBlob

# Text Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Word Cloud
from wordcloud import WordCloud
# Label Encoding
from sklearn.preprocessing import LabelEncoder

# TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Resampling
from imblearn.over_sampling import SMOTE
from collections import Counter

# Splitting Dataset
from sklearn.model_selection import train_test_split

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

# Model Metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


### Import Data from Database

In [2]:
# Connect to database
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/Twitter_Project"
engine = create_engine(db_string)
session = Session(engine)

In [3]:
# Import table into notebook
nlp_tweets=pd.read_sql_query('''SELECT * FROM "NLP_tweets";''', engine)
nlp_tweets

Unnamed: 0,user_name,date,text,sentiment
0,95210,2020-08-18,australia manufacture covid19 vaccine give cit...,positive
1,11636,2020-08-18,michellegrattan conversationedu passes leaders...,neutral
2,110933,2020-08-18,privilrodrigues yatish57 deepkaranahuja shrist...,neutral
3,2908,2020-08-18,msnbc well let’s qualify would anyone party ge...,positive
4,40578,2020-08-18,countries without ability make vaccines locall...,negative
...,...,...,...,...
437121,142258,2021-11-15,45 urban bengaluru covidvaccine availability 1...,
437122,142258,2021-11-15,1844 bbmp bengaluru covidvaccine availability ...,
437123,142258,2021-11-15,1844 urban bengaluru covidvaccine availability...,
437124,50505,2021-11-15,promote vaccines leaving stronger russia vacci...,


### Feature Engineering and Data Pre-processing

In [4]:
# Drop unwanted columns
nlp_tweets= nlp_tweets.drop(columns=["user_name", "date"])
nlp_tweets

Unnamed: 0,text,sentiment
0,australia manufacture covid19 vaccine give cit...,positive
1,michellegrattan conversationedu passes leaders...,neutral
2,privilrodrigues yatish57 deepkaranahuja shrist...,neutral
3,msnbc well let’s qualify would anyone party ge...,positive
4,countries without ability make vaccines locall...,negative
...,...,...
437121,45 urban bengaluru covidvaccine availability 1...,
437122,1844 bbmp bengaluru covidvaccine availability ...,
437123,1844 urban bengaluru covidvaccine availability...,
437124,promote vaccines leaving stronger russia vacci...,


In [5]:
# Value Counts
nlp_tweets["sentiment"].value_counts()

positive    192753
neutral     165231
negative     78942
Name: sentiment, dtype: int64

In [6]:
# Encode Target Variable
Encoder = LabelEncoder()
nlp_tweets["sentiment"] = Encoder.fit_transform(nlp_tweets["sentiment"])

In [7]:
# Rename Column
nlp_tweets.rename(columns = {"sentiment":"label"}, inplace = True)

In [8]:
nlp_tweets["label"].value_counts()

2    192753
1    165231
0     78942
3       200
Name: label, dtype: int64

In [9]:
nlp_tweets

Unnamed: 0,text,label
0,australia manufacture covid19 vaccine give cit...,2
1,michellegrattan conversationedu passes leaders...,1
2,privilrodrigues yatish57 deepkaranahuja shrist...,1
3,msnbc well let’s qualify would anyone party ge...,2
4,countries without ability make vaccines locall...,0
...,...,...
437121,45 urban bengaluru covidvaccine availability 1...,3
437122,1844 bbmp bengaluru covidvaccine availability ...,3
437123,1844 urban bengaluru covidvaccine availability...,3
437124,promote vaccines leaving stronger russia vacci...,3


#### 0= Negative, 1= Neutral, 2= Positive, 3= None

In [13]:
# Exporting Table to Database

# Save as CSV just in case
nlp_tweets.to_csv("encoded_data.csv", index=True, header=True)

# Create engine to connect and store in SQL database
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/Twitter_Project"
engine = create_engine(db_string)
nlp_tweets.to_sql(name='encoded_data', con=engine, method="multi")

In [10]:
# Drop the "None" rows
nlp_tweets = nlp_tweets.drop(nlp_tweets[nlp_tweets.label==3].index)
nlp_tweets

Unnamed: 0,text,label
0,australia manufacture covid19 vaccine give cit...,2
1,michellegrattan conversationedu passes leaders...,1
2,privilrodrigues yatish57 deepkaranahuja shrist...,1
3,msnbc well let’s qualify would anyone party ge...,2
4,countries without ability make vaccines locall...,0
...,...,...
436921,45 urban bengaluru covidvaccine availability 2...,2
436922,1844 bbmp bengaluru covidvaccine availability ...,2
436923,1844 urban bengaluru covidvaccine availability...,2
436924,45 urban bengaluru covidvaccine availability 2...,2


In [11]:
nlp_tweets["label"].value_counts()

2    192753
1    165231
0     78942
Name: label, dtype: int64

In [12]:
# Defining the vectorizer with total words of 10,000 and with bigram model
TF_IDF = TfidfVectorizer(max_features = 10000, ngram_range = (2, 2))

# Fitting and transforming the tweets into a matrix of weighed words
X = TF_IDF.fit_transform(nlp_tweets["text"])

# Check matrix shape
X.shape

(436926, 10000)

In [13]:
# Declare the target variable
y = nlp_tweets["label"]

### Resampling the dataset
*There are comparatively less negative tweets than there are positive and neutral ones. SMOTE (Synthetic Minority Oversampling Technique) will be used to balance the data.*

In [14]:
# Before SMOTE resampling
Counter(y)

Counter({2: 192753, 1: 165231, 0: 78942})

In [15]:
# Resampling 
Balancer = SMOTE(random_state = 42)
X_final, y_final = Balancer.fit_resample(X, y)

In [16]:
# After SMOTE resampling
Counter(y_final)

Counter({2: 192753, 1: 192753, 0: 192753})

### Splitting the data into train and test

*A 70-30 split will be used*

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.3, random_state = 42)

## Model Selection and Evaluation

*A few different models will be tested for accuracy. Naive-Bayes was originally selected, as it is a fairly reliable classifier.*

### Logistic Regression

In [22]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs', random_state=42, max_iter=10000)
logreg.fit(X_final, y_final)

LogisticRegression(max_iter=10000, random_state=42)

In [23]:
# Predictions
y_pred = logreg.predict(X_test)

In [24]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.43750177708512705

In [25]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[33028 12339 12338]
 [21737 22285 13754]
 [22807 14634 20556]]


### Naive-Bayes

In [27]:
# Train the Naive-Bayes model using the resampled data
naive_b = BernoulliNB()
naive_b.fit(X_final, y_final)

BernoulliNB()

In [28]:
# Predictions
y_pred = naive_b.predict(X_test)

In [29]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.41326234194923295

In [30]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[29120 10034 18551]
 [19025 17666 21085]
 [21198 11896 24903]]


### Balanced Random Forest Classifier

In [None]:
# Train the Random Forest Classifier using the resampled data
from imblearn.ensemble import BalancedRandomForestClassifier
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# Fitting the model
brf_model = brf_model.fit(X_final, y_final)

# Evaluate the model
y_pred = brf_model.predict(X_final, y_final)

In [None]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

### Multinomial Naive-Bayes

In [None]:
# Train the Multinomial Naive-Bayes using the resampled data
mnnb = MultinomialNB()

# Fitting the model
mnnb = MultinomialNB.fit(X_final, y_final)

# Evaluate the model
y_pred = mnnb.predict(X_final, y_final)

In [None]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)