# Data Class

In [1]:
import random

class Sentiment:
    POSITIVE = "POSITIVE"
    NEGATIVE = "NEGATIVE" 

class Data:
    def __init__(self, tweet, replies, retweets, likes):
        self.tweet = tweet
        self.replies = replies
        self.retweets = retweets
        self.likes = likes
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.replies >= '200' or self.retweets >= '300' or self.likes >= '500':
            return Sentiment.POSITIVE
        else:
            return Sentiment.NEGATIVE
        
class DataContainer:
    def __init__(self, data):
        self.data = data
        
    def get_text(self):
        return [x.tweet for x in self.data]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.data]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.data))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.data))
        negative_shrunk = negative[:len(positive)]
        self.data = positive + negative_shrunk
        random.shuffle(self.data)

# Load Data

In [2]:
import csv

data = []
i = 1
faulty_rows = 0
max_replies = 0
max_retweets = 0
max_likes = 0

with open(r"final_data.csv", encoding = 'utf-8') as f:    
    csvReader = csv.reader(f, delimiter=',', quotechar='"')
    for row in csvReader:
        if (i > 2000000): #total tweets = 54243060 (faulty rows included)
            break
        if (len(row) == 4):
            if(row[1].isnumeric() and row[2].isnumeric() and row[3].isnumeric()):
                tweet = row[0]
                replies_count = row[1]
                retweets_count = row[2]
                likes_count = row[3]
                data.append(Data(tweet, replies_count, retweets_count, likes_count))
                #Find max replies, retweets, likes
                if (int(replies_count) > max_replies):
                    max_replies = int(replies_count)
                if (int(retweets_count) > max_retweets):
                    max_retweets = int(retweets_count)    
                if (int(likes_count) > max_likes):
                    max_likes = int(likes_count)
            else:
                print("Error: row " + str(i) + " was incorrect.")
                faulty_rows = faulty_rows + 1
        else:
            print("Error: row " + str(i) + " was not complete.")
            faulty_rows = faulty_rows + 1
        i = i + 1
f.close()

print(data[850].sentiment)
print(data[850].tweet)
print(f'\nData length: ' + str(len(data)))
print('Faulty rows: ' + str(faulty_rows))
print('Max replies: ' + str(max_replies))
print('Max retweets: ' + str(max_retweets))
print('Max likes: ' + str(max_likes))

Error: row 1512903 was not complete.
Error: row 1574085 was not complete.
Error: row 1668073 was not complete.
Error: row 1728665 was not complete.
Error: row 1790437 was not complete.
Error: row 1817089 was not complete.
Error: row 1875356 was not complete.
Error: row 1932282 was not complete.
Error: row 1967754 was not complete.
NEGATIVE
@monicaonairtalk We stopped at the mask. Before corona the Lord led me, showed me a plot to inoculate the world with a plant-based vaccine. My husband thought I was nuts, then the news of the “virus” came. Praise God he could see. We don’t know his status, but he hasn’t served since March 2020

Data length: 1999991
Faulty rows: 9
Max replies: 6382
Max retweets: 19778
Max likes: 81874


# Text Preprocessing

In [3]:
import re
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for i in range(0, len(data)):
    # Remove all the special characters (punctuation removal)
    document = re.sub(r'[^a-zA-Z0-9]', ' ', str(data[i].tweet))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    # Tokenization for stemming
    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    data[i].tweet = document

print(data[850].tweet)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lambr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lambr\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


monicaonairtalk we stopped at the mask before corona the lord led me showed me plot to inoculate the world with plant based vaccine my husband thought wa nut then the news of the virus came praise god he could see we don know his status but he hasn served since march 2020


# Prepare Data

In [4]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(data, test_size=0.2, random_state=42)

train_container = DataContainer(training)
test_container = DataContainer(test)

train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

143525
143525


# Bag of words vectorization to TFIDF values

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import numpy as np

vectorizer = TfidfVectorizer(max_features=1500, min_df=20, max_df=0.3, stop_words=stopwords.words('english'))

train_x_vectors = np.asarray(vectorizer.fit_transform(train_x).todense())
test_x_vectors = np.asarray(vectorizer.transform(test_x).todense())

print(test_x[0])
print(test_x_vectors[0])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lambr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


august5th 1 in last 24 hour pakistan coronavirus case increased by 5661 nationally reaching 1 053 660 derived from 62 462 test death increased by 60 current covid19 critical case are 192 recovery ha been of 1454 covid thursdaythoughts breakingnews corona ncoc http co eofebfrmjn
[0. 0. 0. ... 0. 0. 0.]


# Load model

In [6]:
import pickle

with open(r'sentiment_classifier_2000000_tweets.pkl', 'rb') as f:
    classification_model = pickle.load(f)

# Use model

In [10]:
test_set = ['Attention Covid is real! He affects our respiratory organs and system!']
new_test = vectorizer.transform(test_set).toarray()

print(classification_model)
print()
print()
print(classification_model[0].predict(new_test))
print(classification_model[1].predict(new_test))
print(classification_model[2].predict(new_test))
print(classification_model[3].predict(new_test))
print(classification_model[4].predict(new_test))
print(classification_model[5].predict(new_test))
print(classification_model[6].predict(new_test))

[LinearSVC(C=1, max_iter=10000, random_state=42), DecisionTreeClassifier(random_state=42, splitter='random'), KNeighborsClassifier(n_jobs=-1, weights='distance'), GaussianNB(), LogisticRegression(n_jobs=-1, random_state=42, solver='sag'), RandomForestClassifier(max_features='log2', n_estimators=200, n_jobs=-1,
                       random_state=42), MLPClassifier(max_iter=10, random_state=42, warm_start=True)]


['NEGATIVE']
['NEGATIVE']
['NEGATIVE']
['NEGATIVE']
['NEGATIVE']
['NEGATIVE']
['NEGATIVE']
