In [1]:
import pandas as pd
import numpy as np
import string
from string import punctuation

from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [3]:
#Storing the Data in DataFrames 
lockdown=pd.read_csv("/content/drive/MyDrive/ProjectPhase2/nyt_topic/nyt_topic_lockdown.csv")
masking=pd.read_csv("/content/drive/MyDrive/ProjectPhase2/nyt_topic/nyt_topic_masking.csv")
vaccination=pd.read_csv("/content/drive/MyDrive/ProjectPhase2/nyt_topic/nyt_topic_vaccination.csv")
secondary_vaccination=pd.read_csv("/content/drive/MyDrive/ProjectPhase2/changeorg_topic/changeorg_topic_vaccination.csv")

In [4]:
lockdownTF=lockdown['lockdown'].tolist()
counterT=0
counterF=0
for i in lockdownTF:
  if i == True:
    counterT+=1
  else:
    counterF+=1

print(counterT)
print(counterF)

81
732


In [5]:
maskingTF=masking['masking'].tolist()
counterT=0
counterF=0
for i in maskingTF:
  if i == True:
    counterT+=1
  else:
    counterF+=1

print(counterT)
print(counterF)

218
595


In [6]:
vaccinationTF=vaccination['vaccination'].tolist()
counterT=0
counterF=0
for i in vaccinationTF:
  if i == True:
    counterT+=1
  else:
    counterF+=1

print(counterT)
print(counterF)

477
336


In [7]:
vaccination

Unnamed: 0,text,vaccination
0,This was predicted in the Spring,False
1,Insist on vaccinations or make them stay home....,True
2,If employers can make a drug test mandatory as...,True
3,Dozens to hundreds of people crowding together...,False
4,If people were bleeding out their eyes and ear...,True
...,...,...
808,I went out for lunch yesterday with a friend. ...,False
809,"Even a year and a half later, I don't understa...",True
810,Let me get this straight. Vaccinated people ar...,True
811,I have not suffered from a cold or the flu for...,False


1) DATA PARTITIONING

---



In [8]:
#Partitioning the Primary Data into Train (70%) and Test (30%)
statement_train,statement_test,label_train,label_test=train_test_split(vaccination.text,vaccination.vaccination,train_size=0.7,random_state=50)

In [9]:
#Considering whole Secondary Data as Test Data
secondary_statement_test,secondary_label_test=secondary_vaccination.text,secondary_vaccination.vaccination

In [10]:
# type(statement_test)
# type(secondary_label_test)
# label_test
# label_train

2) BASELINE MODEL TRAINING

---



In [11]:
# Baseline Model Training with BOW
count_vect = CountVectorizer()
statement_train_final=count_vect.fit_transform(statement_train)
#print(len(count_vect.vocabulary_))
statement_test_final=count_vect.transform(statement_test)
#print(len(count_vect.vocabulary_))
secondary_statement_test_final=count_vect.transform(secondary_statement_test)

In [12]:
#statement_train_final.shape
#statement_test_final
#len(count_vect.get_feature_names())
#type(statement_train_final)

3) Model Evaluation 1

---



In [13]:
from sklearn.linear_model import LogisticRegression
statement_train_final_array=statement_train_final.toarray()
label_train_final_array=label_train


In [14]:
lr_model = LogisticRegression().fit(statement_train_final_array,label_train_final_array)
predicted_primary = lr_model.predict(statement_test_final.toarray())
# type(statement_train_final_array)

In [15]:
#print(statement_test)
#predicted_primary

In [16]:
np.mean(predicted_primary==label_test)

0.9139344262295082

In [17]:
secondary_statement_test_final_array=secondary_statement_test_final.toarray()
predicted_secondary=lr_model.predict(secondary_statement_test_final_array)

In [18]:
np.mean(predicted_secondary==secondary_label_test)

0.9866666666666667

Accuracy of Primary Dataset : 91.39344262295082 %                       


Accuracy of Secondary Dataset : 98.66666666666667 %


In [19]:
#statement_train

4) Feature Engineering                                                

---

i) Data Cleaning + lemmatization ii)TF-IDF iii) regular Expression ( Specific Word / Word Sequence

In [20]:
# Feature 1 - Data cleaning and Lemmatization
def cleaning(text):

  stemm = PorterStemmer()
  lemm = WordNetLemmatizer()
  text_lower = text.lower()
  #print(type(document))
  word_token = word_tokenize(text_lower)
  #print(words)
  word_length = [word for word in word_token if len(word) > 1]
  word_alphabet = [word for word in word_length if word.isalpha()]
  words_without_stopword = [word for word in word_alphabet if word not in stopwords.words("english")]
  #Stemming degrades the accuracy 
  #words = [stemm.stem(word) for word in tokens]
  lemmatized_words = [lemm.lemmatize(word, pos='v') for word in words_without_stopword]
  # join words to make sentence
  final_word = " ".join(lemmatized_words)
  
  return final_word

In [21]:
#cleaning primary train data
statement_train_clean=statement_train.apply(cleaning)
statement_train_clean.shape
print(type(statement_train))
#cleaning primary test data
statement_test_clean=statement_test.apply(cleaning)
#cleaning secondary test data
statement_test_clean_secondary=secondary_statement_test.apply(cleaning)

<class 'pandas.core.series.Series'>


In [22]:
#Finding the word with most occurances for feature 3
from collections import Counter
def count(text):
  text1=text.lower()
  text2=word_tokenize(text1)
  #text3=Counter(text2)
  return text2
nb=[]
varr=statement_train_clean.apply(count)
for i in varr:
  for j in i:
    nb.append(j)
    
nb1=Counter(nb)
#newvar=statement_train.str.split()
#varr=Counter(newvar)
#varr
#var2=Counter(varr)
#var2
nb1.most_common(5)

[('vaccinate', 349),
 ('get', 321),
 ('people', 306),
 ('mask', 240),
 ('vaccine', 234)]

In [23]:
#statement_train_clean
#statement_test_clean
#statement_test_clean_secondary

In [24]:
#fit Bag of Words Model
count_vect = CountVectorizer()
#primary train 
statement_train_final_clean=count_vect.fit_transform(statement_train_clean)
#primary test
statement_test_final_clean=count_vect.transform(statement_test_clean)
#secondary test
statement_test_final_clean_secondary=count_vect.transform(statement_test_clean_secondary)

In [25]:
#Feature 2 - Tf-Idf 
  

tfidf_transformer = TfidfTransformer()
#primary train
tf_transformer = tfidf_transformer.fit_transform(statement_train_final_clean)
#primary test
tf_transformer_test = tfidf_transformer.transform(statement_test_final_clean)
#secondary test
tf_transformer_test_secondary = tfidf_transformer.transform(statement_test_final_clean_secondary)
tf_transformer.shape

(569, 3435)

In [26]:
#statement_test_final.shape
#type(tf_transformer)

In [27]:
#checking the model accuracy after 2 features
#lr_model11 = LogisticRegression().fit(tf_transformer.toarray(),label_train)
#predicted_primary1 = lr_model11.predict(tf_transformer_test.toarray())
#np.mean(predicted_primary1==label_test)
#predicted_primary11=lr_model11.predict(tf_transformer_test_secondary.toarray())
#np.mean(predicted_primary11==secondary_label_test)\
#statement_train

In [28]:
#Feature 3 - Regular Expression 
import re
from textblob import TextBlob
def regex(text):
  #unique words
  #return len(set(text.split()))
  #Statement Sentiment
  #return TextBlob(text).sentiment.polarity
  return int(bool(re.search(r"vaccin\w+",text)))

In [29]:
#Applying the Regular Expresssion and Adding the feature to the model using np.insert

#for primary train
sentiment_statement_train=statement_train.apply(regex)
#sentiment_statement_train_final=[np.round(x) for x in sentiment_statement_train]
#print(sentiment_statement_train)

final_statement_train_primary = np.insert(tf_transformer.todense(),tf_transformer.shape[1],sentiment_statement_train,axis=1)
#final_statement.shape

#for primary test
sentiment_statement_test=statement_test.apply(regex)
#sentiment_statement_test_final=[np.round(x) for x in sentiment_statement_test]
#print(sentiment_statement_test)

final_statement_test_primary = np.insert(tf_transformer_test.todense(),tf_transformer_test.shape[1],sentiment_statement_test,axis=1)

#for secondary test
sentiment_statement_test_secondary=secondary_statement_test.apply(regex)
#sentiment_statement_test_final_secondary=[np.round(x) for x in sentiment_statement_test_secondary]
#print(sentiment_statement_test_final_secondary)

final_statement_test_secondary = np.insert(tf_transformer_test_secondary.todense(),tf_transformer_test_secondary.shape[1],sentiment_statement_test_secondary,axis=1)

In [30]:
#Accuracies after feature engineering
#74.59
#87.29
#88.11
#88
#88.52
#np.asarray(final_statement).shape
#statement_test_final_clean.shapet
#type(final_statement_train_primary)

5) Model Evalutation 2

---



In [31]:
from sklearn.linear_model import LogisticRegression
final_Accuracy=LogisticRegression().fit(final_statement_train_primary,label_train)
#Accuracy of the Primary Data set after Feature Engineering
prediction=final_Accuracy.predict(final_statement_test_primary)
np.mean(prediction==label_test)



0.930327868852459

In [32]:
#Accuract if the Secondary Dataset after Feature Engineering
predicted_secondary2 = final_Accuracy.predict(final_statement_test_secondary)
np.mean(predicted_secondary2==secondary_label_test)



0.9873333333333333

Accuracy of Primary Dataset : 93.0327868852459 %

Accuracy of Secondary Dataset : 98.73333333333333 %

In [34]:
#Gaussian_model2 = GaussianNB().fit(final_statement_train_primary,label_train)
#predicted_primary2 = Gaussian_model2.predict(final_statement_test_primary)
#np.mean(predicted_primary2==label_test)
#Gaussian_model3 = GaussianNB().fit(np.asarray(final_statement_train_primary),label_train_final_array)

In [36]:
#Checking


text1=pd.Series("I already got the second dose of my pfizer vaccine and I do not have any side effects")
#text2=pd.Series("Lockdown is must")
#text3=pd.Series("Everyone needs to wear mask and maintain 5 feet distance ")

test_cleaning=text1.apply(cleaning)
test_next=count_vect.transform(test_cleaning)
test_next2 = tfidf_transformer.transform(test_next)
test_next3=text1.apply(regex)
test_final_1 = np.insert(test_next2.todense(),test_next2.shape[1],test_next3,axis=1)
test_predict=final_Accuracy.predict(test_final_1)
test_predict



array([ True])

In [None]:
# References
# 1)https://developers.google.com/edu/python/regular-expressions#:~:text=%5Cw%20%2D%2D%20(lowercase%20w),between%20word%20and%20non%2Dword
# 2)https://python.plainenglish.io/understand-regular-expression-in-python-3189979a749
# 3)https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
# 4)https://gist.github.com/DerrickHiggins/20c77745b080e3d493231424d7da9a2f