In [1]:
"""
Ali Guzelyel
CS 585 - Phase 3
Create Model and train it.
Do Feature Engineering.
"""
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics

In [2]:
df_p_vac = pd.read_csv("twitter_topic_vaccination.csv")
df_p_mask = pd.read_csv("twitter_topic_masking_and_distancing.csv")
df_p_lock = pd.read_csv("twitter_topic_lockdown.csv")

df_s_vac = pd.read_csv("changeorg_topic_vaccination.csv")
df_s_mask = pd.read_csv("changeorg_topic_masking_and_distancing.csv")
df_s_lock = pd.read_csv("changeorg_topic_lockdown.csv")

In [3]:
train_vac_trues = len(df_p_vac.loc[df_p_vac['label']==True])
train_mask_trues = len(df_p_mask.loc[df_p_mask['label']==True])
train_lock_trues = len(df_p_lock.loc[df_p_lock['label']==True])

In [4]:
train_vac_trues

780

In [5]:
train_mask_trues

293

In [6]:
train_lock_trues

28

In [7]:
from sklearn.model_selection import train_test_split
df_p = pd.DataFrame()
df_p["text"] = df_p_vac["text"]
df_p["vaccination"] = df_p_vac.loc[:,"label"]
df_train, df_test = train_test_split(df_p, test_size=0.3)
df_train.head()

Unnamed: 0,text,vaccination
1298,The #CDC tells state public health officials t...,True
737,#CovidVaccine Gujarati always Rock 😂😂😂 https:/...,True
1334,I will not receive any #COVIDVaccine until #Dr...,True
976,So it has only taken less than 2 months for co...,True
1335,#COVID19 #CovidVaccine #Russia\nWhen the covid...,True


In [8]:
df_s = pd.DataFrame()
df_s["text"] = df_s_vac["text"]
df_s["vaccination"] = df_s_vac.loc[:,"label"]
df_s.head()

Unnamed: 0,text,vaccination
0,Save Western Ghats from annihilation,False
1,Save Beacon Hill Park,False
2,Government of India: Don't Tax Medical Bills,False
3,NO FAILING @ UCR,False
4,WIAA - Let them play football!,False


## Bag of Words

## Tokenize

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
vac_train_counts = count_vect.fit_transform(df_train["text"])

In [10]:
vac_train_counts.shape

(1050, 5155)

## Frequencies


In [11]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
vac_train_tfidf = tfidf_transformer.fit_transform(vac_train_counts)

In [12]:
vac_train_tfidf.shape

(1050, 5155)

## Training a Model with SVM

In [13]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)

### Fit the model (Train)

In [14]:
clf.fit(vac_train_tfidf, df_train["vaccination"])

SGDClassifier(alpha=0.001, max_iter=5, random_state=42, tol=None)

### Test On primary test dataset

In [15]:
vac_new_counts = count_vect.transform(df_test["text"])
vac_new_tfidf = tfidf_transformer.transform(vac_new_counts)

predicted = clf.predict(vac_new_tfidf)
np.mean(predicted == df_test["vaccination"])

0.88

### Test On secondary test dataset

In [16]:
vac_new_counts = count_vect.transform(df_s["text"])
vac_new_tfidf = tfidf_transformer.transform(vac_new_counts)

predicted = clf.predict(vac_new_tfidf)
np.mean(predicted == df_s["vaccination"])

0.7926666666666666

## Feature Engineering

### Find Features

In [17]:
import re

In [18]:
def feature_function_1(text):
    return int(bool(re.search(r"[vV]acc|[vV]ax", text))) #vaccine, vaxxine, vax, Coronavaccine, coronavax 
new_feature_array_1 = df_train["text"].apply(feature_function_1)

In [19]:
sum(new_feature_array_1) #how many were flagged?

582

In [20]:
def feature_function_2(text):
    return int(bool(re.search(r"[tT]rial|[tT]est", text))) #test, trial, vax test, trials, tests,
new_feature_array_2 = df_train["text"].apply(feature_function_2)

In [21]:
sum(new_feature_array_2) #how many were flagged?

96

In [22]:
def feature_function_3(text):
    return int(bool(re.search(r"[dD]evelo", text))) # develop, developping, vax developers, etc. 
new_feature_array_3 = df_train["text"].apply(feature_function_3)

In [23]:
sum(new_feature_array_3) #how many were flagged?

37

In [24]:
def feature_function_4(text):
    return int(bool(re.search(r"[sS]hot", text)))  #shot, booster shot, first shot, second shot, third shot
new_feature_array_4 = df_train["text"].apply(feature_function_4)

In [25]:
sum(new_feature_array_4) #how many were flagged?

# I am not taking this into consideration

18

### Add Features to list

In [26]:
vac_train_tfidf_2 = np.insert(vac_train_tfidf.todense(), 
                              vac_train_tfidf.shape[1],
                              [new_feature_array_1,new_feature_array_2, new_feature_array_3],
                              axis=1)
#new_feature_array_4

In [27]:
vac_train_tfidf_2.shape

(1050, 5158)

### Remove Random features to match number of features

In [28]:
vac_train_tfidf_2_match = np.delete(vac_train_tfidf_2, [15, 2000, 4000], 1) #3000

In [29]:
vac_train_tfidf_2_match.shape

(1050, 5155)

## Train Model with new frequencies

### Create model with new frequiencies

In [30]:
clf.fit(vac_train_tfidf_2_match, df_train["vaccination"])

SGDClassifier(alpha=0.001, max_iter=5, random_state=42, tol=None)

### Test On primary test dataset

In [31]:
vac_new_counts = count_vect.transform(df_test["text"])
vac_new_tfidf = tfidf_transformer.transform(vac_new_counts)

In [32]:
predicted = clf.predict(vac_new_tfidf)
np.mean(predicted == df_test["vaccination"])

0.4711111111111111

### Test On secondary test dataset

In [33]:
vac_new_counts = count_vect.transform(df_s["text"])
vac_new_tfidf = tfidf_transformer.transform(vac_new_counts)

In [34]:
predicted = clf.predict(vac_new_tfidf)
np.mean(predicted == df_s["vaccination"])

0.988