# Set-up of the project

In [1]:
# Basic packages
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt
from tensorflow import keras
import tensorflow as tf

# Packages for data preparation
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers

from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lynette/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Set some parameters that will be used throughout the notebook.

We read in the csv with the tweets data and perform a random shuffle. It's a good practice to shuffle the data before splitting between a train and test set. We'll only keep the video decription column as input and the Relvancy column as the target.

In [2]:
df = pd.read_csv('handlabel_feature.csv')
df.columns

Index(['Unnamed: 0', 'video_id', 'channel_title', 'channel_id',
       'video_publish_date', 'video_title', 'video_description',
       'video_category', 'video_view_count', 'video_comment_count',
       'video_like_count', 'video_dislike_count', 'video_thumbnail',
       'video_tags', 'collection_date', 'science.topic', 'Relevancy',
       'attitude', 'Text/video', 'search.term', 'cld2', 'transcript',
       'transcript_nchar', 'videoid', 'conspiracy', 'var_r', 'var_g', 'var_b',
       'var_h', 'var_s', 'var_v', 'var_bright', 'var_bright_sd',
       'var_contrast', 'var_colorful', 'median_r', 'median_g', 'median_b',
       'median_h', 'median_s', 'median_v', 'median_bright', 'median_bright_sd',
       'median_contrast', 'median_colorful', 'r_mean', 'g_mean', 'b_mean',
       'h_mean', 's_mean', 'v_mean', 'bright_mean', 'lightning_mean',
       'contrast_mean', 'colorful_mean', 'color_lag'],
      dtype='object')

In [3]:
X = df[['transcript']]

In [4]:
X

Unnamed: 0,transcript
0,with coronavirus completely changing our way o...
1,you all know I'm a big fan of conspiracy theor...
2,how do you handle an epidemic in the age of fa...
3,CaptionUnavailable
4,what's up guys Stephen here and welcome back t...
...,...
402,so the government work for us we don't work fo...
403,but even if 5g has nothing to do with this cor...
404,um some people believe that 5g like like for c...
405,this is a podcast from the South China Morning...


In [5]:
X = X[(X["transcript"] != 'CaptionUnavailable') & (X["transcript"] != 'VideoUnavailable')]

In [6]:
X.shape

(313, 1)

In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['attitude'] = le.fit_transform(df['attitude'])

In [8]:
y = df['attitude'][(df["transcript"] != 'CaptionUnavailable') & (df["transcript"] != 'VideoUnavailable')]
y

0      0
1      0
2      0
4      0
5      1
      ..
402    1
403    0
404    0
405    0
406    0
Name: attitude, Length: 313, dtype: int64

# Data preparation

## Data cleaning

The first thing we'll do is removing stopwords. These words do not have any value for predicting the sentiment.Also, we remove the http link in the texts

In [19]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [130]:
!pip3 install beautifulsoup4

Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/d1/41/e6495bd7d3781cee623ce23ea6ac73282a373088fcd0ddc809a047b18eae/beautifulsoup4-4.9.3-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 3.5MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2; python_version >= "3.0" (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/02/fb/1c65691a9aeb7bd6ac2aa505b84cb8b49ac29c976411c6ab3659425e045f/soupsieve-2.1-py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.9.3 soupsieve-2.1
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [9]:
from tqdm import tqdm
from bs4 import BeautifulSoup
import nltk
nltk.download()
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()
def clean_sentences(df):
    reviews = []

    for sent in tqdm(df['transcript']):
        
        #remove html content
        review_text = BeautifulSoup(sent).get_text()
        
        #remove non-alphabetic characters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
    
        #tokenize the sentences
        words = word_tokenize(review_text.lower())
    
        #stop words removal
        omit_words = set(stopwords.words('english'))
        words = [x for x in words if x not in omit_words]
        
        #lemmatize each word to its lemma
        lemma_words = [lemmatizer.lemmatize(i) for i in words]
    
        reviews.append(lemma_words)

    return(reviews)

#cleaned reviews for both train and test set retrieved
train_sentences = clean_sentences(X)
print(len(train_sentences))


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


100%|██████████| 313/313 [00:06<00:00, 46.10it/s]

313





In [141]:
type(train_sentences)

list

In [139]:
!pip3 install --upgrade gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/70/cf/87b25b265d23498b2b70ce873495cf7ef91394c4baff240210e26f3bc18a/gensim-3.8.3-cp37-cp37m-macosx_10_9_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 15.0MB/s eta 0:00:01
Collecting smart-open>=1.8.1 (from gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/18/9c/a16951b5a66c86f0ea8ff5aca8d5c700138e708a76412ee7a2ec7fbd4b44/smart_open-4.1.0.tar.gz (116kB)
[K     |████████████████████████████████| 122kB 13.0MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: smart-open
  Building wheel for smart-open (setup.py) ... [?25ldone
[?25h  Created wheel for smart-open: filename=smart_open-4.1.0-cp37-none-any.whl size=106205 sha256=14fdc46604f5469e9e897ff7cb11a539d1a1137f86b3a6bd8062effa2875e356
  Stored in directory: /Users/lynette/Library/Caches/pip/wheels/eb/83/5c/ead33ff91d363db5c2527b563746ba23887669c0221bd2484f
Successfully built smart-open
Installing collected 

In [10]:
from tqdm import tqdm 
tqdm.pandas(desc="progress-bar") 
from gensim.models.doc2vec import LabeledSentence

  from pandas import Panel


To implement doc2vec, we have to labelise or tag each tokenised tweet with unique IDs. We can do so by using Gensim’s LabeledSentence() function.

In [11]:
range(0,len(train_sentences))

range(0, 313)

In [12]:
def add_label(twt):
    output = []
    for i, s in zip(range(0,len(twt)), twt):
        output.append(LabeledSentence(s, ["transcript_" + str(i)]))
    return output

labeled_X = add_label(train_sentences) # label all the tweets

  after removing the cwd from sys.path.


In [147]:
labeled_X[:6]

[LabeledSentence(words=['coronavirus', 'completely', 'changing', 'way', 'life', 'many', 'question', 'forced', 'u', 'ask', 'like', 'grocery', 'store', 'worker', 'delivery', 'people', 'essential', 'worker', 'earn', 'living', 'wage', 'health', 'care', 'tied', 'employment', 'okay', 'romantic', 'relationship', 'houseplant', 'get', 'done', 'finally', 'leave', 'house', 'normal', 'thing', 'like', 'tongue', 'kiss', 'uber', 'driver', 'gon', 'na', 'one', 'question', 'one', 'question', 'people', 'want', 'answer', 'anything', 'hell', 'disease', 'come', 'virus', 'shut', 'world', 'people', 'happy', 'people', 'happy', 'accept', 'official', 'explanation', 'china', 'virus', 'originated', 'live', 'animal', 'market', 'somehow', 'jumped', 'bat', 'human', 'like', 'okay', 'think', 'plot', 'dark', 'night', 'accept', 'u', 'stuck', 'house', 'nothing', 'except', 'throw', 'cat', 'birthday', 'party', 'zoo', 'everyone', 'everyone', 'home', 'time', 'come', 'theory', 'exactly', 'think', 'whole', 'thing', 'went', 'als

In [13]:
import gensim

model_d2v = gensim.models.Doc2Vec(dm=1, # dm = 1 for ‘distributed memory’ model
                                  dm_mean=1, # dm_mean = 1 for using mean of the context word vectors
                                  vector_size=200, # no. of desired features
                                  window=5, # width of the context window                                  
                                  negative=7, # if > 0 then negative sampling will be used
                                  min_count=5, # Ignores all words with total frequency lower than 5.                                  
                                  workers=32, # no. of cores                                  
                                  alpha=0.1, # learning rate                                  
                                  seed = 23, # for reproducibility
                                 ) 

model_d2v.build_vocab([i for i in tqdm(labeled_X)])

model_d2v.train(labeled_X, total_examples= len(train_sentences), epochs=15)

100%|██████████| 313/313 [00:00<00:00, 521580.12it/s]


Preparing doc2vec Feature Set

In [14]:
docvec_arrays = np.zeros((len(train_sentences), 200)) 
for i in range(len(train_sentences)):
    docvec_arrays[i,:] = model_d2v.docvecs[i].reshape((1,200))    

docvec_df = pd.DataFrame(docvec_arrays) 
docvec_df.shape

(313, 200)

RandomForest

In [24]:
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
precision = []
recall = []

In [25]:
from sklearn.model_selection import cross_validate
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
for i in range(10):
    forest = RandomForestClassifier(n_estimators=500,min_samples_split=5)
    cv = cross_validate(forest, docvec_df, y, cv=10,scoring = ['precision','recall'])
    precision.append(cv['test_precision'].mean())
    recall.append(cv['test_recall'].mean())



In [26]:
precision

[0.732587803320562,
 0.7337808215658604,
 0.7453188728216535,
 0.7401096778947167,
 0.7405828667994963,
 0.7338497309451146,
 0.7374235261401557,
 0.7356780365331532,
 0.7331245843412139,
 0.7351666579516968]

In [27]:
recall

[0.985930735930736,
 0.9766233766233766,
 0.9813852813852814,
 0.9766233766233766,
 0.9906926406926407,
 0.9720779220779221,
 0.9861471861471862,
 0.9906926406926407,
 0.985930735930736,
 0.9813852813852815]

In [28]:
import numpy as np
import scipy.stats as st


print(np.std(precision))
print(np.mean(precision))
print(st.t.interval(0.95, len(precision)-1, loc=np.mean(precision), scale=st.sem(precision)))



0.0038885369817932645
0.7367622578313624
(0.7338300972360468, 0.739694418426678)


In [29]:

print(np.std(recall))
print(np.mean(recall))
print(st.t.interval(0.95, len(recall)-1, loc=np.mean(recall), scale=st.sem(recall)))


0.0059214465752930046
0.9827489177489177
(0.9782838368208886, 0.9872139986769468)


In [30]:
import joblib
joblib.dump(forest, "random_forest_word_embedding.joblib")

['random_forest_word_embedding.joblib']