# Set-up of the project

In [1]:
# Basic packages
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt
from tensorflow import keras
import tensorflow as tf

# Packages for data preparation
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder



from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lynette/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Set some parameters that will be used throughout the notebook.

We read in the csv with the tweets data and perform a random shuffle. It's a good practice to shuffle the data before splitting between a train and test set. We'll only keep the video decription column as input and the Relvancy column as the target.

In [2]:
df = pd.read_csv('handlabel_feature.csv')
df.columns

Index(['Unnamed: 0', 'video_id', 'channel_title', 'channel_id',
       'video_publish_date', 'video_title', 'video_description',
       'video_category', 'video_view_count', 'video_comment_count',
       'video_like_count', 'video_dislike_count', 'video_thumbnail',
       'video_tags', 'collection_date', 'science.topic', 'Relevancy',
       'attitude', 'Text/video', 'search.term', 'cld2', 'transcript',
       'transcript_nchar', 'videoid', 'conspiracy', 'var_r', 'var_g', 'var_b',
       'var_h', 'var_s', 'var_v', 'var_bright', 'var_bright_sd',
       'var_contrast', 'var_colorful', 'median_r', 'median_g', 'median_b',
       'median_h', 'median_s', 'median_v', 'median_bright', 'median_bright_sd',
       'median_contrast', 'median_colorful', 'r_mean', 'g_mean', 'b_mean',
       'h_mean', 's_mean', 'v_mean', 'bright_mean', 'lightning_mean',
       'contrast_mean', 'colorful_mean', 'color_lag'],
      dtype='object')

In [3]:
df = df[(df["transcript"] != 'CaptionUnavailable') & (df["transcript"] != 'VideoUnavailable')]

In [4]:
df.shape

(313, 56)

In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['attitude'] = le.fit_transform(df['attitude'])

# Data preparation

## Data cleaning

The first thing we'll do is removing stopwords. These words do not have any value for predicting the sentiment.Also, we remove the http link in the texts

In [6]:
from tqdm import tqdm
from bs4 import BeautifulSoup
import nltk
#nltk.download()
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()
def clean_sentences(df):
    reviews = []

    for sent in tqdm(df['transcript']):
        
        #remove html content
        review_text = BeautifulSoup(sent).get_text()
        
        #remove non-alphabetic characters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
    
        #tokenize the sentences
        words = word_tokenize(review_text.lower())
    
        #stop words removal
        omit_words = set(stopwords.words('english'))
        words = [x for x in words if x not in omit_words]
        
        #lemmatize each word to its lemma
        lemma_words = [lemmatizer.lemmatize(i) for i in words]
    
        reviews.append(lemma_words)

    return(reviews)

#cleaned reviews for both train and test set retrieved
train_sentences = clean_sentences(df)
print(len(train_sentences))


100%|██████████| 313/313 [00:06<00:00, 51.09it/s]

313





In [11]:
type(train_sentences)

list

In [7]:
from tqdm import tqdm 
tqdm.pandas(desc="progress-bar") 
from gensim.models.doc2vec import LabeledSentence

  from pandas import Panel


To implement doc2vec, we have to labelise or tag each tokenised tweet with unique IDs. We can do so by using Gensim’s LabeledSentence() function.

In [8]:
def add_label(twt):
    output = []
    for i, s in zip(range(0,len(twt)), twt):
        output.append(LabeledSentence(s, ["transcript_" + str(i)]))
    return output

labeled_X = add_label(train_sentences) # label all the tweets

  after removing the cwd from sys.path.


In [15]:
labeled_X[:6]

[LabeledSentence(words=['coronavirus', 'completely', 'changing', 'way', 'life', 'many', 'question', 'forced', 'u', 'ask', 'like', 'grocery', 'store', 'worker', 'delivery', 'people', 'essential', 'worker', 'earn', 'living', 'wage', 'health', 'care', 'tied', 'employment', 'okay', 'romantic', 'relationship', 'houseplant', 'get', 'done', 'finally', 'leave', 'house', 'normal', 'thing', 'like', 'tongue', 'kiss', 'uber', 'driver', 'gon', 'na', 'one', 'question', 'one', 'question', 'people', 'want', 'answer', 'anything', 'hell', 'disease', 'come', 'virus', 'shut', 'world', 'people', 'happy', 'people', 'happy', 'accept', 'official', 'explanation', 'china', 'virus', 'originated', 'live', 'animal', 'market', 'somehow', 'jumped', 'bat', 'human', 'like', 'okay', 'think', 'plot', 'dark', 'night', 'accept', 'u', 'stuck', 'house', 'nothing', 'except', 'throw', 'cat', 'birthday', 'party', 'zoo', 'everyone', 'everyone', 'home', 'time', 'come', 'theory', 'exactly', 'think', 'whole', 'thing', 'went', 'als

In [9]:
import gensim

model_d2v = gensim.models.Doc2Vec(dm=1, # dm = 1 for ‘distributed memory’ model
                                  dm_mean=1, # dm_mean = 1 for using mean of the context word vectors
                                  vector_size=200, # no. of desired features
                                  window=5, # width of the context window                                  
                                  negative=7, # if > 0 then negative sampling will be used
                                  min_count=5, # Ignores all words with total frequency lower than 5.                                  
                                  workers=32, # no. of cores                                  
                                  alpha=0.1, # learning rate                                  
                                  seed = 23, # for reproducibility
                                 ) 

model_d2v.build_vocab([i for i in tqdm(labeled_X)])

model_d2v.train(labeled_X, total_examples= len(train_sentences), epochs=15)

100%|██████████| 313/313 [00:00<00:00, 610045.14it/s]


Preparing doc2vec Feature Set

In [10]:
docvec_arrays = np.zeros((len(train_sentences), 200)) 
for i in range(len(train_sentences)):
    docvec_arrays[i,:] = model_d2v.docvecs[i].reshape((1,200))    

docvec_df = pd.DataFrame(docvec_arrays) 
docvec_df.shape

(313, 200)

concatenate visual and textual

In [11]:
visual = df[['var_r', 'var_g', 'var_b',
       'var_h', 'var_s', 'var_v', 'var_bright', 'var_bright_sd',
       'var_contrast', 'var_colorful', 'median_r', 'median_g', 'median_b',
       'median_h', 'median_s', 'median_v', 'median_bright', 'median_bright_sd',
       'median_contrast', 'median_colorful','color_lag']]
visual.reset_index(drop=True, inplace=True)

visual.shape

(313, 21)

In [12]:
X = pd.concat([visual, docvec_df], axis=1)
# X = X.dropna(how='all')

In [34]:
visual

Unnamed: 0,var_r,var_g,var_b,var_h,var_s,var_v,var_bright,var_bright_sd,var_contrast,var_colorful,median_r,median_g,median_b,median_h,median_s,median_v,median_bright,median_bright_sd,median_contrast,median_colorful
0,298.035544,256.526430,317.666638,176.226486,291.471955,318.839186,249.684164,58.312728,273.094250,131.323126,99.315580,85.044938,112.727762,86.092826,134.038498,134.587656,92.603380,66.266163,220.0,74.248203
1,14.562086,13.206167,14.473241,22.249816,3.087459,14.468497,13.461712,2.128646,6.360655,0.903765,126.550148,128.666820,130.396076,83.762660,39.437676,134.533176,128.116216,75.928568,241.0,19.761427
2,1397.200532,1812.288126,1125.290200,952.795220,3386.595470,942.447595,1478.249667,160.366149,1102.429909,528.665700,82.593592,44.906568,92.771928,116.763216,175.650236,106.260372,61.527896,54.770904,190.0,71.947426
3,800.175857,979.432379,765.202229,533.030459,899.264225,665.884765,850.382767,385.940453,2316.228370,115.950038,79.772896,73.095122,103.627240,107.586062,112.794364,110.728122,77.605316,50.424047,182.0,56.541083
4,611.670255,710.533599,704.285521,565.782339,267.640542,616.471287,661.184079,185.405393,48.207604,226.281947,178.297660,182.070140,183.746428,67.315794,5.285804,184.217756,181.100160,92.073127,255.0,10.850100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,147.788425,83.104502,343.138525,287.133037,738.014925,211.022546,73.414553,233.283992,762.744039,122.025988,111.254992,113.640668,101.466524,65.478744,56.372900,122.471632,112.078180,69.131249,237.0,30.316916
309,499.048934,671.771055,464.885271,768.802809,362.948260,492.587090,554.760968,93.525503,564.382108,60.941841,123.212802,119.276686,102.296032,79.687310,70.830036,129.006988,118.347988,63.955769,210.0,34.442498
310,12.184924,9.988667,8.442325,7.628704,8.160141,12.292979,9.858782,1.924338,13.182123,6.304640,126.875756,101.686932,90.995444,37.053376,85.170332,128.535556,108.091288,70.702156,249.0,54.941191
311,3.100336,12.856796,13.246367,0.652040,23.776415,2.342890,9.398054,2.273838,0.001471,6.499906,128.762536,80.070008,88.208350,124.785384,164.544432,141.974716,95.623774,77.683406,241.0,97.623431


In [13]:
docvec_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.063852,1.147090,0.915325,-0.300535,0.132474,-1.516511,-0.793414,-3.697046,0.750762,2.075684,...,0.310195,1.751785,-0.548795,-1.360437,-0.066572,2.808907,0.253389,0.672194,1.243521,-0.250078
1,-3.326771,-0.667929,0.149417,-0.915874,0.386926,1.238079,1.274287,0.557273,1.066555,-0.343273,...,3.106787,2.369942,-1.220326,0.020759,-0.914439,3.272829,-1.399584,-0.463638,0.086663,-1.879479
2,-1.148223,0.974999,-0.522925,2.469521,-0.568860,0.364378,-2.245167,0.621058,-0.118497,-2.016029,...,-2.030492,0.985518,0.249042,-2.048172,-0.938520,0.503929,-0.155237,0.610421,0.222706,-0.636165
3,-0.537238,2.477042,0.253470,-1.767251,0.773075,2.238947,-0.808677,0.078602,-1.611307,0.114276,...,1.090019,1.761208,1.033755,-3.164194,-0.452878,2.025705,0.576299,3.361025,0.530103,-2.824419
4,-4.020254,1.175915,-0.228314,2.198408,2.566578,-0.389477,-0.765165,-0.573514,3.325544,-1.622725,...,-1.141622,-0.290531,-1.407844,-0.545376,-1.333736,2.378992,0.334710,-0.649780,1.532445,-2.728537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,-0.301367,0.820705,-0.215695,-0.020410,-0.028907,-0.701614,-0.756366,-0.309708,0.744445,0.256587,...,0.219806,0.079693,0.066113,-0.351161,0.684133,0.074552,0.511365,-0.072025,0.285010,-0.966182
309,-2.036741,-0.957528,1.868530,0.531641,1.366007,-2.121679,0.956008,1.025042,0.898618,0.327833,...,0.304169,-0.018351,-0.276976,-0.988548,-0.161790,4.840579,1.207502,1.387488,-0.821822,-0.588499
310,1.959923,0.591586,1.109166,0.145488,0.285519,0.882440,-0.980546,0.371921,1.721942,-0.483481,...,0.658422,1.258499,-0.371038,-1.893730,1.873775,5.045246,-1.303471,2.202464,1.182786,-2.982123
311,-0.837819,-0.523737,0.448967,2.180225,0.992661,0.889139,-1.538214,0.116432,-0.643177,0.031396,...,1.486087,2.790263,-1.487764,-4.487288,0.129369,-0.084066,0.157455,-1.478177,0.216741,-2.233892


In [14]:
X

Unnamed: 0,var_r,var_g,var_b,var_h,var_s,var_v,var_bright,var_bright_sd,var_contrast,var_colorful,...,190,191,192,193,194,195,196,197,198,199
0,298.035544,256.526430,317.666638,176.226486,291.471955,318.839186,249.684164,58.312728,273.094250,131.323126,...,0.310195,1.751785,-0.548795,-1.360437,-0.066572,2.808907,0.253389,0.672194,1.243521,-0.250078
1,14.562086,13.206167,14.473241,22.249816,3.087459,14.468497,13.461712,2.128646,6.360655,0.903765,...,3.106787,2.369942,-1.220326,0.020759,-0.914439,3.272829,-1.399584,-0.463638,0.086663,-1.879479
2,1397.200532,1812.288126,1125.290200,952.795220,3386.595470,942.447595,1478.249667,160.366149,1102.429909,528.665700,...,-2.030492,0.985518,0.249042,-2.048172,-0.938520,0.503929,-0.155237,0.610421,0.222706,-0.636165
3,800.175857,979.432379,765.202229,533.030459,899.264225,665.884765,850.382767,385.940453,2316.228370,115.950038,...,1.090019,1.761208,1.033755,-3.164194,-0.452878,2.025705,0.576299,3.361025,0.530103,-2.824419
4,611.670255,710.533599,704.285521,565.782339,267.640542,616.471287,661.184079,185.405393,48.207604,226.281947,...,-1.141622,-0.290531,-1.407844,-0.545376,-1.333736,2.378992,0.334710,-0.649780,1.532445,-2.728537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,147.788425,83.104502,343.138525,287.133037,738.014925,211.022546,73.414553,233.283992,762.744039,122.025988,...,0.219806,0.079693,0.066113,-0.351161,0.684133,0.074552,0.511365,-0.072025,0.285010,-0.966182
309,499.048934,671.771055,464.885271,768.802809,362.948260,492.587090,554.760968,93.525503,564.382108,60.941841,...,0.304169,-0.018351,-0.276976,-0.988548,-0.161790,4.840579,1.207502,1.387488,-0.821822,-0.588499
310,12.184924,9.988667,8.442325,7.628704,8.160141,12.292979,9.858782,1.924338,13.182123,6.304640,...,0.658422,1.258499,-0.371038,-1.893730,1.873775,5.045246,-1.303471,2.202464,1.182786,-2.982123
311,3.100336,12.856796,13.246367,0.652040,23.776415,2.342890,9.398054,2.273838,0.001471,6.499906,...,1.486087,2.790263,-1.487764,-4.487288,0.129369,-0.084066,0.157455,-1.478177,0.216741,-2.233892


RandomForest

In [18]:
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
precision = []
recall = []

In [19]:
from sklearn.model_selection import cross_validate
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier

for i in range(10):
    forest = RandomForestClassifier(n_estimators=500,
                                 max_features=None,max_depth=None,min_samples_split=2)
    cv = cross_validate(forest, X, df['attitude'], cv=10,scoring = ['precision','recall'])
    precision.append(cv['test_precision'].mean())
    recall.append(cv['test_recall'].mean())




In [20]:
precision

[0.7481760512794995,
 0.7547972952972953,
 0.7464058654236629,
 0.755531798314407,
 0.7462590933970243,
 0.7544891232649854,
 0.7478555174072415,
 0.7519595709773685,
 0.7548556998556998,
 0.7611840407357648]

In [21]:
recall

[0.9257575757575758,
 0.920995670995671,
 0.9212121212121211,
 0.9257575757575758,
 0.9114718614718613,
 0.9303030303030304,
 0.9209956709956708,
 0.9212121212121211,
 0.9028138528138527,
 0.9305194805194805]

In [22]:
import numpy as np
import scipy.stats as st


print(np.std(precision))
print(np.mean(precision))
print(st.t.interval(0.95, len(precision)-1, loc=np.mean(precision), scale=st.sem(precision)))




0.004634866652539224
0.7521514055952949
(0.7486564733298313, 0.7556463378607585)


In [23]:

print(np.std(recall))
print(np.mean(recall))
print(st.t.interval(0.95, len(recall)-1, loc=np.mean(recall), scale=st.sem(recall)))



0.008024583440524511
0.9211038961038961
(0.915052939801231, 0.9271548524065611)


In [24]:
import joblib
joblib.dump(forest, "random_forest_word_embedding_visual.joblib")

['random_forest_word_embedding_visual.joblib']

In [66]:
eli5.show_weights(perm,top = 50)

Weight,Feature
0.0825  ± 0.0735,x47
0.0349  ± 0.0127,x144
0.0222  ± 0.0156,x147
0.0159  ± 0.0000,x66
0.0159  ± 0.0000,x35
0.0159  ± 0.0000,x175
0.0127  ± 0.0238,x134
0.0127  ± 0.0370,x39
0.0127  ± 0.0546,x105
0.0127  ± 0.0127,x89
