# Set-up of the project

In [1]:
# Basic packages
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt
from tensorflow import keras
import tensorflow as tf

# Packages for data preparation
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder



from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score


[nltk_data] Error loading stopwords: <urlopen error [Errno 61]
[nltk_data]     Connection refused>


Set some parameters that will be used throughout the notebook.

We read in the csv with the tweets data and perform a random shuffle. It's good practice to shuffle the data before splitting it into a train and a test set. We'll only keep the video description column as input and the Relevancy column as the target.

In [2]:
# Load the hand-labelled feature table and keep it under its own name so the
# unfiltered data stays available for inspection.
raw_features = pd.read_csv('normal_handlabel_feature.csv')

# Keep only the rows whose attitude label is non-zero.
df = raw_features[raw_features['attitude'] != 0]

In [3]:
# Drop rows whose transcript is one of the two "unavailable" placeholder strings.
# .copy() detaches the result from the frame it was sliced from, so the column
# assignments made in later cells write to an independent frame instead of
# triggering pandas' chained-assignment (SettingWithCopy) warning.
df = df[~df["transcript"].isin(['CaptionUnavailable', 'VideoUnavailable'])].copy()

In [4]:
# Row/column count after dropping attitude == 0 rows and unavailable transcripts.
df.shape

(460, 24)

In [5]:
from sklearn.preprocessing import LabelEncoder

# Re-encode the remaining attitude labels as consecutive integers starting at 0.
# The column is overwritten in place; `le` keeps the fitted mapping (le.classes_).
# NOTE(review): the original label values are not visible here — inspect
# le.classes_ to see which original value became 0 and which became 1.
le = LabelEncoder()
df['attitude'] = le.fit_transform(df['attitude'])

# Data preparation

## Data cleaning

The first thing we'll do is remove stopwords. These words do not have any value for predicting the sentiment. Also, we remove the http link in the texts.

In [7]:
def remove_stopwords(input_text):
    """Drop English stopwords and one-character tokens from a text.

    The input is coerced to ``str`` and split on whitespace. Matching is
    case-sensitive and performed on the raw tokens, so only the lowercase
    NLTK stopwords plus the explicitly added ``'The'`` are removed.

    Parameters
    ----------
    input_text : object
        Text to clean; converted with ``str()`` before splitting.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Negation words may carry sentiment, so they are kept even when the
    # stopword list contains them.
    whitelist = {"n't", "not", "no"}

    # Build one set of tokens to drop. Set membership is O(1), whereas the
    # previous list was scanned linearly for every word of every transcript.
    blocked = (set(stopwords.words('english')) | {'The'}) - whitelist

    tokens = str(input_text).split()
    return " ".join(tok for tok in tokens if tok not in blocked and len(tok) > 1)
    
def remove_mentions(input_text):
    """Return ``input_text`` with every ``@`` + word-characters run deleted."""
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', input_text)


       
# Clean every transcript: stopwords are removed first, then @mentions.
transcripts_without_stopwords = df['transcript'].apply(remove_stopwords)
df['transcript'] = transcripts_without_stopwords.apply(remove_mentions)
df.head()



Unnamed: 0.1,Unnamed: 0,video_id,attitude,transcript,var_r,var_g,var_b,var_h,var_s,var_v,...,median_r,median_g,median_b,median_h,median_s,median_v,median_bright,median_bright_sd,median_contrast,median_colorful
5,5,8ocWUAwQMhY,0,hi guys Daniel Alexander cannon logic Authorit...,611.670255,710.533599,704.285521,565.782339,267.640542,616.471287,...,178.29766,182.07014,183.746428,67.315794,5.285804,184.217756,181.10016,92.073127,255.0,10.8501
7,7,m63EyxOC7KM,0,New York new hot 97 app Ebro morning markets a...,452.35698,478.186275,419.175631,355.090668,468.120127,488.873792,...,58.932432,48.911764,50.621008,91.027888,61.974344,60.291964,52.09878,57.203108,223.0,22.04881
10,10,lXnLwtAS6Fk,0,5g 5g Network that's taking people completely ...,255.602928,227.562307,115.53451,82.652435,56.376545,233.16895,...,162.88566,127.777572,135.631998,73.192558,136.100062,191.345718,139.44729,43.736392,183.0,102.317861
21,22,3dGe2FXOQOw,0,hey guys welcome show today we're gonna discus...,346.92619,328.023541,306.858307,148.598571,82.680316,307.033028,...,199.799596,195.907756,194.418024,27.569884,9.146088,199.96034,196.905496,33.385873,135.0,12.9939
28,29,3ajBR3R8doM,0,hey everyone I'm I'm Shawn oh we've got bit ey...,29.58379,22.456832,16.950847,20.298046,39.189503,29.754383,...,143.555642,127.917176,135.183554,54.86553,46.82069,151.46278,133.42932,69.593465,226.0,47.625559


In [8]:
from tqdm import tqdm
from bs4 import BeautifulSoup
import nltk
#nltk.download()
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()
def clean_sentences(df):
    """Tokenize, lowercase, stopword-filter and lemmatize every transcript.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``'transcript'`` column of texts.

    Returns
    -------
    list of list of str
        One list of lemmatized tokens per transcript, in input order.

    Notes
    -----
    NOTE(review): stopwords are removed again here after lowercasing. If
    NLTK's English list contains 'not' and 'no' (it appears to), the negations
    that ``remove_stopwords`` deliberately whitelists are dropped at this
    stage — confirm whether that is intended.
    """
    # Loop-invariant: build the stopword set once instead of once per transcript.
    omit_words = set(stopwords.words('english'))

    reviews = []
    for sent in tqdm(df['transcript']):
        # Strip HTML markup. The parser is named explicitly so the result does
        # not depend on which optional parsers are installed, and so
        # BeautifulSoup does not warn about guessing one.
        review_text = BeautifulSoup(sent, "html.parser").get_text()

        # Replace every non-alphabetic character with a space.
        review_text = re.sub("[^a-zA-Z]", " ", review_text)

        # Lowercase, tokenize, drop stopwords, then reduce each word to its lemma.
        words = word_tokenize(review_text.lower())
        lemma_words = [lemmatizer.lemmatize(w) for w in words if w not in omit_words]

        reviews.append(lemma_words)

    return reviews

# Tokenize and lemmatize every transcript in df. No train/test split has been
# made in the cells above, so this covers the whole filtered data set.
train_sentences = clean_sentences(df)
print(len(train_sentences))


100%|██████████| 460/460 [00:02<00:00, 168.84it/s]

460





In [11]:
# Confirm the container type returned by clean_sentences.
type(train_sentences)

list

In [9]:
from tqdm import tqdm 
# Registers the progress_apply / progress_map helpers on pandas objects.
tqdm.pandas(desc="progress-bar") 
# TaggedDocument is the supported class; LabeledSentence was only a deprecated
# alias of it and no longer exists in gensim 4. The old name is kept as an alias
# so the cells below keep working unchanged.
from gensim.models.doc2vec import TaggedDocument
LabeledSentence = TaggedDocument

  from pandas import Panel


To implement doc2vec, we have to labelise or tag each tokenised tweet with unique IDs. We can do so by using Gensim’s LabeledSentence() function.

In [10]:
def add_label(twt):
    """Wrap each token list in a gensim ``TaggedDocument`` with a unique tag.

    Parameters
    ----------
    twt : iterable of list of str
        Tokenized documents.

    Returns
    -------
    list of TaggedDocument
        The i-th document carries the single tag ``"transcript_<i>"``.
    """
    # Imported locally so this function does not depend on the deprecated
    # LabeledSentence alias, which is no longer available in gensim 4.
    from gensim.models.doc2vec import TaggedDocument

    return [
        TaggedDocument(tokens, ["transcript_" + str(i)])
        for i, tokens in enumerate(twt)
    ]

labeled_X = add_label(train_sentences) # tag every tokenized transcript as "transcript_<i>"

  after removing the cwd from sys.path.


In [15]:
# Peek at the first six tagged documents.
labeled_X[:6]

[LabeledSentence(words=['coronavirus', 'completely', 'changing', 'way', 'life', 'many', 'question', 'forced', 'u', 'ask', 'like', 'grocery', 'store', 'worker', 'delivery', 'people', 'essential', 'worker', 'earn', 'living', 'wage', 'health', 'care', 'tied', 'employment', 'okay', 'romantic', 'relationship', 'houseplant', 'get', 'done', 'finally', 'leave', 'house', 'normal', 'thing', 'like', 'tongue', 'kiss', 'uber', 'driver', 'gon', 'na', 'one', 'question', 'one', 'question', 'people', 'want', 'answer', 'anything', 'hell', 'disease', 'come', 'virus', 'shut', 'world', 'people', 'happy', 'people', 'happy', 'accept', 'official', 'explanation', 'china', 'virus', 'originated', 'live', 'animal', 'market', 'somehow', 'jumped', 'bat', 'human', 'like', 'okay', 'think', 'plot', 'dark', 'night', 'accept', 'u', 'stuck', 'house', 'nothing', 'except', 'throw', 'cat', 'birthday', 'party', 'zoo', 'everyone', 'everyone', 'home', 'time', 'come', 'theory', 'exactly', 'think', 'whole', 'thing', 'went', 'als

In [11]:
import gensim

# Doc2Vec configuration.
model_d2v = gensim.models.Doc2Vec(dm=1,            # 1 selects the distributed-memory training algorithm
                                  dm_mean=1,       # average (rather than sum) the context word vectors
                                  vector_size=200, # dimensionality of each document vector
                                  window=5,        # width of the context window
                                  negative=7,      # > 0 enables negative sampling with this many noise words
                                  min_count=5,     # ignore words with total frequency below 5
                                  # A fixed seed only gives repeatable vectors with a single
                                  # worker thread: with several workers the OS thread scheduling
                                  # changes the update order. (Across interpreter launches
                                  # PYTHONHASHSEED must be fixed as well.)
                                  workers=1,
                                  alpha=0.1,       # initial learning rate
                                  seed=23,         # RNG seed
                                 )

# Scan the corpus once to build the vocabulary and record the document count.
model_d2v.build_vocab(labeled_X)

# corpus_count was set by build_vocab, so it always matches the corpus that is
# actually being trained on.
model_d2v.train(labeled_X, total_examples=model_d2v.corpus_count, epochs=15)

100%|██████████| 460/460 [00:00<00:00, 1310720.00it/s]


Preparing doc2vec Feature Set

In [12]:
# Collect the trained document vectors into a (n_documents, vector_size) matrix.
n_docs = len(train_sentences)

# The width comes from the model instead of repeating the literal 200, so it
# cannot drift from the Doc2Vec configuration.
docvec_arrays = np.zeros((n_docs, model_d2v.vector_size))
for i in range(n_docs):
    # NOTE(review): the integer index is expected to return the vector of the
    # i-th training document — confirm for the installed gensim version.
    docvec_arrays[i, :] = model_d2v.docvecs[i]

docvec_df = pd.DataFrame(docvec_arrays) 
docvec_df.shape

(460, 200)

Concatenate the visual and textual features

In [14]:
# Select the 20 visual feature columns. The index is reset in the same expression
# (no inplace mutation of a slice of df): df keeps gaps in its index after
# filtering, while docvec_df has a fresh 0..n-1 index, and the column-wise
# concat that follows aligns rows by index label.
visual = df[['var_r', 'var_g', 'var_b',
       'var_h', 'var_s', 'var_v', 'var_bright', 'var_bright_sd',
       'var_contrast', 'var_colorful', 'median_r', 'median_g', 'median_b',
       'median_h', 'median_s', 'median_v', 'median_bright', 'median_bright_sd',
       'median_contrast', 'median_colorful']].reset_index(drop=True)

visual.shape

(460, 20)

In [15]:
# Combine visual and text features column-wise; rows are matched by index label.
X = pd.concat([visual, docvec_df], axis=1)

# A misaligned index would add NaN-padded rows instead of raising, so check it.
assert len(X) == len(visual) == len(docvec_df), "visual and docvec_df rows are not aligned"

# visual has string column labels and docvec_df has integer ones. Recent
# scikit-learn versions refuse a frame with mixed label types, so make them all
# strings (the displayed labels stay the same).
X.columns = X.columns.astype(str)

In [16]:
# Display the 20 visual feature columns selected from df.
visual

Unnamed: 0,var_r,var_g,var_b,var_h,var_s,var_v,var_bright,var_bright_sd,var_contrast,var_colorful,median_r,median_g,median_b,median_h,median_s,median_v,median_bright,median_bright_sd,median_contrast,median_colorful
0,611.670255,710.533599,704.285521,565.782339,267.640542,616.471287,661.184079,185.405393,48.207604,226.281947,178.297660,182.070140,183.746428,67.315794,5.285804,184.217756,181.100160,92.073127,255.0,10.850100
1,452.356980,478.186275,419.175631,355.090668,468.120127,488.873792,444.666475,45.390127,783.693329,53.253433,58.932432,48.911764,50.621008,91.027888,61.974344,60.291964,52.098780,57.203108,223.0,22.048810
2,255.602928,227.562307,115.534510,82.652435,56.376545,233.168950,201.218955,67.706699,411.482719,23.133398,162.885660,127.777572,135.631998,73.192558,136.100062,191.345718,139.447290,43.736392,183.0,102.317861
3,346.926190,328.023541,306.858307,148.598571,82.680316,307.033028,328.973162,179.992448,851.291335,56.403953,199.799596,195.907756,194.418024,27.569884,9.146088,199.960340,196.905496,33.385873,135.0,12.993900
4,29.583790,22.456832,16.950847,20.298046,39.189503,29.754383,20.640310,5.748194,48.051666,10.437365,143.555642,127.917176,135.183554,54.865530,46.820690,151.462780,133.429320,69.593465,226.0,47.625559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455,2356.947868,2175.446373,2087.640234,413.396530,535.737994,2313.298897,2198.912054,98.459260,1079.165688,183.170198,144.181532,132.765260,118.875124,38.749468,60.097208,148.890732,134.756384,59.259531,205.0,38.564053
456,2194.431494,2361.411495,2470.254090,399.127900,318.185826,2046.570149,2313.278504,433.935300,4425.830168,224.939034,240.961944,242.421684,238.640388,18.330064,9.601724,244.297704,242.398372,29.246004,114.0,29.139543
457,820.036766,1169.309838,1175.549922,335.253698,132.699446,853.096095,1051.933567,421.973576,3692.410128,67.948962,182.975658,178.023416,182.162994,83.686968,25.096562,190.164112,178.605554,55.327353,198.5,23.978003
458,273.171022,421.142066,412.096403,58.687157,346.328848,274.837309,329.884662,38.341451,633.204244,133.186585,141.727648,116.764844,105.792188,24.693460,70.736796,142.607112,122.703036,52.090848,196.0,40.338692


In [17]:
# Display the Doc2Vec document vectors, one row per transcript.
docvec_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.401070,2.178952,-0.541766,-0.123708,-0.204932,-1.006195,-0.840920,-1.191100,-1.808482,-0.885580,...,0.016725,0.870505,-0.712087,0.164931,-1.720208,2.650797,-3.077836,1.072253,-1.218775,-1.217151
1,-1.158126,-0.050573,0.055731,-0.111902,1.923681,-1.608303,0.095263,-1.580746,0.695333,1.105349,...,-1.008018,-1.456758,-0.781167,-1.587771,-1.276897,-0.064254,-0.378954,1.730564,1.876887,-0.406837
2,-0.742873,0.134164,-0.450080,0.062950,-0.068483,1.311516,0.639053,-0.518664,1.263019,-1.280586,...,1.686655,-0.323299,-1.153947,1.069242,0.398742,-0.713726,0.006747,-0.659041,0.511487,-0.632619
3,-1.335792,1.092429,1.346106,-1.280020,4.250268,-1.348398,2.412879,0.175345,-1.385141,-0.554121,...,-0.221625,-1.929995,-1.769377,-2.314843,-2.265045,0.850693,1.797908,-0.245289,-0.873459,0.013708
4,1.180889,0.060712,-0.223831,0.237361,1.390513,-1.534603,0.322486,2.071446,-0.647627,0.290929,...,0.428577,1.042851,-0.939377,-0.376209,-1.421430,0.803159,-0.745265,1.714708,-0.406656,-1.336926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455,0.301609,0.391061,-2.071143,-0.723146,2.060278,-0.964524,1.144924,-1.445047,-2.215353,0.769171,...,-0.629034,2.833997,-0.022616,1.408964,-0.408811,-0.190102,-0.687182,0.197048,0.410738,-0.191555
456,-1.378826,0.699878,0.525705,-2.144438,3.471616,-0.660398,0.215298,-0.134006,-2.577162,-1.824159,...,1.411198,-0.159259,-0.241040,-0.568594,-0.715349,-1.358637,-2.029507,1.276559,1.719026,-1.672112
457,-1.927128,1.452425,-1.442796,-2.514248,2.204914,-0.320127,0.081044,-0.612671,-1.780783,-1.599780,...,-1.949547,-0.870642,0.733538,-0.441564,1.817724,1.393000,0.563194,0.725627,-0.746363,0.051776
458,0.016064,0.481863,0.369742,-0.391209,-0.152094,-0.203701,-0.231342,-0.463153,-0.785362,0.253403,...,0.915232,-0.471345,0.308562,0.490578,-1.387309,0.346466,-0.055680,-0.085072,-0.457113,-0.857486


In [18]:
# Display the combined feature matrix (visual columns followed by Doc2Vec columns).
X

Unnamed: 0,var_r,var_g,var_b,var_h,var_s,var_v,var_bright,var_bright_sd,var_contrast,var_colorful,...,190,191,192,193,194,195,196,197,198,199
0,611.670255,710.533599,704.285521,565.782339,267.640542,616.471287,661.184079,185.405393,48.207604,226.281947,...,0.016725,0.870505,-0.712087,0.164931,-1.720208,2.650797,-3.077836,1.072253,-1.218775,-1.217151
1,452.356980,478.186275,419.175631,355.090668,468.120127,488.873792,444.666475,45.390127,783.693329,53.253433,...,-1.008018,-1.456758,-0.781167,-1.587771,-1.276897,-0.064254,-0.378954,1.730564,1.876887,-0.406837
2,255.602928,227.562307,115.534510,82.652435,56.376545,233.168950,201.218955,67.706699,411.482719,23.133398,...,1.686655,-0.323299,-1.153947,1.069242,0.398742,-0.713726,0.006747,-0.659041,0.511487,-0.632619
3,346.926190,328.023541,306.858307,148.598571,82.680316,307.033028,328.973162,179.992448,851.291335,56.403953,...,-0.221625,-1.929995,-1.769377,-2.314843,-2.265045,0.850693,1.797908,-0.245289,-0.873459,0.013708
4,29.583790,22.456832,16.950847,20.298046,39.189503,29.754383,20.640310,5.748194,48.051666,10.437365,...,0.428577,1.042851,-0.939377,-0.376209,-1.421430,0.803159,-0.745265,1.714708,-0.406656,-1.336926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455,2356.947868,2175.446373,2087.640234,413.396530,535.737994,2313.298897,2198.912054,98.459260,1079.165688,183.170198,...,-0.629034,2.833997,-0.022616,1.408964,-0.408811,-0.190102,-0.687182,0.197048,0.410738,-0.191555
456,2194.431494,2361.411495,2470.254090,399.127900,318.185826,2046.570149,2313.278504,433.935300,4425.830168,224.939034,...,1.411198,-0.159259,-0.241040,-0.568594,-0.715349,-1.358637,-2.029507,1.276559,1.719026,-1.672112
457,820.036766,1169.309838,1175.549922,335.253698,132.699446,853.096095,1051.933567,421.973576,3692.410128,67.948962,...,-1.949547,-0.870642,0.733538,-0.441564,1.817724,1.393000,0.563194,0.725627,-0.746363,0.051776
458,273.171022,421.142066,412.096403,58.687157,346.328848,274.837309,329.884662,38.341451,633.204244,133.186585,...,0.915232,-0.471345,0.308562,0.490578,-1.387309,0.346466,-0.055680,-0.085072,-0.457113,-0.857486


RandomForest

In [19]:
# NOTE(review): average_precision_score and recall_score are not referenced in
# any visible cell — confirm whether these imports are still needed.
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
# One entry per repetition: the mean cross-validated precision / recall.
precision = []
recall = []

In [20]:
from sklearn.model_selection import cross_validate
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier

# Start from empty lists so that re-running this cell does not append a second
# batch of scores onto the results of an earlier run.
precision = []
recall = []

# Ten repetitions of 10-fold cross-validation.
# NOTE(review): with an integer cv the folds are presumably identical in every
# repetition, so the spread across repetitions reflects the forest's own
# randomness rather than split variance — confirm against the scikit-learn docs.
for i in range(10):
    forest = RandomForestClassifier(n_estimators=500,
                                    max_features=None, max_depth=None, min_samples_split=2,
                                    # a distinct but fixed seed per repetition keeps the
                                    # repetitions different while making the run reproducible
                                    random_state=i)
    # cross_validate fits clones of `forest`; `forest` itself stays unfitted.
    cv = cross_validate(forest, X, df['attitude'], cv=10, scoring=['precision', 'recall'])
    precision.append(cv['test_precision'].mean())
    recall.append(cv['test_recall'].mean())




In [21]:
# Mean cross-validated precision of each repetition.
precision

[0.8571145598496249,
 0.8595614948936451,
 0.8458622809991161,
 0.8550769178094473,
 0.8559196053788083,
 0.8525677266118443,
 0.85750481000481,
 0.8640133477633476,
 0.8513251672075202,
 0.8464193878543973]

In [22]:
# Mean cross-validated recall of each repetition.
recall

[0.9296798029556651,
 0.9364532019704432,
 0.9330049261083744,
 0.9294334975369457,
 0.9366995073891626,
 0.9399014778325123,
 0.9364532019704432,
 0.9330049261083744,
 0.9258620689655173,
 0.9295566502463053]

In [23]:
import numpy as np
import scipy.stats as st

# Spread, centre and 95% t-interval of the per-repetition mean precision.
# np.std is the population standard deviation (ddof=0), while st.sem uses the
# sample standard deviation (ddof=1).
precision_mean = np.mean(precision)
precision_dof = len(precision) - 1

print(np.std(precision))
print(precision_mean)
print(st.t.interval(0.95, precision_dof, loc=precision_mean, scale=st.sem(precision)))




0.005366952746112409
0.854536529837256
(0.8504895663050192, 0.8585834933694929)


In [24]:

# Spread, centre and 95% t-interval of the per-repetition mean recall.
recall_mean = np.mean(recall)
recall_dof = len(recall) - 1

print(np.std(recall))
print(recall_mean)
print(st.t.interval(0.95, recall_dof, loc=recall_mean, scale=st.sem(recall)))



0.004143864500337917
0.9330049261083744
(0.9298802351880852, 0.9361296170286636)


In [26]:
import joblib

# cross_validate only fits clones of the estimator it is given, so `forest` may
# still be unfitted here; dumping it as-is would save a model that cannot
# predict. Fit it on the full feature matrix unless it has already been fitted
# (estimators_ only exists after fit).
if not hasattr(forest, "estimators_"):
    forest.fit(X, df['attitude'])

joblib.dump(forest, "random_forest_word_embedding_visual(normal).joblib")

['random_forest_word_embedding_visual(normal).joblib']

In [66]:
# NOTE(review): neither `eli5` nor `perm` is imported or defined in the visible
# cells, and the execution count jumps from 26 to 66 — this cell relies on cells
# that are not shown. TODO: restore the import and the cell that builds `perm`
# so the notebook survives Restart & Run All.
# Show the 50 largest feature weights of `perm`
# (presumably an eli5 PermutationImportance object — confirm).
eli5.show_weights(perm,top = 50)

Weight,Feature
0.0825  ± 0.0735,x47
0.0349  ± 0.0127,x144
0.0222  ± 0.0156,x147
0.0159  ± 0.0000,x66
0.0159  ± 0.0000,x35
0.0159  ± 0.0000,x175
0.0127  ± 0.0238,x134
0.0127  ± 0.0370,x39
0.0127  ± 0.0546,x105
0.0127  ± 0.0127,x89
