# Set-up of the project

In [1]:
# Basic packages
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt
from tensorflow import keras
import tensorflow as tf

# Packages for data preparation
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder



from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lynette/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Set some parameters that will be used throughout the notebook.

We read in the csv with the tweets data and perform a random shuffle. It's a good practice to shuffle the data before splitting between a train and test set. We'll only keep the video decription column as input and the Relvancy column as the target.

In [2]:
df = pd.read_csv('handlabel_feature.csv')
df.columns

Index(['Unnamed: 0', 'video_id', 'channel_title', 'channel_id',
       'video_publish_date', 'video_title', 'video_description',
       'video_category', 'video_view_count', 'video_comment_count',
       'video_like_count', 'video_dislike_count', 'video_thumbnail',
       'video_tags', 'collection_date', 'science.topic', 'Relevancy',
       'attitude', 'Text/video', 'search.term', 'cld2', 'transcript',
       'transcript_nchar', 'videoid', 'conspiracy', 'var_r', 'var_g', 'var_b',
       'var_h', 'var_s', 'var_v', 'var_bright', 'var_bright_sd',
       'var_contrast', 'var_colorful', 'median_r', 'median_g', 'median_b',
       'median_h', 'median_s', 'median_v', 'median_bright', 'median_bright_sd',
       'median_contrast', 'median_colorful', 'r_mean', 'g_mean', 'b_mean',
       'h_mean', 's_mean', 'v_mean', 'bright_mean', 'lightning_mean',
       'contrast_mean', 'colorful_mean', 'color_lag'],
      dtype='object')

In [3]:
df = df[(df["transcript"] != 'CaptionUnavailable') & (df["transcript"] != 'VideoUnavailable')]

In [5]:
df.shape

(313, 56)

In [6]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['attitude'] = le.fit_transform(df['attitude'])

# Data preparation

## Data cleaning

The first thing we'll do is removing stopwords. These words do not have any value for predicting the sentiment.Also, we remove the http link in the texts

In [10]:
from tqdm import tqdm
from bs4 import BeautifulSoup
import nltk
#nltk.download()
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()
def clean_sentences(df):
    reviews = []

    for sent in tqdm(df['transcript']):
        
        #remove html content
        review_text = BeautifulSoup(sent).get_text()
        
        #remove non-alphabetic characters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
    
        #tokenize the sentences
        words = word_tokenize(review_text.lower())
    
        #stop words removal
        omit_words = set(stopwords.words('english'))
        words = [x for x in words if x not in omit_words]
        
        #lemmatize each word to its lemma
        lemma_words = [lemmatizer.lemmatize(i) for i in words]
    
        reviews.append(lemma_words)

    return(reviews)

#cleaned reviews for both train and test set retrieved
train_sentences = clean_sentences(df)
print(len(train_sentences))


100%|██████████| 313/313 [00:05<00:00, 55.99it/s]

313





In [11]:
type(train_sentences)

list

In [12]:
from tqdm import tqdm 
tqdm.pandas(desc="progress-bar") 
from gensim.models.doc2vec import LabeledSentence

  from pandas import Panel


To implement doc2vec, we have to labelise or tag each tokenised tweet with unique IDs. We can do so by using Gensim’s LabeledSentence() function.

In [14]:
def add_label(twt):
    output = []
    for i, s in zip(range(0,len(twt)), twt):
        output.append(LabeledSentence(s, ["transcript_" + str(i)]))
    return output

labeled_X = add_label(train_sentences) # label all the tweets

  after removing the cwd from sys.path.


In [15]:
labeled_X[:6]

[LabeledSentence(words=['coronavirus', 'completely', 'changing', 'way', 'life', 'many', 'question', 'forced', 'u', 'ask', 'like', 'grocery', 'store', 'worker', 'delivery', 'people', 'essential', 'worker', 'earn', 'living', 'wage', 'health', 'care', 'tied', 'employment', 'okay', 'romantic', 'relationship', 'houseplant', 'get', 'done', 'finally', 'leave', 'house', 'normal', 'thing', 'like', 'tongue', 'kiss', 'uber', 'driver', 'gon', 'na', 'one', 'question', 'one', 'question', 'people', 'want', 'answer', 'anything', 'hell', 'disease', 'come', 'virus', 'shut', 'world', 'people', 'happy', 'people', 'happy', 'accept', 'official', 'explanation', 'china', 'virus', 'originated', 'live', 'animal', 'market', 'somehow', 'jumped', 'bat', 'human', 'like', 'okay', 'think', 'plot', 'dark', 'night', 'accept', 'u', 'stuck', 'house', 'nothing', 'except', 'throw', 'cat', 'birthday', 'party', 'zoo', 'everyone', 'everyone', 'home', 'time', 'come', 'theory', 'exactly', 'think', 'whole', 'thing', 'went', 'als

In [16]:
import gensim

model_d2v = gensim.models.Doc2Vec(dm=1, # dm = 1 for ‘distributed memory’ model
                                  dm_mean=1, # dm_mean = 1 for using mean of the context word vectors
                                  vector_size=200, # no. of desired features
                                  window=5, # width of the context window                                  
                                  negative=7, # if > 0 then negative sampling will be used
                                  min_count=5, # Ignores all words with total frequency lower than 5.                                  
                                  workers=32, # no. of cores                                  
                                  alpha=0.1, # learning rate                                  
                                  seed = 23, # for reproducibility
                                 ) 

model_d2v.build_vocab([i for i in tqdm(labeled_X)])

model_d2v.train(labeled_X, total_examples= len(train_sentences), epochs=15)

100%|██████████| 313/313 [00:00<00:00, 722519.07it/s]


Preparing doc2vec Feature Set

In [17]:
docvec_arrays = np.zeros((len(train_sentences), 200)) 
for i in range(len(train_sentences)):
    docvec_arrays[i,:] = model_d2v.docvecs[i].reshape((1,200))    

docvec_df = pd.DataFrame(docvec_arrays) 
docvec_df.shape

(313, 200)

concatenate visual and textual

In [43]:
visual = df[['var_r', 'var_g', 'var_b',
       'var_h', 'var_s', 'var_v', 'var_bright', 'var_bright_sd',
       'var_contrast', 'var_colorful', 'median_r', 'median_g', 'median_b',
       'median_h', 'median_s', 'median_v', 'median_bright', 'median_bright_sd',
       'median_contrast', 'median_colorful','color_lag']]
visual.reset_index(drop=True, inplace=True)

visual.shape

(313, 21)

In [44]:
X = pd.concat([visual, docvec_df], axis=1)
# X = X.dropna(how='all')

In [34]:
visual

Unnamed: 0,var_r,var_g,var_b,var_h,var_s,var_v,var_bright,var_bright_sd,var_contrast,var_colorful,median_r,median_g,median_b,median_h,median_s,median_v,median_bright,median_bright_sd,median_contrast,median_colorful
0,298.035544,256.526430,317.666638,176.226486,291.471955,318.839186,249.684164,58.312728,273.094250,131.323126,99.315580,85.044938,112.727762,86.092826,134.038498,134.587656,92.603380,66.266163,220.0,74.248203
1,14.562086,13.206167,14.473241,22.249816,3.087459,14.468497,13.461712,2.128646,6.360655,0.903765,126.550148,128.666820,130.396076,83.762660,39.437676,134.533176,128.116216,75.928568,241.0,19.761427
2,1397.200532,1812.288126,1125.290200,952.795220,3386.595470,942.447595,1478.249667,160.366149,1102.429909,528.665700,82.593592,44.906568,92.771928,116.763216,175.650236,106.260372,61.527896,54.770904,190.0,71.947426
3,800.175857,979.432379,765.202229,533.030459,899.264225,665.884765,850.382767,385.940453,2316.228370,115.950038,79.772896,73.095122,103.627240,107.586062,112.794364,110.728122,77.605316,50.424047,182.0,56.541083
4,611.670255,710.533599,704.285521,565.782339,267.640542,616.471287,661.184079,185.405393,48.207604,226.281947,178.297660,182.070140,183.746428,67.315794,5.285804,184.217756,181.100160,92.073127,255.0,10.850100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,147.788425,83.104502,343.138525,287.133037,738.014925,211.022546,73.414553,233.283992,762.744039,122.025988,111.254992,113.640668,101.466524,65.478744,56.372900,122.471632,112.078180,69.131249,237.0,30.316916
309,499.048934,671.771055,464.885271,768.802809,362.948260,492.587090,554.760968,93.525503,564.382108,60.941841,123.212802,119.276686,102.296032,79.687310,70.830036,129.006988,118.347988,63.955769,210.0,34.442498
310,12.184924,9.988667,8.442325,7.628704,8.160141,12.292979,9.858782,1.924338,13.182123,6.304640,126.875756,101.686932,90.995444,37.053376,85.170332,128.535556,108.091288,70.702156,249.0,54.941191
311,3.100336,12.856796,13.246367,0.652040,23.776415,2.342890,9.398054,2.273838,0.001471,6.499906,128.762536,80.070008,88.208350,124.785384,164.544432,141.974716,95.623774,77.683406,241.0,97.623431


In [31]:
docvec_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,1.293471,-0.525364,1.156779,1.081712,0.289586,0.298169,-0.882147,-0.006867,-0.161549,-0.138690,...,2.883073,1.285690,-1.315247,0.804228,1.058505,-1.556036,0.201086,0.571067,-1.174386,2.434449
1,-0.154030,0.544217,1.125765,1.333524,2.924797,-0.905756,-0.009149,-2.596337,-2.145004,0.822041,...,2.298729,0.618833,0.767335,-1.140592,0.402427,-0.278632,0.788667,0.226513,-0.671453,-1.244908
2,-1.363886,0.467359,-1.254235,1.886107,-0.930266,0.274078,0.719228,0.826790,0.167190,-0.629208,...,2.498982,-0.140726,1.231478,-0.313550,1.754472,0.075241,1.815325,-0.457882,0.292074,3.019835
3,0.851105,-0.072423,2.721164,-2.645950,-1.042278,0.589601,-2.598239,1.049134,2.103611,0.132658,...,0.940295,-0.067660,0.114367,0.443318,-0.512328,1.963000,-0.240004,4.226593,0.466409,2.227144
4,0.496069,-2.213356,-0.715225,-1.084395,-1.704261,1.907339,-1.263384,0.423036,3.069369,-1.560591,...,-0.782691,0.546885,0.439560,1.916162,0.020182,0.417506,0.390734,2.279149,-1.403165,1.629502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,-0.507197,-1.275348,-0.999348,-0.172698,-0.250454,0.419883,-0.642506,-0.346242,0.186149,0.308790,...,-0.192465,0.237490,0.249331,-0.571955,1.094740,-0.011742,0.508636,-0.740747,-0.615826,-0.587718
309,-0.029470,-0.735089,-0.788504,-0.891680,-2.303042,-0.535786,2.721952,-0.252907,-0.952564,-1.209723,...,-0.251547,1.951036,-1.173271,1.618510,2.833762,-0.349432,3.787318,-1.050254,-0.491712,0.801583
310,-1.956018,-1.051472,-0.918749,-0.810338,-0.792636,1.935167,0.148238,-0.279661,0.209113,0.323572,...,0.381534,0.629705,1.820410,1.682745,-0.224729,0.431816,-0.091228,2.816985,1.740080,1.122173
311,0.035224,1.643500,-1.266737,2.240834,-0.493726,1.118073,0.685731,-0.239572,1.319384,-2.355529,...,1.568878,3.391594,-0.873840,-1.035734,0.695301,-0.195795,-0.647732,-1.394372,-2.175948,1.424991


In [45]:
X

Unnamed: 0,var_r,var_g,var_b,var_h,var_s,var_v,var_bright,var_bright_sd,var_contrast,var_colorful,...,190,191,192,193,194,195,196,197,198,199
0,298.035544,256.526430,317.666638,176.226486,291.471955,318.839186,249.684164,58.312728,273.094250,131.323126,...,2.883073,1.285690,-1.315247,0.804228,1.058505,-1.556036,0.201086,0.571067,-1.174386,2.434449
1,14.562086,13.206167,14.473241,22.249816,3.087459,14.468497,13.461712,2.128646,6.360655,0.903765,...,2.298729,0.618833,0.767335,-1.140592,0.402427,-0.278632,0.788667,0.226513,-0.671453,-1.244908
2,1397.200532,1812.288126,1125.290200,952.795220,3386.595470,942.447595,1478.249667,160.366149,1102.429909,528.665700,...,2.498982,-0.140726,1.231478,-0.313550,1.754472,0.075241,1.815325,-0.457882,0.292074,3.019835
3,800.175857,979.432379,765.202229,533.030459,899.264225,665.884765,850.382767,385.940453,2316.228370,115.950038,...,0.940295,-0.067660,0.114367,0.443318,-0.512328,1.963000,-0.240004,4.226593,0.466409,2.227144
4,611.670255,710.533599,704.285521,565.782339,267.640542,616.471287,661.184079,185.405393,48.207604,226.281947,...,-0.782691,0.546885,0.439560,1.916162,0.020182,0.417506,0.390734,2.279149,-1.403165,1.629502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,147.788425,83.104502,343.138525,287.133037,738.014925,211.022546,73.414553,233.283992,762.744039,122.025988,...,-0.192465,0.237490,0.249331,-0.571955,1.094740,-0.011742,0.508636,-0.740747,-0.615826,-0.587718
309,499.048934,671.771055,464.885271,768.802809,362.948260,492.587090,554.760968,93.525503,564.382108,60.941841,...,-0.251547,1.951036,-1.173271,1.618510,2.833762,-0.349432,3.787318,-1.050254,-0.491712,0.801583
310,12.184924,9.988667,8.442325,7.628704,8.160141,12.292979,9.858782,1.924338,13.182123,6.304640,...,0.381534,0.629705,1.820410,1.682745,-0.224729,0.431816,-0.091228,2.816985,1.740080,1.122173
311,3.100336,12.856796,13.246367,0.652040,23.776415,2.342890,9.398054,2.273838,0.001471,6.499906,...,1.568878,3.391594,-0.873840,-1.035734,0.695301,-0.195795,-0.647732,-1.394372,-2.175948,1.424991


RandomForest

In [46]:
from sklearn.model_selection import cross_validate
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=500,
                              random_state=1,
                             max_features=None,max_depth=None,min_samples_split=2)
cv = cross_validate(forest, X, df['attitude'], cv=10,scoring = ['precision','recall'])




In [47]:

print(cv['test_precision'].mean())
print(cv['test_recall'].mean())





0.7920914693613345
0.9264069264069263


In [48]:
df.reset_index(drop=True, inplace=True)
df.shape
df['var_r']

0       298.035544
1        14.562086
2      1397.200532
3       800.175857
4       611.670255
          ...     
308     147.788425
309     499.048934
310      12.184924
311       3.100336
312      26.059390
Name: var_r, Length: 313, dtype: float64

In [57]:
p = pd.DataFrame({'prediction': prediction})

In [58]:
p

Unnamed: 0,prediction
0,0
1,0
2,0
3,0
4,1
...,...
308,1
309,0
310,0
311,0


In [49]:
prediction = forest.fit(X, df['attitude']).predict(X)

In [62]:
result.to_csv("wordembedding+visual.csv")

In [59]:
result = pd.concat([df, p], axis=1)

In [60]:
result.head

<bound method NDFrame.head of      Unnamed: 0     video_id                    channel_title  \
0             0  NcSUF8erpfU  The Daily Show with Trevor Noah   
1             1  V0yb0_a-WNc                        penguinz0   
2             2  BkbztWS4-9I                    BBC Newsnight   
3             4  7OVT3N5_4to                       TechMagnet   
4             5  8ocWUAwQMhY             LogicBeforeAuthority   
..          ...          ...                              ...   
308         402  bHkcnuJhDp8                          The Sun   
309         403  cszlsGiD1-E            Corbett Report Extras   
310         404  edXRrJPdoHA                        LMG Clips   
311         405  fS4OiNzdPAw         South China Morning Post   
312         406  jMX8Gtl8_cw                    paul chowdhry   

                   channel_id video_publish_date  \
0    UCwWhs_6x42TyRM4Wstoq8HA       4/17/20 2:03   
1    UCq6VFHwMzcMXbuKyG7SQYIg       4/3/20 23:30   
2    UC6o-wWU-v2ClFMwougmK7dA    

In [64]:
y = df['attitude']

In [65]:
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, shuffle = True, random_state = 123, stratify = y)
model = forest.fit(X_train,y_train)
perm = PermutationImportance(model).fit(X_test, y_test)


AttributeError: 'int' object has no attribute 'startswith'

In [66]:
eli5.show_weights(perm,top = 50)

Weight,Feature
0.0825  ± 0.0735,x47
0.0349  ± 0.0127,x144
0.0222  ± 0.0156,x147
0.0159  ± 0.0000,x66
0.0159  ± 0.0000,x35
0.0159  ± 0.0000,x175
0.0127  ± 0.0238,x134
0.0127  ± 0.0370,x39
0.0127  ± 0.0546,x105
0.0127  ± 0.0127,x89
