In [2]:
# Install required packages
# Each command runs pip as a module of the interpreter at sys.executable, i.e. the
# Python running this kernel, so packages are installed for that interpreter.
# NOTE(review): no versions are pinned — consider pinning them so the notebook
# stays reproducible.
import sys
!{sys.executable} -m pip install --upgrade pip
# The torch install below is disabled; only tensorflow is installed by this cell.
#!{sys.executable} -m pip install torch
!{sys.executable} -m pip install tensorflow
!{sys.executable} -m pip install transformers
!{sys.executable} -m pip install --upgrade google-api-python-client

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


Defaulting to user installation because normal site-packages is not writeable


In [36]:
# utils import 
import pprint

# Abstract

### /!\ The full report can be found in the same folder as this notebook.  /!\ 

### Background

**Sentiment analysis (or opinion mining) is a natural language processing technique usually performed on textual data to determine whether a piece of data is positive, negative, or neutral**. Traditionally, it allows businesses to monitor sentiment in customer feedback and therefore tailor their products or services to match their customers’ needs. This technique is **here applied to YouTube comments to help online content creators monitor their audience’s opinion without reading all the comments under a video**. This is particularly useful for those whose content generates thousands to millions of reactions.

###  Aims 

This project has multiple goals that work towards allowing the user to :
1. Get an overview of the audience's opinion on a piece of content (**sentiment + emotion classification**), 
2. Roughly know what is discussed in the comment section (**topic classification**), 
3. Spot intense negative emotions and take action (report or ban toxic viewers).

Ideally, the overview would provide insights into different objective aspects of the video: video, audio, content, length and editing quality.

###  Method 

The key steps of this project are :
1. **Data collection using the YouTube API**. JSON files are created to build a **NoSQL database using Elasticsearch**: each comment is a document, and one JSON file is created per video (a reference to the video is stored in the document).
2. **Data preprocessing** : removal of irrelevant data (unreadable characters and links) and emoji processing to make them classifiable. 
3. **Data analysis** : sentiment, emotion and topic classification using TensorFlow.
4. **Data summarisation in a Kibana dashboard**.

### Results 

A dashboard summarising the sentiment, topics and emotions expressed in a video’s comment section.

### Tools  
- NoSQL with Elasticsearch : for its tokenization feature that allows us to search through text.
- Jupyter notebook : source code and synthetic explanation of our process.
- Kibana : data visualisation (final dashboard).
- YouTube API : data collection (video comments).

### About this notebook
This notebook applies our method to a TEDx Talks video *[When money isn’t real: the 10,000 experiment](https://www.youtube.com/watch?v=_VB39Jo8mAQ)* with **4 690 comments**.

# Data gathering

In [27]:
import os
import googleapiclient.discovery
import json

# Placeholder only: the real key is read from credentials/credentials.txt below,
# so it never has to be written in the notebook itself.
DEVELOPER_KEY = "YOUR_API_KEY"
with open('credentials/credentials.txt') as f:
    # read() + strip() yields one clean string. The previous readlines() call
    # produced a list of lines (trailing newline included), whereas the
    # developerKey argument of the API client is meant to be the key string.
    DEVELOPER_KEY = f.read().strip()

# Fail here with a clear message rather than later with an opaque API error.
if not DEVELOPER_KEY:
    raise ValueError("credentials/credentials.txt is empty: expected a YouTube Data API key")
    
def getDataFromYouTubeAPI(video_id):
    """Fetch every comment thread of a YouTube video through the Data API v3.

    Pages through ``commentThreads().list`` (``maxResults=100`` per request)
    until a response no longer carries a ``nextPageToken``.

    Args:
        video_id: ID of the video whose comment threads are requested.

    Returns:
        A list holding the ``items`` of every page, in the order received.
        An empty list is returned when no page contains any item.

    Raises:
        Any exception raised by ``googleapiclient`` while building the service
        or executing a request; nothing is caught here.
    """
    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled in production.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    api_service_name = "youtube"
    api_version = "v3"

    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, developerKey = DEVELOPER_KEY)

    comments = []
    next_page_token = None
    # One loop serves the first page and every following page, so the request
    # parameters are written once instead of being duplicated.
    while True:
        request_params = {
            "part": "snippet,replies",
            "maxResults": 100,
            "textFormat": "plainText",
            "videoId": video_id,
        }
        # The first request carries no pageToken; later ones resume from the
        # token returned with the previous page.
        if next_page_token:
            request_params["pageToken"] = next_page_token

        response = youtube.commentThreads().list(**request_params).execute()

        # extend() appends in place; rebuilding the list with `+` on every page
        # copied all previously collected items again.
        comments.extend(response.get("items", []))

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break
    return comments
    
# Download every comment thread of the TEDx talk analysed in this notebook,
# then report how many came back and pretty-print the first one as a sample.
comments = getDataFromYouTubeAPI(video_id="_VB39Jo8mAQ")
fetched_count = len(comments)
print("Number of comments : ", fetched_count)
print("Sneak peek : ")
pprint.pprint(comments[0])

2666


In [40]:
# Overview of the download: the number of items returned, then the first item
# pretty-printed so the structure of one entry is visible.
# NOTE(review): each item is a comment thread (see 'kind' in the output), so
# this counts threads rather than individual comments — presumably why the
# figure is lower than the 4 690 quoted in the abstract; confirm.
total_items = len(comments)
print("Number of comments : ", total_items)
print("Sneak peek : ")
pprint.pprint(comments[0])

Number of comments :  2666
Sneak peek : 
{'etag': 'VYCu_WJjXxGgqAuGYvyP-1zTjOo',
 'id': 'UgxK5RGyzy_FAG4R_1d4AaABAg',
 'kind': 'youtube#commentThread',
 'snippet': {'canReply': True,
             'isPublic': True,
             'topLevelComment': {'etag': 'ze2S-AAQlPiB2p3bdV5qM7YNTm4',
                                 'id': 'UgxK5RGyzy_FAG4R_1d4AaABAg',
                                 'kind': 'youtube#comment',
                                 'snippet': {'authorChannelId': {'value': 'UCzY2ONyq7_ge_doVLjOdBJg'},
                                             'authorChannelUrl': 'http://www.youtube.com/channel/UCzY2ONyq7_ge_doVLjOdBJg',
                                             'authorDisplayName': 'Pete '
                                                                  'Romocki',
                                             'authorProfileImageUrl': 'https://yt3.ggpht.com/ytc/AKedOLTY4QYWXCxqmfXtPsy7rq3ofDicZUPTf3E9nbcv8A=s48-c-k-c0x00ffffff-no-rj',
                                   

In [12]:
import tensorflow
from transformers import pipeline

# Name the checkpoint explicitly. Without a model argument the pipeline falls
# back to this same checkpoint and warns "No model was supplied" (see the
# recorded output); pinning it keeps the demo reproducible if the library
# default ever changes.
SENTIMENT_MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"
sentiment_analysis = pipeline("sentiment-analysis", model=SENTIMENT_MODEL_NAME)

pos_text = "I enjoy studying computational algorithms."
neg_text = "I dislike sleeping late everyday."
neutral_text = "The apple is red."


def print_sentiment(text, show_raw=False):
    """Classify ``text`` with ``sentiment_analysis`` and print the prediction.

    Args:
        text: Sentence to classify.
        show_raw: When True, also print the raw prediction dict first.

    Returns:
        The first prediction returned by the pipeline for ``text`` (a dict
        with the ``'label'`` and ``'score'`` keys printed here).
    """
    prediction = sentiment_analysis(text)[0]
    if show_raw:
        print(prediction)
    print("Label:", prediction['label'])
    print("Confidence Score:", prediction['score'])
    return prediction


# In the recorded output the neutral sentence is labelled NEGATIVE: only
# POSITIVE / NEGATIVE labels appear in this model's predictions.
result = print_sentiment(neutral_text)
print()

result = print_sentiment(pos_text, show_raw=True)
print()

result = print_sentiment(neg_text)

# Emotion model: return_all_scores=True yields the score of every label
# (sadness, joy, love, anger, fear, surprise in the recorded output).
# NOTE(review): recent transformers releases deprecate return_all_scores in
# favour of top_k=None, which changes the nesting of the returned list —
# confirm against the installed version before switching.
classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)
prediction = classifier("I love using transformers. The best part is wide range of support and its easy to use")
print(prediction)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


Label: NEGATIVE
Confidence Score: 0.8237260580062866

{'label': 'POSITIVE', 'score': 0.9980717897415161}
Label: POSITIVE
Confidence Score: 0.9980717897415161

Label: NEGATIVE
Confidence Score: 0.9881595373153687
[[{'label': 'sadness', 'score': 0.000679271062836051}, {'label': 'joy', 'score': 0.9959298968315125}, {'label': 'love', 'score': 0.0009452462545596063}, {'label': 'anger', 'score': 0.001805522944778204}, {'label': 'fear', 'score': 0.0004111044108867645}, {'label': 'surprise', 'score': 0.00022885717044118792}]]


In [14]:
import tensorflow
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Emotion classifier: the tokenizer and the model weights are loaded from the
# same checkpoint identifier, then wrapped together in a pipeline.
emotion_checkpoint = "lordtt13/emo-mobilebert"
tokenizer = AutoTokenizer.from_pretrained(emotion_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(emotion_checkpoint)

nlp_sentence_classif = pipeline(
    "sentiment-analysis",
    model = model,
    tokenizer = tokenizer,
)

# Last expression of the cell, so its value is shown as the cell output.
# Recorded output for this sentence:
# [{'label': 'others', 'score': 0.9879725575447083}]
nlp_sentence_classif("the apple is red")


[{'label': 'others', 'score': 0.9879725575447083}]