In [1]:
# Importing Tools
import warnings
import pandas as pd
# from numba import jit, cuda
import re
from youtube_comment_downloader import *
import pickle
import numpy as np
from collections import Counter

# Visualization packages
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from ipywidgets import *
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import STOPWORDS
%matplotlib inline

 

# Machine learning packages
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from nltk.stem import PorterStemmer
import spacy


 



In [2]:
# Utility
# Session-wide configuration and one-time resource loading.
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# NOTE(review): NUM_OF_THREADS is not referenced by any other visible cell — confirm it is still needed.
NUM_OF_THREADS = 6
# Never truncate long cell values (e.g. full comment text) when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)
# NLTK resources: the VADER lexicon is required by SentimentIntensityAnalyzer (see get_pol);
# NOTE(review): punkt / wordnet / stopwords are downloaded but no visible cell uses them — confirm.
dl = nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
# spaCy English pipeline; only its default stop-word set is used below.
en = spacy.load('en_core_web_sm')
# `stopwords` is the pipeline's own stop-word set object (no copy is made here);
# words_helper later adds entries to this same set.
stopwords = en.Defaults.stop_words

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\sarah\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sarah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sarah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sarah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Training corpus; process_comments fits its TF-IDF vocabulary on train['comment_text'].
# The path is relative to the notebook's working directory. Forward slashes resolve
# on Windows and POSIX alike and avoid the invalid "\i" escape sequence that the
# previous backslash literal relied on.
train = pd.read_csv("./input/updated_train_data.csv")

In [4]:
# Load the pre-trained classifier used by get_predictions.
# SECURITY: pickle.load executes arbitrary code embedded in the file — only load
# model files that come from a trusted source.
filename = 'logist_model.sav'
with open(filename, 'rb') as model_file:  # context manager closes the handle after loading
    model = pickle.load(model_file)

In [5]:
# Collecting and Cleaning Data
# NOTE: GPU compilation via numba is currently disabled (the decorator below is commented out), so this function runs as plain Python
# @jit(target_backend='cuda')
def get_comments(video_url):
    """Download the comments of a YouTube video as a list of plain dicts.

    Comments are requested with the SORT_BY_POPULAR ordering.

    Parameters
    ----------
    video_url : str
        URL of the video whose comments should be fetched.

    Returns
    -------
    list of dict
        One dict per comment with the keys 'comment_id', 'comment_text',
        'time' and 'author', taken from the downloader's 'cid', 'text',
        'time' and 'author' fields respectively.
    """
    fetched = YoutubeCommentDownloader().get_comments_from_url(
        video_url, sort_by=SORT_BY_POPULAR
    )
    return [
        {
            'comment_id': item['cid'],
            'comment_text': item['text'],
            'time': item['time'],
            'author': item['author'],
        }
        for item in fetched
    ]

In [6]:
def apply_regex(corpus):
    """Strip noise from every comment in a Series of strings.

    The substitutions run in this order on each element, every match being
    replaced by a single space:

    1. tokens containing a digit (e.g. ``h4ck3r``),
    2. tokens containing ``@`` (e-mails, mentions),
    3. tokens containing ``#`` (hashtags),
    4. anything starting with ``http`` (URLs),
    5. every character that is not an ASCII letter, digit or space,
    6. runs of spaces are collapsed to one space.

    Leading/trailing spaces are NOT trimmed.

    Parameters
    ----------
    corpus : pandas.Series of str
        Raw comment texts.

    Returns
    -------
    pandas.Series of str
        Cleaned texts, same index as ``corpus``.
    """
    # Raw strings: "\S" / "\d" in a normal literal are invalid escape sequences
    # (a SyntaxWarning on recent Python versions).
    substitutions = (
        (r"\S*\d\S*", " "),
        (r"\S*@\S*\s?", " "),
        (r"\S*#\S*\s?", " "),
        (r"http\S+", " "),
        (r"[^a-zA-Z0-9 ]", " "),
        (r" +", " "),
    )
    # The former `replace('\ufffd', '8')` step was dropped: it ran after step 5,
    # which has already turned U+FFFD into a space, so it could never match.

    def _clean(text):
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        return text

    # One pass over the Series instead of one pass per substitution.
    return corpus.apply(_clean)

In [7]:
# Sentiment Analysis
def get_pol(df):
    """Add a 'compound' column holding the VADER compound score of each comment.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'comment_text' column; each value is converted with
        ``str()`` before scoring.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with the new column.
    """
    analyzer = SentimentIntensityAnalyzer()

    def compound_of(comment_text):
        # polarity_scores returns a dict; only its 'compound' entry is kept.
        return analyzer.polarity_scores(str(comment_text))['compound']

    df['compound'] = df['comment_text'].apply(compound_of)
    return df

In [8]:
# NOTE: GPU compilation via numba is currently disabled (the decorator below is commented out), so this function runs as plain Python
# @jit(target_backend='cuda')    
def sentiment_score(df):
    """Label each row as 'positive', 'neutral' or 'negative' from its compound score.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'compound' column whose values are convertible to float.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with a 'sentiment' column added in place:
        score > 0 -> 'positive', score == 0 -> 'neutral', anything else
        (including NaN) -> 'negative'.
    """
    scores = df['compound'].astype(float)
    # Vectorised assignment of the whole column. The previous per-row
    # `df['sentiment'][i] = ...` was chained assignment (unreliable, and a
    # no-op under pandas copy-on-write) and used the positional counter as
    # an index *label*, which is wrong for any non-default index.
    df['sentiment'] = np.select(
        [scores > 0, scores == 0],
        ['positive', 'neutral'],
        default='negative',
    )
    return df

In [9]:
# Processing Comments
def process_comments(data,train):
    """Turn the downloaded comments into a TF-IDF feature matrix.

    Each document is the row's sentiment label followed by a space and its
    cleaned text. The vectoriser (case-preserving, at most 500 features) is
    fitted on ``train['comment_text']`` and then applied to those documents.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the string columns 'sentiment' and 'clean_text'.
    train : pandas.DataFrame
        Needs a 'comment_text' column; defines the vocabulary.

    Returns
    -------
    scipy sparse matrix
        One row per row of ``data``.

    Notes
    -----
    The vocabulary is re-fitted on every call.
    NOTE(review): the classifier presumably expects exactly the vocabulary it
    was trained with — consider persisting the fitted vectoriser next to the
    model instead of re-fitting; confirm against the training notebook.
    """
    documents = data['sentiment'] + ' ' + data['clean_text']

    tfid = TfidfVectorizer(lowercase=False, max_features=500)
    # Only the fitted vocabulary/IDF weights are needed; the previous
    # fit_transform also built the full training matrix and threw it away.
    tfid.fit(train['comment_text'])
    return tfid.transform(documents)

In [10]:
# Toxicity Score
def get_predictions(tfid_vect):
    """Return the predictions of the module-level ``model`` for the given features.

    Parameters
    ----------
    tfid_vect
        Feature matrix as produced by ``process_comments``.

    Returns
    -------
    Whatever ``model.predict`` returns for ``tfid_vect``.
    """
    predicted = model.predict(tfid_vect)
    return predicted


In [11]:
# Assigns toxicity score on a scale of 0-5
def get_toxic_score(data):
    """Build a frame of comment, sentiment and a summed toxicity score.

    The score is the row-wise sum of the 'severe_toxic', 'obscene', 'threat',
    'insult' and 'identity_hate' columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'comment_text', 'sentiment' and the five label columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns 'comment_text', 'sentiment', 'score'.
        ``data`` itself is not modified.
    """
    score = (
        data['severe_toxic']
        + data['obscene']
        + data['threat']
        + data['insult']
        + data['identity_hate']
    )
    # assign() returns a fresh frame, so no column is written onto a slice of
    # `data` (the previous code triggered SettingWithCopyWarning).
    return data[['comment_text', 'sentiment']].assign(score=score)

In [12]:
def get_results(data,prediction,score = 0,filtered = False):
    """Attach the predicted toxicity scores and build the report frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Comment frame containing at least 'comment_text', 'clean_text' and
        'compound'. A 'Toxic_Score' column is added to it IN PLACE (other
        cells read ``data['Toxic_Score']`` afterwards).
    prediction : array-like
        One predicted score per row of ``data``.
    score : int, default 0
        Score to keep when ``filtered`` is true.
    filtered : bool, default False
        If true, only rows whose 'Toxic_Score' equals ``score`` are returned;
        otherwise every (de-duplicated) row is returned.

    Returns
    -------
    pandas.DataFrame
        ``data`` without 'clean_text'/'compound', with rows repeating an
        earlier 'comment_text' removed.
    """
    data['Toxic_Score'] = np.asarray(prediction).tolist()
    df = data.drop(['clean_text', 'compound'], axis=1)
    # drop_duplicates returns a new object; the previous call discarded it.
    df = df.drop_duplicates(subset='comment_text')
    if filtered:
        # Previously `df.query('Toxic_Score' == score)` compared two Python
        # objects and handed the resulting bool to query(), which raises.
        return df[df['Toxic_Score'] == score]
    # Previously `df.query('Toxic_Score')` used the score VALUES as row
    # labels, returning unrelated rows instead of the whole report.
    return df
        

In [49]:
# Creates Visualizations
# Output area that receives the summary figure.
out_graph = widgets.Output()


# The decorator already routes everything the function displays into
# out_graph; clear_output=True replaces the previous figure instead of
# stacking a new one underneath on every click.
@out_graph.capture(clear_output=True, wait=True)
def show_distribution(reports):
    """Draw the three summary charts for a report frame into ``out_graph``.

    Panels, top to bottom: sentiment counts, most common words (via
    ``words_helper``), and toxicity-score counts for the scores 0-5.

    Parameters
    ----------
    reports : pandas.DataFrame
        Needs the columns 'sentiment', 'Toxic_Score' and 'comment_text'
        (the latter is read by ``words_helper``, which also writes a
        'clean_text' column onto ``reports``).
    """
    sns.set()
    word_freq = words_helper(reports)
    fig, axes = plt.subplots(nrows=3, figsize=(10, 15))

    sns.countplot(x=reports["sentiment"],
                  order=['negative', 'neutral', 'positive'],
                  ax=axes[0])
    axes[0].set(title='Distribution Sentiment', xlabel='Sentiment', ylabel='Count')

    sns.barplot(x=word_freq['words'], y=word_freq['count'], ax=axes[1])
    axes[1].set(title='Most Common Words')
    axes[1].tick_params(axis='x', rotation=45)

    # Explicit target axes: the previous call relied on the third subplot
    # happening to be matplotlib's "current" axes.
    sns.countplot(x=reports["Toxic_Score"], order=range(6), ax=axes[2])
    axes[2].set(title='Distribution of Toxic Comments',
                xlabel='Toxicity Rating', ylabel='Count')

    fig.tight_layout()
    plt.show()

    

In [31]:
# out1 = widgets.Output()
# @out1.capture()
def toxic_score_dist():
    """Plot how many comments fall on each toxicity score from 0 to 5.

    Reads the module-level ``reports`` frame (column 'Toxic_Score'), labels
    every bar with its count and shows the figure.
    """
    score_levels = range(6)
    axis = sns.countplot(x=reports["Toxic_Score"], order=score_levels)
    axis.set_title('Distribution of Toxic Comments')
    axis.set_xlabel('Toxicity Rating')
    axis.set_ylabel('Count')
    for bars in axis.containers:
        axis.bar_label(bars)
    plt.show()
     
      

In [32]:
# out2 = widgets.Output()
# @out2.capture()
def sentiment_dist():
    """Plot the number of negative, neutral and positive comments.

    Reads the module-level ``reports`` frame (column 'sentiment') and shows
    the figure.
    """
    sentiment_levels = ['negative', 'neutral', 'positive']
    axis = sns.countplot(x=reports["sentiment"], order=sentiment_levels)
    axis.set_title('Distribution Sentiment')
    axis.set_xlabel('Sentiment')
    axis.set_ylabel('Count')
    plt.show()

  

In [33]:
# out3 = widgets.Output()
# @out3.capture()
def word_freq_dist(word_freq):
    """Draw a bar chart of word counts on the current figure.

    Parameters
    ----------
    word_freq : pandas.DataFrame
        Needs the columns 'words' (x axis) and 'count' (bar height).

    The figure is not shown here; the caller decides when to render it.
    """
    labels = word_freq['words']
    heights = word_freq['count']
    axis = sns.barplot(x=labels, y=heights)
    axis.set_title('Most Common Words')
    plt.xticks(rotation=45)


In [34]:
# Driver
def load_comments(btn):
    """Button callback: download, score and plot the comments of the entered URL.

    Pipeline: get_comments -> get_pol -> sentiment_score -> apply_regex ->
    process_comments -> get_predictions -> get_results -> show_distribution.

    Side effects: rebinds the module-level ``data`` (full comment frame) and
    ``reports`` (report frame), which the other callbacks read.

    Parameters
    ----------
    btn
        The clicked button (passed by ``on_click``; not used).
    """
    global data
    global reports

    comments = get_comments(url.value)
    if not comments:
        # An empty frame has no 'comment_text' column, so the pipeline below
        # would fail with a KeyError; report the situation instead.
        with out_graph:
            print('No comments could be downloaded for this URL.')
        return

    # drop_duplicates must be *called* and its result kept; the previous bare
    # attribute access did nothing. The index is reset so downstream code
    # sees consecutive row labels.
    data = pd.DataFrame(comments).drop_duplicates().reset_index(drop=True)
    data = get_pol(data)
    data = sentiment_score(data)
    data['clean_text'] = apply_regex(data['comment_text'])

    vect = process_comments(data, train)
    prediction = get_predictions(vect)

    reports = get_results(data, prediction, score=0, filtered=False)
    show_distribution(reports)


In [35]:
def extreme_toxic(btn5):
    """Display every comment whose toxicity score is 5, with a caption.

    Reads the module-level ``data`` frame, which must already carry the
    'Toxic_Score' column.

    Parameters
    ----------
    btn5
        The widget that triggered the callback (not used).
    """
    # Report Comments with a Toxic Score of 5
    report5 = data.drop(['compound','clean_text'],axis = 1)
    # Filter the trimmed frame; filtering `data` again (as before) threw the
    # column drop away.
    report5 = report5[report5['Toxic_Score'] == 5]
    # set_caption returns a Styler — it has to be the object that is
    # displayed, otherwise the caption never appears.
    display(report5.style.set_caption("Toxic Score:5 Reccomendation:Report Comments"))


In [36]:
# Cleans text for visualizations
def words_helper(data, top_n=11):
    """Count the most frequent non-stop-words in the comments.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a 'comment_text' column. A 'clean_text' column (letters and
        '#' only, words longer than three characters) is written onto it.
    top_n : int, default 11
        Number of most common words to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'words' and 'count', most frequent first, at most ``top_n``
        rows. Words are lower-cased before counting.

    Side effects
    ------------
    Adds the wordcloud STOPWORDS plus 'that' and 'this' to the module-level
    ``stopwords`` set.
    """
    stopwords.update(STOPWORDS)
    stopwords.update(('that', 'this'))

    # regex=True is required: recent pandas versions treat the pattern as a
    # literal string by default, which made this replacement a no-op.
    letters_only = data['comment_text'].astype(str).str.replace(
        r"[^a-zA-Z#]", " ", regex=True
    )
    data['clean_text'] = letters_only.apply(
        lambda text: ' '.join(w for w in text.split() if len(w) > 3)
    )

    # Filter stop-words per WORD (the previous check compared whole comments
    # against the stop-word set) and lower-case first, since the stop-word
    # lists are lower-case. split() without an argument never yields ''.
    cnt = Counter(
        word
        for text in data['clean_text']
        for word in text.lower().split()
        if word not in stopwords
    )

    return pd.DataFrame(cnt.most_common(top_n), columns=['words', 'count'])



In [37]:
# Gets URL from user
# Text box holding the video URL; load_comments reads `url.value`.
# (The former `global url` statement was removed: at module level a name is
# already global, so the statement had no effect.)
url = widgets.Text(
    value='https://www.youtube.com/watch?v=aibubHq2-C8',
    placeholder='https://www.youtube.com/watch?v=aibubHq2-C8',
    description='Video URL:',
    disabled=False,
    layout=widgets.Layout(height="auto")
)


In [38]:
# Runs reports on click
# Button that runs the whole pipeline (load_comments) when clicked.
# `value=''` was dropped: Button has no `value` trait, so the argument was an
# unrecognised keyword that did nothing useful.
btn = widgets.Button(
    description='Get Reports',
    disabled=False,
    button_style='',
    layout=widgets.Layout(height="auto")
)
btn.on_click(load_comments)


In [39]:
# Dropdown of toxicity levels; each option maps a label to its integer score.
dropdown_score = widgets.Dropdown(
    options=[
        ('0. Nontoxic', 0),
        ('1. Very Slightly Toxic', 1),
        ('2. Slightly Toxic', 2),
        ('3. Toxic', 3),
        ('4. Severely  Toxic', 4),
        ('5. Extremely Toxic', 5),
    ]
)
# Output area that shows the comments matching the selected score.
output_score = widgets.Output()


# `clear_output=True` is the keyword Output.capture understands (the former
# `clear=True` was silently ignored). The decorator both clears the area and
# captures what the handler displays, so no manual clear/with is needed.
@output_score.capture(clear_output=True)
def dropdown_score_eventhandler(change):
    """Show the comments whose 'Toxic_Score' equals the newly selected value.

    Reads the module-level ``data`` frame, so reports must have been loaded
    first.

    Parameters
    ----------
    change
        Change notification from the dropdown; ``change.new`` is the selected
        score.
    """
    results = data.drop(['compound','clean_text'],axis = 1)
    display(results[results.Toxic_Score == change.new])


dropdown_score.observe(dropdown_score_eventhandler, names='value')

# Toxic Comment Analysis

In [50]:
# Show the URL text box and the "Get Reports" button together in one vertical box.
widgets.VBox([url,btn])

VBox(children=(Text(value='https://www.youtube.com/watch?v=aibubHq2-C8', description='Video URL:', layout=Layo…

In [51]:
# Display the output area that show_distribution draws its figure into.
out_graph

Output(outputs=({'output_type': 'display_data', 'data': {'text/plain': '<Figure size 1000x1500 with 3 Axes>', …

In [25]:
# Show the toxicity-score dropdown; a change of its value calls dropdown_score_eventhandler.
display(dropdown_score)

Dropdown(options=(('0. Nontoxic', 0), ('1. Very Slightly Toxic', 1), ('2. Slightly Toxic', 2), ('3. Toxic', 3)…

In [44]:
# Display the output area that the dropdown handler writes the matching comments into.
output_score

Output()