In [9]:
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the raw trending-videos table, then work on an independent copy that
# keeps only the text fields and the view count used further down.
dataset = pd.read_csv('USvideos.csv')
new_df = dataset.loc[
    :,
    ['title', 'channel_title', 'category_id', 'publish_time',
     'tags', 'description', 'views'],
].copy()
new_df.head()

In [10]:
# Read the category-id lookup file into a plain dict.
with open('US_category_id.json') as f:
    datastore = json.load(f)

# Print the first raw record to reveal its layout, then one (id, title)
# tuple for every category entry.
print(datastore['items'][0])
for d in datastore['items']:
    iden, names = d['id'], d['snippet']['title']
    print((iden, names))

{'kind': 'youtube#videoCategory', 'etag': '"m2yskBQFythfE4irbTIeOgYYfBU/Xy1mB4_yLrHy_BmKmPBggty2mZQ"', 'id': '1', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Film & Animation', 'assignable': True}}
('1', 'Film & Animation')
('2', 'Autos & Vehicles')
('10', 'Music')
('15', 'Pets & Animals')
('17', 'Sports')
('18', 'Short Movies')
('19', 'Travel & Events')
('20', 'Gaming')
('21', 'Videoblogging')
('22', 'People & Blogs')
('23', 'Comedy')
('24', 'Entertainment')
('25', 'News & Politics')
('26', 'Howto & Style')
('27', 'Education')
('28', 'Science & Technology')
('29', 'Nonprofits & Activism')
('30', 'Movies')
('31', 'Anime/Animation')
('32', 'Action/Adventure')
('33', 'Classics')
('34', 'Comedy')
('35', 'Documentary')
('36', 'Drama')
('37', 'Family')
('38', 'Foreign')
('39', 'Horror')
('40', 'Sci-Fi/Fantasy')
('41', 'Thriller')
('42', 'Shorts')
('43', 'Shows')
('44', 'Trailers')


In [11]:
# data cleaning and preparation
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


In [None]:
# Scratch check of the cleaning regex: delete every non-letter character
# (spaces included) from the first title. `title` is reassigned later by
# process().
title = re.compile('[^a-zA-Z]').sub('', new_df['title'][0])
# Only the final expression of a cell is rendered: the description column.
new_df['description']

In [None]:
# data transformation
def process(items):
    """Normalise an iterable of strings into stemmed, stop-word-free text.

    For every item: each non-letter character is replaced by a space, the
    text is lower-cased and split on whitespace, English stop words are
    dropped, the remaining words are Porter-stemmed and re-joined with
    single spaces.

    Parameters
    ----------
    items : iterable of str
        Raw text values (e.g. a DataFrame column).

    Returns
    -------
    list of str
        One cleaned string per input item, in input order.
    """
    # Imported locally under a private name: the word-cloud cell later
    # rebinds the global `stopwords` to a plain set, which would make a
    # re-run of this function fail on `.words`.
    from nltk.corpus import stopwords as nltk_stopwords

    # Loop-invariant work, built once instead of once per word / per item.
    stop_words = set(nltk_stopwords.words('english'))
    stemmer = PorterStemmer()
    non_letters = re.compile('[^a-zA-Z]')

    corpus = []
    for s in items:
        tokens = non_letters.sub(' ', s).lower().split()
        stems = [stemmer.stem(word) for word in tokens if word not in stop_words]
        corpus.append(' '.join(stems))
    return corpus

# Run the shared cleaner over the three short text columns, in this order.
title, channel_title, tags = (
    process(new_df[column]) for column in ('title', 'channel_title', 'tags')
)

In [None]:
# data transformation: for just description
def process_des(items):
    """Normalise video descriptions into stemmed, stop-word-free text.

    String items are lower-cased, have http(s) links removed, have every
    remaining non-letter character replaced by a space, are split on
    whitespace, filtered against the English stop-word list and
    Porter-stemmed. Items that are not strings (e.g. a missing description)
    become the empty string, so the output stays aligned with the input.

    Parameters
    ----------
    items : iterable
        Raw description values; may contain non-string entries.

    Returns
    -------
    list of str
        One cleaned string per input item, in input order.
    """
    # Imported locally under a private name: the word-cloud cell later
    # rebinds the global `stopwords` to a plain set, which would make a
    # re-run of this function fail on `.words`.
    from nltk.corpus import stopwords as nltk_stopwords

    # Loop-invariant work, built once instead of once per word / per item.
    stop_words = set(nltk_stopwords.words('english'))
    stemmer = PorterStemmer()
    # A link runs until whitespace or a backslash (neither belongs in a URL),
    # so an escaped line break glued to the link is not swallowed with it.
    url = re.compile(r'https?://[^\s\\]+')
    non_letters = re.compile('[^a-zA-Z]')

    corpus = []
    for s in items:
        if not isinstance(s, str):
            corpus.append('')
            continue
        # Links must be removed BEFORE punctuation is blanked out: once
        # '://' has become spaces the URL pattern can no longer match.
        s = url.sub(' ', s.lower())
        tokens = non_letters.sub(' ', s).split()
        stems = [stemmer.stem(word) for word in tokens if word not in stop_words]
        corpus.append(' '.join(stems))
    return corpus
# Clean the description column; non-string entries come back as ''.
description = process_des(new_df.loc[:, 'description'])

In [None]:
# # Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer

# One document per video: the four cleaned text fields joined with a space.
# Without the separator the last word of one field fuses with the first word
# of the next and produces tokens that exist in neither field.
# zip() follows the real number of rows instead of a hard-coded count.
agg_str = [
    ' '.join(fields)
    for fields in zip(title, channel_title, tags, description)
]

# Bag-of-words counts restricted to the 3000 most frequent terms, as a dense
# array.
cv = CountVectorizer(max_features=3000)
X = cv.fit_transform(agg_str).toarray()
np.shape(X)

In [None]:
# Label every video with a popularity tier 1-4 from its view count, using the
# bin edges 0 / 10k / 100k / 1M / 1B.
new_df['popularity'] = pd.cut(
    new_df.views,
    bins=[0,10000,100000,1000000,1000000000],
    labels=[1,2,3,4],
)

# Row labels of the videos in each tier, reused by the sentiment breakdown.
p1, p2, p3, p4 = (
    new_df.index[new_df['popularity'] == tier].tolist()
    for tier in (1, 2, 3, 4)
)

In [None]:
# Target vector: the popularity tier as plain integers.
# The column is selected by name; the positional index 7 only pointed at
# 'popularity' as long as the column order never changed.
y = new_df['popularity'].values
y = y.astype(int)
y

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

# Train a Gaussian Naive Bayes model on the training split (fit() returns the
# estimator itself, so construction and fitting are chained).
classifier = GaussianNB().fit(X_train, y_train)

# Predict the held-out samples and tabulate them against the true labels.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Display the confusion matrix computed from y_test and y_pred.
cm

In [None]:
from sklearn.metrics import accuracy_score, cohen_kappa_score, classification_report

# Print the per-class report for the four popularity tiers.
# accuracy_score(y_test, y_pred) and cohen_kappa_score(y_test, y_pred) are
# imported for optional single-number summaries and are not printed here.
print(
    classification_report(
        y_test,
        y_pred,
        target_names=['1','2','3','4'],
    )
)

In [None]:
# sentiment analysis
from textblob import TextBlob

# 1 when TextBlob assigns the aggregated document a polarity above zero,
# otherwise 0. Iterating over agg_str itself keeps `positive` aligned with
# the documents whatever the dataset size, instead of assuming 40949 rows.
positive = [int(TextBlob(text).sentiment.polarity > 0) for text in agg_str]

In [None]:
# Fraction of all documents flagged as positive (entries of `positive` are 0/1).
sum(positive)/len(positive)

In [None]:
# Fraction of positive-flagged videos inside each popularity bucket, printed
# in the order p1, p2, p3, p4. One loop replaces four copy-pasted ones.
for bucket in (p1, p2, p3, p4):
    res1 = sum(positive[i] for i in bucket)
    # An empty bucket has no defined share; print nan instead of raising
    # ZeroDivisionError.
    print(res1 / len(bucket) if bucket else float('nan'))

In [None]:

# Generate a word cloud from the cleaned titles, channel names and tags.
# Import the modules this cell needs.
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd

# One document per video, fields joined with a space so that the last word of
# one field does not fuse with the first word of the next. zip() follows the
# real number of rows instead of a hard-coded count.
edit_str = [' '.join(fields) for fields in zip(title, channel_title, tags)]

comment_words = ' '.join(edit_str)
# NOTE: this rebinds the global `stopwords` (previously nltk's stop-word
# corpus) to WordCloud's stop-word set. The name is kept because the masked
# word-cloud cell below reads it.
stopwords = set(STOPWORDS)

wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      stopwords=stopwords,
                      min_font_size=10).generate(comment_words)

# plot the WordCloud image
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

In [None]:
# create coloring from image
import random
from wordcloud import ImageColorGenerator
from PIL import Image

def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return an HSL colour string for a random dark-to-mid grey.

    All arguments are accepted and ignored. The lightness is drawn uniformly
    from the integers 10..50 (inclusive) with the global `random` module;
    hue and saturation are fixed at 0.
    """
    lightness = random.randint(10, 50)
    return "hsl(0, 0%%, %d%%)" % lightness
    
# The logo image serves both as the shape mask and as the colour source.
mask = np.array(Image.open("youtube-t.png"))

wordcloud_youtube = WordCloud(
    mask=mask,
    mode="RGBA",
    background_color="white",
    stopwords=stopwords,
    collocations=False,
).generate(comment_words)

# Recolour every word from the mask image's pixels before drawing.
image_colors = ImageColorGenerator(mask)
plt.figure(figsize=[30,10])
plt.imshow(
    wordcloud_youtube.recolor(color_func=image_colors),
    interpolation="bilinear",
)
plt.axis("off")

In [None]:
# Save the word-cloud input text to disk, replacing any existing cloud.txt.
with open("cloud.txt", "w") as out_file:
    out_file.write(comment_words)

In [None]:
from collections import Counter

# Collect every token from the cleaned titles, tags and channel names (in
# that order) and show the 100 most frequent.
# split() without an argument yields no tokens for an empty cleaned string,
# whereas split(' ') returns [''] and would count a bogus empty "word".
words = [
    word
    for field in (title, tags, channel_title)
    for text in field
    for word in text.split()
]
Counter(words).most_common(100)