In [61]:
import pandas as pd
import numpy as np
import nltk
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,roc_auc_score

In [29]:
data=pd.read_csv("./Dataset_youtube.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Video_id,Title,Description,Category
0,0,i9E_Blai8vk,TRAVEL VLOG ∙ Welcome to Bali | PRISCILLA LEE,I had the chance to fly out to Bali with my wh...,Travel Blogs
1,1,e2NQE41J5eM,How do I travel so much ! How do I earn money!!,SUBSCRIBE - https://goo.gl/dEtSMJ (‘MountainTr...,Travel Blogs
2,2,ehmsJLZlCZ0,Ep 1| Travelling through North East India | Of...,"The journey to Arunachal, North East India beg...",Travel Blogs
3,3,-LzdIILq5vE,GOA TRAVEL DIARY | FOUR DAYS IN GOA | TRAVEL O...,Hope you enjoy MY GOA TRAVEL DIARY this video!...,Travel Blogs
4,4,7ByoBJYXU0k,5 Steps to Becoming a Travel Blogger,"Travel blogger, Nikki Vargas, of The Pin the M...",Travel Blogs


In [35]:
train_data,test_data,train_labels,test_labels=train_test_split(data.iloc[:,2:-1],data.iloc[:,-1],test_size=0.2,random_state=1)
train_data.head()

Unnamed: 0,Title,Description
2739,the Israel of God Black History Promo 2018 (ex...,the Israel of God Black History Promo 2018 (ex...
1421,Top 15 Forever Foods for Survival,Top 15 Forever Foods for SurvivalBe a Team Soo...
1412,"JAPAN Street Food $100 CHALLENGE in Asakusa, T...",🎥MOUNTAIN LAMB IN UZBEKISTAN » https://youtu.b...
3248,"Sacred Rites in Flagstaff, AZ - sacred art, mu...",Established in 1992 we sell Sacred Art and Mus...
1579,Fast Food Lasagna - Epic Meal Time,MAKE A MEAL WITH US & ARNOLD!!!! http://omaze....


# Preprocessing

In [36]:
import preprocessing

In [37]:
def preprocess_dataset(data):
    for i in range(data.shape[0]):
        data.iloc[i,0]=preprocessing.preprocess(data.iloc[i,0])
        data.iloc[i,1]=preprocessing.preprocess(data.iloc[i,0])
        if i%100==0:
            print(i,end=" ")
    return data
        

In [38]:
preprocessing.vocab={}
train_data_x=preprocess_dataset(train_data)

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 

In [39]:
vocabulary=preprocessing.vocab
len(vocabulary)

5556

In [44]:
vocabulary['money']

34

In [46]:
features=sorted(vocabulary,key=lambda x: vocabulary[x],reverse=True)[:100] #atleast occured 10 times
features={features[i]:i for i in range(len(features))}

In [49]:
features['life']

49

In [50]:
def create_dataset(data,features):
    dataset=[]
    for i in range(data.shape[0]):
        feat=np.zeros(len(features))
        for word in (data.iloc[i,0]+" "+data.iloc[i,1]).split():
            if word in features.keys():
                feat[features[word]]+=1
        dataset.append(feat)
        if i%100==0:
            print(i,end=" ")
    return np.array(dataset)

In [51]:
train_data_x=create_dataset(train_data_x.iloc[:,:],features=features)

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 

# Testing the model

In [65]:
mnb=MultinomialNB() #data imbalance no panga
mnb.fit(train_data_x,train_labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [66]:
test_data_x=preprocess_dataset(test_data)
test_data_x=create_dataset(test_data_x.iloc[:,:],features=features)

0 100 200 300 400 500 600 700 0 100 200 300 400 500 600 700 

In [67]:
pred=mnb.predict(test_data_x)

print(classification_report(test_labels,pred))

                        precision    recall  f1-score   support

         Art and Music       0.90      0.99      0.94       127
                  Food       0.97      0.98      0.97        94
               History       0.95      0.95      0.95        87
         Manufacturing       1.00      0.96      0.98       134
Science and Technology       0.98      0.97      0.97       124
          Travel Blogs       1.00      0.95      0.97       135

              accuracy                           0.97       701
             macro avg       0.97      0.97      0.97       701
          weighted avg       0.97      0.97      0.97       701



In [64]:
pred=mnb.predict(test_data_x)

print(classification_report(test_labels,pred))

                        precision    recall  f1-score   support

         Art and Music       0.99      0.98      0.99       127
                  Food       0.96      0.98      0.97        94
               History       0.85      1.00      0.92        87
         Manufacturing       1.00      0.94      0.97       134
Science and Technology       0.99      0.98      0.98       124
          Travel Blogs       0.99      0.95      0.97       135

              accuracy                           0.97       701
             macro avg       0.96      0.97      0.97       701
          weighted avg       0.97      0.97      0.97       701

