# YouTube Title Category Classifier

In [1]:
import numpy as np
import pandas as pd
import collections
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Part 1: Import and clean data

### Import CSV with all country data

In [2]:
ALLvids = pd.read_csv("All_cnt_ctg.csv", header=0)
ALLvids.head(3)

Unnamed: 0.1,Unnamed: 0,channel_title,no_of_vids,trending_date,title,category_id,publish_time,views,likes,dislikes,comment_count,Category_name,Country
0,0,SET India,192,171411,Crime Patrol Dial 100 - क्राइम पेट्रोल - Ep 65...,24,2017-11-12T15:34:31.000Z,210478,905,176,85,Entertainment,CANADA
1,1,MSNBC,189,171411,Barnicle: World Leaders Are Laughing At The US...,25,2017-11-13T14:53:22.000Z,200264,2947,317,2321,News & Politics,CANADA
2,2,FBE,188,171411,YOUTUBERS REACT TO TOP 10 TWITTER ACCOUNTS OF ...,24,2017-11-12T22:00:01.000Z,960747,31810,668,5335,Entertainment,CANADA


### Create New Dataframe with only US, Great Britian, and Canada data

In [3]:
new_videos = ALLvids.loc[(ALLvids['Country'] == "U.S.A")|(ALLvids['Country'] == "GREAT BRITAIN")|(ALLvids['Country'] == "CANADA")]
            
new_videos.head(5)

Unnamed: 0.1,Unnamed: 0,channel_title,no_of_vids,trending_date,title,category_id,publish_time,views,likes,dislikes,comment_count,Category_name,Country
0,0,SET India,192,171411,Crime Patrol Dial 100 - क्राइम पेट्रोल - Ep 65...,24,2017-11-12T15:34:31.000Z,210478,905,176,85,Entertainment,CANADA
1,1,MSNBC,189,171411,Barnicle: World Leaders Are Laughing At The US...,25,2017-11-13T14:53:22.000Z,200264,2947,317,2321,News & Politics,CANADA
2,2,FBE,188,171411,YOUTUBERS REACT TO TOP 10 TWITTER ACCOUNTS OF ...,24,2017-11-12T22:00:01.000Z,960747,31810,668,5335,Entertainment,CANADA
3,3,The Young Turks,186,171411,The Alt-Right Is Crushing On Taylor Swift,25,2017-11-13T02:00:00.000Z,100394,1619,907,1595,News & Politics,CANADA
4,4,REACT,183,171411,DO COLLEGE KIDS KNOW 80s MUSIC? #8 (REACT: Do ...,24,2017-11-12T20:00:01.000Z,549374,16832,248,3579,Entertainment,CANADA


# Part 2: Train model using Naive Bayes
***

### Split 'Title' into a string of words using CountVectorizer:

In [4]:
vector = CountVectorizer()
counts = vector.fit_transform(new_videos['title'].values)

### Use the Naive Bayes model and target 'Category':

In [5]:
NB_Model = MultinomialNB()
targets = new_videos['category_id'].values
NB_Model.fit(counts,targets)

MultinomialNB()

### Check Accuracy using a 90/10 train/test split

In [6]:
X= counts
y= targets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .1)

NBtest = MultinomialNB().fit(X_train, y_train)
nb_predictions = NBtest.predict(X_test)
acc_nb = NBtest.score(X_test, y_test)
print('The Naive Bayes Algorithm scored an accuracy of', acc_nb)

The Naive Bayes Algorithm scored an accuracy of 0.47474747474747475


# Part 3: Test model

### Enter hypothetical titles to predict the category for: 

In [36]:
Titles = ["Adorable cat plays with toy",
        "Fashion Trends for Fall 2018",
        "Olympics Opening Ceremony Highlights",
        "Warriors vs. Cavs Basketball Game",
        "Donald Trump on Fox News",
        "Car Crash Injures Two",
        "Ed Sheeran - Perfect (Official Music Video)",
        "how to do eyeshadow"
         ]

### Insert titles into model to make predictions:

In [37]:
Titles_counts = vector.transform(Titles)
Predict = NB_Model.predict(Titles_counts)
Predict

array([24, 24, 17, 17, 25, 24, 10, 24])

### Create Category Names Dictionary to iterate through, pairing model outputs with matching category name 

In [38]:
#Create Category Dictionary
CategoryNamesDict = {"Film & Animation": 1,"Autos & Vehicles": 2,"Music": 10,
                "Pets & Animals": 15,"Sports":17,"Short Movies":18,"Travel & Events":19,"Gaming":20,
                "Videoblogging":21, "People & Blogs":22,"Comedy":23,"Entertainment":24,"News & Politics":25,
                "Howto & Style":26,"Education":27, "Science & Technology":28,"Nonprofits & Activism":29,
                "Movies":30,"Anime/Animation":31,"Action/Adventure":32,"Classics":33,"Comedy":34,"Documentary":35,
                "Drama":36,"Family":37,"Foreign":38,"Horror":39,"Sci-Fi/Fantasy":40,"Thriller":41,"Shorts":42,
                "Shows":43,"Trailers":44
                }

In [39]:
#Use List Comprehension to make dictionary accessible by id/title
CategoryDict = [{'id': value, 'title': key} for key, value in CategoryNamesDict.items()]
CategoryDict

[{'id': 1, 'title': 'Film & Animation'},
 {'id': 2, 'title': 'Autos & Vehicles'},
 {'id': 10, 'title': 'Music'},
 {'id': 15, 'title': 'Pets & Animals'},
 {'id': 17, 'title': 'Sports'},
 {'id': 18, 'title': 'Short Movies'},
 {'id': 19, 'title': 'Travel & Events'},
 {'id': 20, 'title': 'Gaming'},
 {'id': 21, 'title': 'Videoblogging'},
 {'id': 22, 'title': 'People & Blogs'},
 {'id': 34, 'title': 'Comedy'},
 {'id': 24, 'title': 'Entertainment'},
 {'id': 25, 'title': 'News & Politics'},
 {'id': 26, 'title': 'Howto & Style'},
 {'id': 27, 'title': 'Education'},
 {'id': 28, 'title': 'Science & Technology'},
 {'id': 29, 'title': 'Nonprofits & Activism'},
 {'id': 30, 'title': 'Movies'},
 {'id': 31, 'title': 'Anime/Animation'},
 {'id': 32, 'title': 'Action/Adventure'},
 {'id': 33, 'title': 'Classics'},
 {'id': 35, 'title': 'Documentary'},
 {'id': 36, 'title': 'Drama'},
 {'id': 37, 'title': 'Family'},
 {'id': 38, 'title': 'Foreign'},
 {'id': 39, 'title': 'Horror'},
 {'id': 40, 'title': 'Sci-Fi/Fan

In [40]:
#Create For Loop
CategoryNamesList = []

for Category_ID in Predict:
    MatchingCategories = [x for x in CategoryDict if x["id"] == Category_ID]
    print(MatchingCategories)
    if MatchingCategories:
        CategoryNamesList.append(MatchingCategories[0]["title"])
        
CategoryNamesList

[{'id': 24, 'title': 'Entertainment'}]
[{'id': 24, 'title': 'Entertainment'}]
[{'id': 17, 'title': 'Sports'}]
[{'id': 17, 'title': 'Sports'}]
[{'id': 25, 'title': 'News & Politics'}]
[{'id': 24, 'title': 'Entertainment'}]
[{'id': 10, 'title': 'Music'}]
[{'id': 24, 'title': 'Entertainment'}]


['Entertainment',
 'Entertainment',
 'Sports',
 'Sports',
 'News & Politics',
 'Entertainment',
 'Music',
 'Entertainment']

In [41]:
##OLD CODE

#CategoryNames = ["Film & Animation","Autos & Vehicles","Music",
#"Pets & Animals","Sports","Short Movies","Travel & Events","Gaming","Videoblogging",
#"People & Blogs","Comedy","Entertainment","News & Politics","Howto & Style","Education","Science & Technology",
#"Nonprofits & Activism","Movies","Anime/Animation","Action/Adventure","Classics",
#"Comedy","Documentary","Drama","Family","Foreign","Horror","Sci-Fi/Fantasy","Thriller","Shorts","Shows","Trailers"
#]
#Predict [0]
#CategoryNames[Predict[0]]
#CategoryNamesList = []
#for Category_ID in Predict:
    #CategoryNamesList.append(CategoryNames[Category_ID])
    
#CategoryNamesList
    #MatchingCategories = [x for x in CategoryDict if x["id"] == str(Category_ID)]
    #if MatchingCategories:
        #CategoryNamesList.append(MatchingCategories[0]["title"])

### Map values to the titles we want to predict:

In [42]:
TitleDataFrame = []
for i in range(0, len(Titles)):
    TitleToCategories = {'Title': Titles[i],  'Category': CategoryNamesList[i]}
    TitleDataFrame.append(TitleToCategories)

### Convert the resulting Dict to a Data Frame:

In [43]:
PredictDF = pd.DataFrame(Predict)
TitleDF = pd.DataFrame(TitleDataFrame)
PreFinalDF = pd.concat([PredictDF, TitleDF], axis=1)
PreFinalDF.columns = (['Categ_ID', 'Hypothetical Video Title', 'Predicted Category'])
FinalDF = PreFinalDF.drop(['Categ_ID'],axis=1)
cols = FinalDF.columns.tolist()
cols = cols[-1:] + cols[:-1]
FinalDF= FinalDF[cols]

# View Final Prediction Results:

In [44]:
FinalDF

Unnamed: 0,Predicted Category,Hypothetical Video Title
0,Entertainment,Adorable cat plays with toy
1,Entertainment,Fashion Trends for Fall 2018
2,Sports,Olympics Opening Ceremony Highlights
3,Sports,Warriors vs. Cavs Basketball Game
4,News & Politics,Donald Trump on Fox News
5,Entertainment,Car Crash Injures Two
6,Music,Ed Sheeran - Perfect (Official Music Video)
7,Entertainment,how to do eyeshadow


## Test Model: More Title Examples/Model Testing

### Titles2: Enter hypothetical titles to test

In [18]:
Titles2 = ["Joe Biden gives White House Speech",
        "How to dye your hair blonde",
        "Best SuperBowl Touchdowns",
        "10 Reasons Logan Paul Sucks",
        "How to jump start your car",
        "5 Reasons Aliens Are Real",
        "Great Hotels in Italy",
        "Kim Kardashian and Kanye West Divorce",
        "Machine Learning Tutorial",
        "Donald Trump Speaks at CPAC",
        "Nomadland Official Trailer",
        "BEST Hockey GOALS of ALL TIME!"
         ]

### Insert titles into model to make predictions:

In [19]:
Titles_counts2 = vector.transform(Titles2)
Predict = NB_Model.predict(Titles_counts2)
Predict

array([24, 24, 24, 24, 22, 24, 24, 24, 22, 25, 24, 24])

### Match category predictions with corresponding titles and append to list

In [20]:
CategoryNamesList2 = []

for Category_ID in Predict:
    MatchingCategories = [x for x in CategoryDict if x["id"] == Category_ID]
    print(MatchingCategories)
    if MatchingCategories:
        CategoryNamesList2.append(MatchingCategories[0]["title"])
        
CategoryNamesList2

[{'id': 24, 'title': 'Entertainment'}]
[{'id': 24, 'title': 'Entertainment'}]
[{'id': 24, 'title': 'Entertainment'}]
[{'id': 24, 'title': 'Entertainment'}]
[{'id': 22, 'title': 'People & Blogs'}]
[{'id': 24, 'title': 'Entertainment'}]
[{'id': 24, 'title': 'Entertainment'}]
[{'id': 24, 'title': 'Entertainment'}]
[{'id': 22, 'title': 'People & Blogs'}]
[{'id': 25, 'title': 'News & Politics'}]
[{'id': 24, 'title': 'Entertainment'}]
[{'id': 24, 'title': 'Entertainment'}]


['Entertainment',
 'Entertainment',
 'Entertainment',
 'Entertainment',
 'People & Blogs',
 'Entertainment',
 'Entertainment',
 'Entertainment',
 'People & Blogs',
 'News & Politics',
 'Entertainment',
 'Entertainment']

### Map values to the titles we want to predict

In [21]:
TitleDataFrame = []
for i in range(0, len(Titles2)):
    TitleToCategories = {'Title': Titles2[i],  'Category': CategoryNamesList2[i]}
    TitleDataFrame.append(TitleToCategories)

### Convert predictions and titles into DataFrame

In [32]:
PredictDF = pd.DataFrame(Predict)
TitleDF = pd.DataFrame(TitleDataFrame)
PreFinalDF = pd.concat([PredictDF, TitleDF], axis=1)
PreFinalDF.columns = (['Categ_ID', 'Hypothetical Video Title', 'Predicted Category'])
FinalDF = PreFinalDF.drop(['Categ_ID'],axis=1)
cols = FinalDF.columns.tolist()
cols = cols[-1:] + cols[:-1]
FinalDF= FinalDF[cols]

## View Predictions for Titles2

In [33]:
#Second Titles DF
FinalDF

Unnamed: 0,Predicted Category,Hypothetical Video Title
0,News & Politics,Donald Trump gives White House Speech
1,Music,Ed Sheeran- Best Hits
2,Sports,Basketball Highlights Warriors
3,Entertainment,10 Places to Travel for your next Vacation
4,Entertainment,How to put on makeup
5,Entertainment,Why Science is cool
6,People & Blogs,Jake Paul Worst Songs
7,Entertainment,Ariana Grande Thank You Next
8,Entertainment,Car Tutorial
9,News & Politics,Donald Trump Speaks at CPAC


## Titles3

In [24]:
Titles3 = ["Donald Trump gives White House Speech",
        "Ed Sheeran- Best Hits",
        "Basketball Highlights Warriors",
        "10 Places to Travel for your next Vacation",
        "How to put on makeup",
        "Why Science is cool",
        "Jake Paul Worst Songs",
        "Ariana Grande Thank You Next",
        "Car Tutorial",
        "Donald Trump Speaks at CPAC",
        "Drake - God’s Plan",
        "Roy Moore Speech"
         ]

In [25]:
Titles_counts3 = vector.transform(Titles3)
Predict = NB_Model.predict(Titles_counts3)
Predict

array([25, 10, 17, 24, 24, 24, 22, 24, 24, 25, 10, 25])

In [26]:
CategoryNamesList3 = []

for Category_ID in Predict:
    MatchingCategories = [x for x in CategoryDict if x["id"] == Category_ID]
    print(MatchingCategories)
    if MatchingCategories:
        CategoryNamesList3.append(MatchingCategories[0]["title"])
        
CategoryNamesList3

[{'id': 25, 'title': 'News & Politics'}]
[{'id': 10, 'title': 'Music'}]
[{'id': 17, 'title': 'Sports'}]
[{'id': 24, 'title': 'Entertainment'}]
[{'id': 24, 'title': 'Entertainment'}]
[{'id': 24, 'title': 'Entertainment'}]
[{'id': 22, 'title': 'People & Blogs'}]
[{'id': 24, 'title': 'Entertainment'}]
[{'id': 24, 'title': 'Entertainment'}]
[{'id': 25, 'title': 'News & Politics'}]
[{'id': 10, 'title': 'Music'}]
[{'id': 25, 'title': 'News & Politics'}]


['News & Politics',
 'Music',
 'Sports',
 'Entertainment',
 'Entertainment',
 'Entertainment',
 'People & Blogs',
 'Entertainment',
 'Entertainment',
 'News & Politics',
 'Music',
 'News & Politics']

In [27]:
TitleDataFrame = []
for i in range(0, len(Titles3)):
    TitleToCategories = {'Title': Titles3[i],  'Category': CategoryNamesList3[i]}
    TitleDataFrame.append(TitleToCategories)

In [30]:
PredictDF = pd.DataFrame(Predict)
TitleDF = pd.DataFrame(TitleDataFrame)
PreFinalDF = pd.concat([PredictDF, TitleDF], axis=1)
PreFinalDF.columns = (['Categ_ID', 'Hypothetical Video Title', 'Predicted Category'])
FinalDF = PreFinalDF.drop(['Categ_ID'],axis=1)
cols = FinalDF.columns.tolist()
cols = cols[-1:] + cols[:-1]
FinalDF= FinalDF[cols]

## Predictions for Titles3

In [31]:
FinalDF

Unnamed: 0,Predicted Category,Hypothetical Video Title
0,News & Politics,Donald Trump gives White House Speech
1,Music,Ed Sheeran- Best Hits
2,Sports,Basketball Highlights Warriors
3,Entertainment,10 Places to Travel for your next Vacation
4,Entertainment,How to put on makeup
5,Entertainment,Why Science is cool
6,People & Blogs,Jake Paul Worst Songs
7,Entertainment,Ariana Grande Thank You Next
8,Entertainment,Car Tutorial
9,News & Politics,Donald Trump Speaks at CPAC
