# Modelo de Classificação (Youtube Video Dataset)
https://www.kaggle.com/datasets/rahulanand0070/youtubevideodataset

In [2]:
import pandas as pd
import numpy as np
import copy as cp
import matplotlib.pyplot as plt
import re

# NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# SKLearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn import metrics

## Preparação do Dataset

A partir do .csv informado, vamos preparar o dataset para os algoritmos. Nesse caso, vamos remover valores nulos e filtrar as colunas de interesse.

In [3]:
# Load the raw Kaggle export and preview the first rows.
caminho_csv = 'datasets/youtube_video_dataset.csv'
df = pd.read_csv(caminho_csv)
df.head()

Unnamed: 0,Title,Videourl,Category,Description
0,Madagascar Street Food!!! Super RARE Malagasy ...,/watch?v=EwBA1fOQ96c,Food,🎥GIANT ALIEN SNAIL IN JAPAN! » https://youtu.b...
1,42 Foods You Need To Eat Before You Die,/watch?v=0SPwwpruGIA,Food,This is the ultimate must-try food bucket list...
2,Gordon Ramsay’s Top 5 Indian Dishes,/watch?v=upfu5nQB2ks,Food,We found 5 of the best and most interesting In...
3,How To Use Chopsticks - In About A Minute 🍜,/watch?v=xFRzzSF_6gk,Food,You're most likely sitting in a restaurant wit...
4,Trying Indian Food 1st Time!,/watch?v=K79bXtaRwcM,Food,HELP SUPPORT SINSTV!! Shop Our Sponsors!\nLast...


In [4]:
# Report the shape of the raw frame before any filtering.
colunas = list(df.columns)
print("Colunas:", " ".join(colunas))
print("Número de Linhas:", df.shape[0])

# Keep the two text fields plus the label. "Description" is kept because it is
# tokenised together with "Title" further down. Selecting by name (instead of
# by position) does not depend on the column order of the CSV and keeps this
# cell re-runnable once the frame has already been reduced to three columns.
df = df[['Title', 'Category', 'Description']]
colunas = list(df.columns)
print("Colunas:", " ".join(colunas))

# Count ROWS that contain at least one null; summing isna() over every cell
# would over-count rows that have more than one missing field.
print("Número de Linhas com Valores Nulos:", df.isna().any(axis=1).sum())

# Drop every row with a missing value, then verify that none is left.
df = df.dropna()
print("Número de Linhas Após Remoção de Nulos:", df.shape[0])
print("Número de Linhas com Valores Nulos (Verificação):", df.isna().any(axis=1).sum())

Colunas: Title Videourl Category Description
Número de Linhas: 11211
Colunas: Title Category Description
Número de Linhas com Valores Nulos: 83
Número de Linhas Após Remoção de Nulos: 11128
Número de Linhas com Valores Nulos (Verificação): 0


### Novo dataset com as colunas novas e valores removidos:

In [5]:
# Display the frame after column filtering and null removal.
df

Unnamed: 0,Title,Category,Description
0,Madagascar Street Food!!! Super RARE Malagasy ...,Food,🎥GIANT ALIEN SNAIL IN JAPAN! » https://youtu.b...
1,42 Foods You Need To Eat Before You Die,Food,This is the ultimate must-try food bucket list...
2,Gordon Ramsay’s Top 5 Indian Dishes,Food,We found 5 of the best and most interesting In...
3,How To Use Chopsticks - In About A Minute 🍜,Food,You're most likely sitting in a restaurant wit...
4,Trying Indian Food 1st Time!,Food,HELP SUPPORT SINSTV!! Shop Our Sponsors!\nLast...
...,...,...,...
11206,"art journal | shimmer sprays, stencils, collag...",Art&Music,Step by step video on creating an art journal ...
11207,Ar-Tea Collage * Mixed Media Art,Art&Music,"By: Ilene McInnes,\nMixed media Art and inspir..."
11208,DIY Mixed Media Art Collage Greeting Cards / M...,Art&Music,Make your own Mixed Media Greeting Cards\n\nHe...
11209,Art Collage Process DecoJournal using Rice Pap...,Art&Music,Art Collage Process DecoJournal using Rice Pap...


Abaixo, verificamos quantas categorias existem e qual é a frequência de cada uma. Como é possível observar, a mais comum é a de Viagem, enquanto a menos comum é a de História. Em seguida, vamos mapear essas classes para uma representação numérica.

In [6]:
# Number of rows per category (value_counts sorts by descending frequency).
df['Category'].value_counts()

travel blog           2200
Science&Technology    2074
Food                  1828
manufacturing         1699
Art&Music             1682
History               1645
Name: Category, dtype: int64

In [7]:
# Categories ordered from most to least frequent; the position in this list
# becomes the integer code of the class.
lista_categorias = df['Category'].value_counts().index.to_list()
dict_mapeamento = {
    categoria: codigo for codigo, categoria in enumerate(lista_categorias)
}

# Replace the category names with their integer codes.
df['Category'] = df['Category'].map(dict_mapeamento)
df.head()

Unnamed: 0,Title,Category,Description
0,Madagascar Street Food!!! Super RARE Malagasy ...,2,🎥GIANT ALIEN SNAIL IN JAPAN! » https://youtu.b...
1,42 Foods You Need To Eat Before You Die,2,This is the ultimate must-try food bucket list...
2,Gordon Ramsay’s Top 5 Indian Dishes,2,We found 5 of the best and most interesting In...
3,How To Use Chopsticks - In About A Minute 🍜,2,You're most likely sitting in a restaurant wit...
4,Trying Indian Food 1st Time!,2,HELP SUPPORT SINSTV!! Shop Our Sponsors!\nLast...


## Criação do Vetor TFIDF

O vetor TFIDF (Term Frequency - Inverse Document Frequency) é baseado no modelo de Bag of Words e procura dar mais importância para as palavras mais raras que aparecem dentro do corpo de um documento. Ainda assim, leva também em consideração as palavras mais frequentes. Ele foi escolhido em vez do CountVectorizer, pois a acurácia dos modelos parecia aumentar levemente quando o TFIDF era usado.

In [8]:
def _limpa_texto(texto):
    """Lowercase `texto`, then replace every non-letter character with a space."""
    return re.sub("[^a-zA-Z]", " ", texto.lower())


# Apply the same normalisation to both free-text columns.
for coluna_texto in ('Title', 'Description'):
    df[coluna_texto] = df[coluna_texto].apply(_limpa_texto)

In [9]:
# Reference: https://datascience.stackexchange.com/questions/25004/text-classifier-with-multiple-bag-of-words

# Left disabled by the author; presumably needed once so that word_tokenize
# can find its tokenizer data - confirm for your NLTK install.
#import nltk
#nltk.download('punkt')

lista_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

# One document per row: the tokens of the title followed by the tokens of the
# description, with English stopwords removed and every remaining token
# stemmed, joined back into a single space-separated string.
lista_frases = [
    " ".join(
        ps.stem(palavra)
        for palavra in word_tokenize(titulo) + word_tokenize(descricao)
        if palavra not in lista_stopwords
    )
    for titulo, descricao in zip(df['Title'], df['Description'])
]

array_frases = np.array(lista_frases)

## Separação do Dataset entre Treinamento e Teste

Link explicando melhor sobre Cross Validation: https://machinelearningmastery.com/training-validation-test-split-and-cross-validation-done-right/

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Single seed for every random split in this cell, so a Restart & Run All
# reproduces the same partitions and every model is scored on the same folds.
SEED = 42

vectorizer = TfidfVectorizer()

# NOTE(review): the TF-IDF vocabulary and IDF weights are fitted on the whole
# corpus before the hold-out split and before cross-validation, so held-out
# documents influence the features. Wrapping the vectorizer and the model in a
# sklearn Pipeline (and cross-validating on the raw text) would avoid this.
X = vectorizer.fit_transform(array_frases)
print(X.shape)

Y = df['Category'].values

# Stratified hold-out split (33% test); used by the confusion-matrix cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, stratify=Y, test_size=0.33, shuffle=True, random_state=SEED
)

# 10-fold stratified CV shared by avalia_modelo, the grid search and the
# learning curve.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

(11128, 77254)


In [11]:
def avalia_modelo(modelo):
    """Cross-validate `modelo` and print its mean accuracy and F1 scores.

    The estimator is evaluated with `cross_validate` on the notebook-level
    `X` / `Y` using the shared `kfold` splitter. The mean over the folds of
    accuracy, macro F1 and weighted F1 is printed as a percentage.

    Parameters
    ----------
    modelo : estimator
        Unfitted scikit-learn compatible classifier; `cross_validate` fits a
        clone of it on each fold.

    Returns
    -------
    None
    """
    scoring = ["accuracy", "f1_macro", "f1_weighted"]
    # The fitted per-fold estimators are not requested: nothing here reads
    # them, and keeping them alive only costs memory.
    scores = cross_validate(modelo, X, Y, cv=kfold, scoring=scoring)
    relatorio = (
        ("Acurácia", "test_accuracy"),
        ("F1-Score Macro", "test_f1_macro"),
        ("F1-Score Weighted", "test_f1_weighted"),
    )
    for rotulo, chave in relatorio:
        print("%s: %0.2f%%" % (rotulo, scores[chave].mean() * 100))

## Regressão Logística

In [13]:
# References:
#   https://vitalflux.com/text-classification-bag-of-words-model-python-sklearn/
#   https://www.svm-tutorial.com/2014/10/svm-linear-kernel-good-text-classification/
import warnings
from sklearn.linear_model import LogisticRegression

# Silences every warning category from this point of the notebook onwards.
warnings.filterwarnings('ignore')

print("Regressão Logística:")
# Author's note: liblinear is the solver recommended for smaller datasets.
lr = LogisticRegression(solver='liblinear')
avalia_modelo(lr)

Regressão Logística:
Acurácia: 96.52%
F1-Score Macro: 96.56%
F1-Score Weighted: 96.52%


In [14]:
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings('ignore')

# Hyper-parameter grid: three solvers x five regularisation strengths, L2 only.
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = dict(solver=solvers,penalty=penalty,C=c_values)

# With the default iteration budget some candidates stop before converging
# (scikit-learn reports "TOTAL NO. of ITERATIONS REACHED LIMIT"), which makes
# the comparison between solvers unfair. A larger max_iter lets them converge;
# solver, penalty and C still come from the grid.
# NOTE(review): raise max_iter further if convergence warnings persist.
estimador_base = LogisticRegression(max_iter=1000)

grid_search = GridSearchCV(estimator=estimador_base, param_grid=grid, n_jobs=-1, cv=kfold, scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X, Y)
print("Melhor score: %f | Parâmetros = %s" % (grid_result.best_score_, grid_result.best_params_))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Melhor score: 0.970884 | Parâmetros = {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}


In [15]:
# Rebuild the logistic regression with the best parameters reported by the
# grid search, then score it with the same cross-validation routine.
print("Regressão Logística (Após tuning de hiperparâmetros):")
lr = LogisticRegression(penalty='l2', C=10, solver='liblinear')
avalia_modelo(lr)

Regressão Logística (Após tuning de hiperparâmetros):
Acurácia: 97.25%
F1-Score Macro: 97.27%
F1-Score Weighted: 97.25%


## Gradiente Descendente Estocástico

In [16]:
from sklearn.linear_model import SGDClassifier, Perceptron

# Linear model trained by stochastic gradient descent with the perceptron loss.
# A fixed random_state makes the scores reproducible between runs: the
# estimator uses it when shuffling the training data.
sgd = SGDClassifier(loss='perceptron', learning_rate='optimal', random_state=42)
print("SGD:")
avalia_modelo(sgd)

SGD:
Acurácia: 96.05%
F1-Score Macro: 96.06%
F1-Score Weighted: 96.05%


In [17]:
from sklearn.linear_model import Perceptron

# A fixed random_state makes the scores reproducible between runs: the
# estimator uses it when shuffling the training data.
perc = Perceptron(random_state=42)
print("Perceptron:")
avalia_modelo(perc)

Perceptron:
Acurácia: 96.22%
F1-Score Macro: 96.20%
F1-Score Weighted: 96.22%


## Árvore de Decisão

In [18]:
import multiprocessing 
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fitted tree, and therefore the reported
# scores, reproducible between runs.
dt = DecisionTreeClassifier(random_state=42)
print("Árvore de Decisão:")
avalia_modelo(dt)

Árvore de Decisão:
Acurácia: 91.84%
F1-Score Macro: 91.88%
F1-Score Weighted: 91.84%


## SVC (Support Vector Machine)

In [19]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default settings.
print("SVC:")
svc = SVC()
avalia_modelo(svc)

SVC:
Acurácia: 96.26%
F1-Score Macro: 96.28%
F1-Score Weighted: 96.27%


## Naive Bayes

In [20]:
# Author's note: the link below explains why "it" is not a good fit for TF-IDF
# features (presumably GaussianNB, judging by the URL - confirm):
# https://stackoverflow.com/questions/16240721/sklearn-gaussiannb-bad-results-nan-probabilities
from sklearn.naive_bayes import MultinomialNB

print("Multinomial Naive Bayes:")
nb = MultinomialNB()
avalia_modelo(nb)

Multinomial Naive Bayes:
Acurácia: 88.16%
F1-Score Macro: 88.20%
F1-Score Weighted: 88.24%


## Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the bootstrap samples and feature sub-sampling,
# and therefore the reported scores, reproducible between runs.
rf = RandomForestClassifier(random_state=42)
print("Random Forest:")
avalia_modelo(rf)

Random Forest:
Acurácia: 95.39%
F1-Score Macro: 95.41%
F1-Score Weighted: 95.39%


## K-Nearest Neighbour

In [12]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with scikit-learn's default settings.
print("KNN:")
knn = KNeighborsClassifier()
avalia_modelo(knn)

KNN:
Acurácia: 46.19%
F1-Score Macro: 44.21%
F1-Score Weighted: 44.67%


## Avaliação da Regressão Logística

In [None]:
# NOTE(review): this whole cell is disabled - every line is commented out.
# It sketches a confusion matrix built from out-of-fold predictions.
# Issues visible in the draft, to fix before re-enabling it:
#   - the last line passes `Y` (one label per row) as `sorted_labels`; the
#     unique class labels were presumably intended, e.g. np.unique(Y) - confirm;
#   - the bare `except:` hides any error raised by predict_proba, and its
#     fallback calls len(test_X), which may itself fail if X is a sparse
#     matrix - confirm;
#   - the helper reuses the name of sklearn.model_selection.cross_val_predict.

# import seaborn as sns
# from sklearn.metrics import confusion_matrix

# def cross_val_predict(model, kfold, X, y):
#     model_ = cp.deepcopy(model)
#     no_classes = len(np.unique(y))
#     actual_classes = np.empty([0], dtype=int)
#     predicted_classes = np.empty([0], dtype=int)
#     predicted_proba = np.empty([0, no_classes]) 
#     for train_ndx, test_ndx in kfold.split(X, y):
#         train_X, train_y, test_X, test_y = X[train_ndx], y[train_ndx], X[test_ndx], y[test_ndx]
#         actual_classes = np.append(actual_classes, test_y)
#         model_.fit(train_X, train_y)
#         predicted_classes = np.append(predicted_classes, model_.predict(test_X))
#         try:
#             predicted_proba = np.append(predicted_proba, model_.predict_proba(test_X), axis=0)
#         except:
#             predicted_proba = np.append(predicted_proba, np.zeros((len(test_X), no_classes), dtype=float), axis=0)

#     return actual_classes, predicted_classes, predicted_proba


# def plot_confusion_matrix(actual_classes, predicted_classes, sorted_labels):
#     matrix = confusion_matrix(actual_classes, predicted_classes, labels=sorted_labels)
#     plt.figure(figsize=(12.8,6))
#     sns.heatmap(matrix, annot=True, xticklabels=sorted_labels, yticklabels=sorted_labels, cmap="Blues", fmt="g")
#     plt.xlabel('Predicted'); plt.ylabel('Actual'); plt.title('Confusion Matrix')
#     plt.show()

# actual_classes, predicted_classes, _ = cross_val_predict(lr, kfold, X, Y)
# plot_confusion_matrix(actual_classes, predicted_classes, Y)

### Curva de Aprendizado

In [None]:
from sklearn.model_selection import learning_curve
import warnings 
warnings.filterwarnings('ignore')

# Learning curve of the logistic regression: train and cross-validation scores
# for increasing training-set sizes, using the shared `kfold` splitter.
# NOTE(review): per the scikit-learn docs, random_state is only used by
# learning_curve when shuffle=True, so it has no effect on this call.
tsize, training_score, test_score = learning_curve(lr, X, Y, cv=kfold, random_state=1000)

# Average over the CV folds (axis 1) to get one score per training-set size.
avg_tr_scores = np.mean(training_score, axis=1)
avg_test_scores = np.mean(test_score, axis=1)

# Labelled axes and a title so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(tsize, avg_tr_scores, label='Training Score')
ax.plot(tsize, avg_test_scores, label='CV Score')
ax.set_xlabel('Training set size')
ax.set_ylabel('Score')
ax.set_title('Learning curve - Logistic Regression')
ax.legend()
plt.show()

### Matriz de Confusão

Note que nesse caso a matriz de confusão abaixo não é feita em cima do Cross Validation utilizado para obter as métricas do modelo. Para isso é preciso ter uma nova função de predição em cima do CV. 

In [None]:
import matplotlib.pyplot as plt
# plot_confusion_matrix was deprecated and later removed from scikit-learn, so
# importing it fails on current releases; ConfusionMatrixDisplay replaces it.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def matriz_confusao(modelo, X_train, X_test, y_train, y_test):
    """Fit `modelo` on the training split and plot its test-set confusion matrix.

    Parameters
    ----------
    modelo : estimator
        scikit-learn compatible classifier; it is fitted in place.
    X_train, y_train
        Features and labels used to fit the model.
    X_test, y_test
        Held-out features and labels used to build the matrix.

    Returns
    -------
    None
        The figure is shown with `plt.show()`.
    """
    modelo.fit(X_train, y_train)
    # Fix the row/column order to the classes the model learned, so the tick
    # labels passed to the display always line up with the matrix.
    cm = confusion_matrix(y_test, modelo.predict(X_test), labels=modelo.classes_)
    ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=modelo.classes_).plot(cmap=plt.cm.Blues)
    plt.show()

matriz_confusao(lr, X_train, X_test, y_train, y_test)