In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the labelled spreadsheet and keep only the rows whose `y` label is present.
features = pd.read_excel('raw_data_with_labels.xlsx').dropna(subset=['y'])
features.shape

(500, 5)

In [3]:
# Add `view_per_day` = view_count / tempo_desde_pub, rounded to 4 decimals,
# then drop the `tempo_desde_pub` column.
# NOTE(review): `tempo_desde_pub` is presumably the number of days since
# publication; a zero there would produce inf/NaN — confirm the input
# guarantees strictly positive values.
features = (
    features
    .assign(view_per_day=lambda df: df['view_count'].div(df['tempo_desde_pub']).round(4))
    .drop(columns=['tempo_desde_pub'])
)
features.head()

Unnamed: 0,title,y,upload_date,view_count,view_per_day
0,How Far is Too Far? | The Age of A.I.,0.0,2019-12-18,49218295,79002.0787
1,AlphaGo - The Movie | Full award-winning docum...,0.0,2020-03-13,26896993,50087.5102
2,Artificial intelligence and algorithms: pros a...,0.0,2019-09-26,6424768,9100.238
3,"#AndroidDevChallenge - Helpful innovation, pow...",0.0,2020-06-22,5779436,13255.5872
4,Become a DATA ANALYST with NO degree?!? The Go...,0.0,2021-03-17,2037151,12125.8988


In [18]:
# Show the labels of the columns whose dtype is int64.
features.dtypes.loc[lambda dtypes: dtypes == 'int64'].index

Index(['view_count'], dtype='object')

In [9]:
# Target vector: the label column.
y = features['y']
# Feature matrix: every column except the label and the upload date.
X = features.drop(columns=['y', 'upload_date'])

In [10]:
# Hold out half of the rows for validation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on `y` — consider `stratify=y`
# if the label distribution is skewed.
Xtrain, Xval, ytrain, yval = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)
tuple(part.shape for part in (Xtrain, Xval, ytrain, yval))

((250, 3), (250, 3), (250,), (250,))

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=2: ignore terms that appear in fewer than two of the fitted titles.
title_vec = TfidfVectorizer(min_df=2)

# The vectorizer is fitted on the training titles only; the validation titles
# are transformed with that already-fitted vocabulary.
title_train, title_val = Xtrain['title'], Xval['title']
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [18]:
title_bow_train.shape

(250, 238)

In [20]:
from scipy.sparse import hstack, vstack

# Replace the raw `title` text with its TF-IDF columns: the remaining columns
# come first, followed by the sparse title matrix.
Xtrain_wtitle = hstack([
    Xtrain.drop(columns=['title']),
    title_bow_train,
])
Xval_wtitle = hstack([
    Xval.drop(columns=['title']),
    title_bow_val,
])

In [26]:
# Display the shapes of the combined train and validation matrices.
tuple(matrix.shape for matrix in (Xtrain_wtitle, Xval_wtitle))

((250, 240), (250, 240))

In [28]:
# Random forest with 1000 trees. class_weight='balanced' weights classes
# inversely to their frequency in the training labels; the fixed seed makes
# the fit repeatable.
mdl = RandomForestClassifier(
    n_estimators=1000,
    class_weight='balanced',
    random_state=0,
    n_jobs=6,  # number of parallel jobs
)
mdl.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=6,
                       random_state=0)

In [30]:
# Validation scores: the second predict_proba column, i.e. the probability of
# mdl.classes_[1] (presumably the positive label — verify).
p = mdl.predict_proba(Xval_wtitle)[:, 1]

In [31]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [32]:
# Average precision (summary of the precision-recall curve) on the validation set.
average_precision_score(y_true=yval, y_score=p)

0.32909939825559575

In [33]:
# Area under the ROC curve on the validation set.
roc_auc_score(y_true=yval, y_score=p)

0.7491956241956242