In [1]:
#IMPORTAR DATASETS DE GOOGLE DRIVE

import pandas as pd
from google.colab import drive

drive.mount('/content/drive')
dataTrain = pd.read_csv('/content/drive/MyDrive/Datasets/conjuntoEntrenamiento.csv', sep=';')
dataTest = pd.read_csv('/content/drive/MyDrive/Datasets/conjuntoTest.csv', sep=';')

dataTrain

Mounted at /content/drive


Unnamed: 0,titular,clickbait
0,I Wore A Smart Ring For A Month And This Is Wh...,1
1,What Does Your Favorite Condiment Say About You,1
2,10 Reasons Were Psyched About The Renewal Of S...,1
3,Phelps Retains Mazda Sponsorship After Apology...,0
4,These Gorgeous Moving Portraits Feature Your F...,1
...,...,...
31663,Trump Signals America First,0
31664,The Coen brothers are planning their first TV ...,0
31665,TAO Group announces major L.A. expansion,0
31666,Taxi driver stabbed to death in Nova Scotia,0


In [2]:
#COUNT VECTORIZER

from sklearn.feature_extraction.text import CountVectorizer 
 
countVectorizer = CountVectorizer() 

dfTrain = pd.DataFrame(dataTrain, index = None)
dfTest = pd.DataFrame(dataTest, index = None)
dfTrain.head()

train_x, train_y = dataTrain['titular'], dataTrain['clickbait']
test_x, test_y = dataTest['titular'], dataTest['clickbait']

train_x_vector = countVectorizer.fit_transform(train_x.values.astype('U'))
test_x_vector = countVectorizer.transform(test_x.values.astype('U'))

In [None]:
#TFIDF VECTORIZER

from sklearn.feature_extraction.text import TfidfVectorizer

train_x, train_y = dataTrain['titular'], dataTrain['clickbait']
test_x, test_y = dataTest['titular'], dataTest['clickbait']

tfidfVectorizer = TfidfVectorizer(stop_words='english')

train_x_vector = tfidfVectorizer.fit_transform(train_x.values.astype('U'))
test_x_vector = tfidfVectorizer.transform(test_x.values.astype('U'))

In [None]:
#DICT VECTORIZER

from sklearn.feature_extraction import DictVectorizer

import numpy as np
import csv
from sklearn.metrics import roc_auc_score

def read_dataTrain(n, offset=0):
    X_dict, y = [], []
    with open('/content/drive/MyDrive/Datasets/conjuntoEntrenamiento.csv', 'r') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=';')
        for i in range(offset):
            next(reader)
        i = 0
        for row in reader:
            i += 1
            y.append(int(row['clickbait']))
            del row['clickbait']
            X_dict.append(row)
            if i >= n:
                break
    return X_dict, y

def read_dataTest(n, offset=0):
    X_dict, y = [], []
    with open('/content/drive/MyDrive/Datasets/conjuntoTest.csv', 'r') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=';')
        for i in range(offset):
            next(reader)
        i = 0
        for row in reader:
            i += 1
            y.append(int(row['clickbait']))
            del row['clickbait']
            X_dict.append(row)
            if i >= n:
                break
    return X_dict, y

nTrain = 21000
nTest = 7000
X_dict_train, y_train = read_dataTrain(nTrain)
X_dict_test, y_test = read_dataTest(nTest)

dtrain = dataTrain.iloc[:nTrain]
dtest = dataTrain.iloc[:nTest]

train_x, train_y = dtrain['titular'], dtrain['clickbait']
test_x, test_y = dtest['titular'], dtest['clickbait']

dictVectorizer = DictVectorizer(sparse=False)
train_x_vector = dictVectorizer.fit_transform(X_dict_train)
test_x_vector = dictVectorizer.transform(X_dict_test)

In [None]:
#SUPPORT VECTOR MACHINE

from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

svc = SVC(kernel='linear',probability=True)
svc.fit(train_x_vector, train_y)

score = svc.score(test_x_vector, test_y)
pred = svc.predict(test_x_vector)
pred_proba = svc.predict_proba(test_x_vector)
report = classification_report(test_y, pred, labels=[0,1])

print("Score: ", score)
print("F1 Score: ", f1_score(test_y, pred,labels=[0,1], average=None))
print("Report: ")
print(classification_report(test_y, pred, labels=[0,1]))

Score:  0.7115317818687044
F1 Score:  [0.70131271 0.72107473]
Report: 
              precision    recall  f1-score   support

           0       0.73      0.68      0.70      2879
           1       0.70      0.75      0.72      2879

    accuracy                           0.71      5758
   macro avg       0.71      0.71      0.71      5758
weighted avg       0.71      0.71      0.71      5758



In [3]:
#DECISION TREE

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

dec_tree = DecisionTreeClassifier()
dec_tree.fit(train_x_vector, train_y)

score = dec_tree.score(test_x_vector, test_y)
pred = dec_tree.predict(test_x_vector)
pred_proba = dec_tree.predict_proba(test_x_vector)
report = classification_report(test_y, pred, labels=[0,1])

print("Score: ", score)
print("F1 Score: ", f1_score(test_y, pred,labels=[0,1], average=None))
print("Report: ")
print(classification_report(test_y, pred, labels=[0,1]))

Score:  0.7921758075210761
F1 Score:  [0.73040059 0.83091862]
Report: 
              precision    recall  f1-score   support

           0       0.75      0.71      0.73      4159
           1       0.82      0.84      0.83      6398

    accuracy                           0.79     10557
   macro avg       0.78      0.78      0.78     10557
weighted avg       0.79      0.79      0.79     10557



In [None]:
#NAIVE BAYES

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

gnb = GaussianNB()

#For Dict Vectorizer:

#gnb.fit(train_x_vector, train_y)
#score = gnb.score(test_x_vector, test_y)
#pred = gnb.predict(test_x_vector)
#pred_proba = gnb.predict_proba(test_x_vector)
#report = classification_report(test_y, pred, labels=[0,1])

#For Count Vectorizer or Tf-idf:

gnb.fit(train_x_vector.toarray(), train_y)
score = gnb.score(test_x_vector.toarray(), test_y)
pred = gnb.predict(test_x_vector.toarray())
pred_proba = gnb.predict_proba(test_x_vector.toarray())
report = classification_report(test_y, pred, labels=[0,1])

print("Score: ", score)
print("F1 Score: ", f1_score(test_y, pred,labels=[0,1], average=None))
print("Report: ")
print(classification_report(test_y, pred, labels=[0,1]))

Score:  0.6351163598471692
F1 Score:  [0.55001071 0.69315028]
Report: 
              precision    recall  f1-score   support

           0       0.72      0.45      0.55      2879
           1       0.60      0.82      0.69      2879

    accuracy                           0.64      5758
   macro avg       0.66      0.64      0.62      5758
weighted avg       0.66      0.64      0.62      5758



In [4]:
#LOGISTIC REGRESSION

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

log_reg = LogisticRegression()
log_reg.fit(train_x_vector, train_y)

score = log_reg.score(test_x_vector, test_y)
pred = log_reg.predict(test_x_vector)
pred_proba = log_reg.predict_proba(test_x_vector)
report = classification_report(test_y, pred, labels=[0,1])

print("Score: ", score)
print("F1 Score: ", f1_score(test_y, pred,labels=[0,1], average=None))
print("Report: ")
print(classification_report(test_y, pred, labels=[0,1]))

Score:  0.8482523444160273
F1 Score:  [0.80553532 0.87558248]
Report: 
              precision    recall  f1-score   support

           0       0.81      0.80      0.81      4159
           1       0.87      0.88      0.88      6398

    accuracy                           0.85     10557
   macro avg       0.84      0.84      0.84     10557
weighted avg       0.85      0.85      0.85     10557



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [5]:
#Libreria Anvil Uplink

!pip install anvil-uplink

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting anvil-uplink
  Downloading anvil_uplink-0.4.0-py2.py3-none-any.whl (88 kB)
[K     |████████████████████████████████| 88 kB 8.0 MB/s 
[?25hCollecting ws4py
  Downloading ws4py-0.5.1.tar.gz (51 kB)
[K     |████████████████████████████████| 51 kB 235 kB/s 
Collecting argparse
  Downloading argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Building wheels for collected packages: ws4py
  Building wheel for ws4py (setup.py) ... [?25l[?25hdone
  Created wheel for ws4py: filename=ws4py-0.5.1-py3-none-any.whl size=45229 sha256=32d8f0d818c8cd343dc4a1cf93c6a4e651a9bb217f64bd6ad3928232dee6274c
  Stored in directory: /root/.cache/pip/wheels/29/ea/7d/3410aa0aa0e4402ead9a7a97ab2214804887e0f5c2b76f0c96
Successfully built ws4py
Installing collected packages: ws4py, argparse, anvil-uplink
Successfully installed anvil-uplink-0.4.0 argparse-1.4.0 ws4py-0.5.1


In [6]:
#Servicio web

import anvil.server

anvil.server.connect("LA6NSQZCSO5X27CWO6XZA6RH-YRUJISVSLLU7OHCF")

@anvil.server.callable
def check(headline):

  vector = countVectorizer.transform([headline])
  pred = log_reg.predict(vector)
  proba = log_reg.predict_proba(vector)
  
  #logs
  print("Headline: %s" % headline)
  print("Clickbait: %s" % pred[0])
  print("Proba: %s" % proba)
  print()

  return pred[0], proba
  

anvil.server.wait_forever()

Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default environment" as SERVER
Headline: Here’s The Original Meaning Of 8 Phrases You Probably Use All The Time
Clickbait: 1
Proba: [[0.00375713 0.99624287]]

Headline: A 20-Year-Old Mother Pushing A Baby Stroller Was Shot In The Head And Killed In NYC
Clickbait: 0
Proba: [[0.710272 0.289728]]

Headline: I Have Monkeypox — Here’s What It’s Like And What You Should Know
Clickbait: 1
Proba: [[0.00463685 0.99536315]]

Headline: This Mom’s Heart Was Broken After Her Son’s Classmates Wouldn’t Sign His Yearbook, So People Are Coming Together To Brighten This 12-Year-Old’s Day
Clickbait: 1
Proba: [[0.00622118 0.99377882]]

Headline: This Mom’s Heart Was Broken After Her Son’s Classmates Wouldn’t Sign His Yearbook, So People Are Coming Together To Brighten This 12-Year-Old’s Day
Clickbait: 1
Proba: [[0.00622118 0.99377882]]

Headline: Court Decision Leaves Biden With Few Tools to Combat Climate Change
Clickbait: 0
Proba:

KeyboardInterrupt: ignored