In [None]:
# import the preprocessing functions 
import preprocess

In [None]:
# imports 

import pandas as pd 
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics


import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

In [None]:
# import ads dataset 
# the dataset contains commercial ads tagged as 0 and informational ads tagged as 1
file_url='./data/task1_dataset'
df=preprocess.get_data(file=file_url,columns_to_drop=['Unnamed: 0'],drop=True)
df.head()

In [None]:
# Perform data pre-processing: pass the raw 'text' column to
# preprocess.preprocess_data and store the result in a new 'preprocessed'
# column. The actual cleaning steps are defined in the preprocess module.
# NOTE: this adds the column to df in place, so df differs from the frame
# displayed by the loading cell once this cell has run.
df['preprocessed']=preprocess.preprocess_data(df['text'])
# Preview the frame including the new column.
df.head()

In [None]:
# Split the preprocessed texts and their labels into a training part and a
# test part. The intended ratio is 80% training / 20% test; the ratio itself
# is decided inside preprocess.eval_train_split, not in this cell.
# NOTE(review): no seed is passed here -- confirm whether the helper makes
# the split reproducible.
train, test = preprocess.eval_train_split(
    data=df['preprocessed'],
    labels=df['label'],
    validation=False,
)

# Each split is indexed positionally: [0] is used as the model input and
# [1] as the target (presumably texts and labels -- confirm against the helper).
X_train, y_train = train[0], train[1]
X_test, y_test = test[0], test[1]


**Count vectorizer**
---



In [None]:
# Create a CountVectorizer with its default settings.
# (Despite the "tf" in the variable names, these features are raw token counts.)
tf_vectorizer = CountVectorizer() 
# Learn the vocabulary from the training texts only and turn them into a
# document-term count matrix.
X_train_tf = tf_vectorizer.fit_transform(X_train)

# The matrix shape is (number of training documents, vocabulary size).
print("n_samples: %d, n_features: %d" % X_train_tf.shape)


In [None]:
# Convert the test texts to counts using the vocabulary already fitted on the
# training texts: transform() is used here, not fit_transform(), so the
# vectorizer is not re-fitted on test data.
X_test_tf = tf_vectorizer.transform(X_test)
# n_features should equal the value printed for the training matrix.
print("n_samples: %d, n_features: %d" % X_test_tf.shape)

In [None]:
# Train a multinomial naive Bayes classifier (default hyper-parameters) on the
# count features of the training set.
naive_bayes_classifier = MultinomialNB()
# fit() is the last expression of the cell, so the fitted estimator is also
# shown as the cell output.
naive_bayes_classifier.fit(X_train_tf,y_train)


In [None]:
# Predict the class of every test document from its count features.
y_pred = naive_bayes_classifier.predict(X_test_tf)


In [None]:
# Compute the performance measures of the count-vectorizer model on the test set.
score1 = metrics.accuracy_score(y_test, y_pred)
print("accuracy:   %0.3f" % score1)

# classification_report assigns target_names to the labels in sorted order
# (0 first, then 1). The dataset is documented as 0 = commercial and
# 1 = informational, so the display names must be ['Com', 'Inf']; the earlier
# ['Inf', 'Com'] order attached each class name to the other class's row.
print(metrics.classification_report(y_test, y_pred,
                                    target_names=['Com', 'Inf']))

print("confusion matrix:")
# For two classes, ravel() flattens the 2x2 matrix row by row into
# (tn, fp, fn, tp), where "positive" is label 1.
tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred).ravel()
(tn, fp, fn, tp)




**TF-IDF vectorizer**
---




In [None]:
# Create a TF-IDF (term frequency - inverse document frequency) vectorizer
# with its default settings.
tf_idf_vectorizer = TfidfVectorizer() 

# Learn the vocabulary and IDF weights from the training texts only and turn
# them into a TF-IDF weighted document-term matrix.
X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)

# The matrix shape is (number of training documents, vocabulary size).
print("n_samples: %d, n_features: %d" % X_train_tf_idf.shape)

In [None]:
# Convert the test texts using the vocabulary and IDF weights already fitted
# on the training texts: transform() is used here, not fit_transform().
X_test_tf_idf = tf_idf_vectorizer.transform(X_test)

# n_features should equal the value printed for the training matrix.
print("n_samples: %d, n_features: %d" % X_test_tf_idf.shape)

In [None]:
# Train a fresh multinomial naive Bayes classifier on the TF-IDF features.
# NOTE: this rebinds naive_bayes_classifier, so the model trained on the count
# features is no longer reachable after this cell runs; re-running the earlier
# count-based prediction cell afterwards would use this TF-IDF model instead.
naive_bayes_classifier = MultinomialNB()
# fit() is the last expression of the cell, so the fitted estimator is also
# shown as the cell output.
naive_bayes_classifier.fit(X_train_tf_idf,y_train)


In [None]:
# Predict the class of every test document from its TF-IDF features.
y_pred_idf = naive_bayes_classifier.predict(X_test_tf_idf)


In [None]:
# Compute the performance measures of the TF-IDF model on the test set.
score2 = metrics.accuracy_score(y_test, y_pred_idf)
print("accuracy:   %0.3f" % score2)

# classification_report assigns target_names to the labels in sorted order
# (0 first, then 1). The dataset is documented as 0 = commercial and
# 1 = informational, so the display names must be ['Com', 'Inf']; the earlier
# ['Inf', 'Com'] order attached each class name to the other class's row.
print(metrics.classification_report(y_test, y_pred_idf,
                                    target_names=['Com', 'Inf']))

print("confusion matrix:")
# For two classes, ravel() flattens the 2x2 matrix row by row into
# (tn, fp, fn, tp), where "positive" is label 1.
tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred_idf).ravel()
(tn, fp, fn, tp)
