#### Import Library

In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
import sklearn.metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.layers import Input, Flatten, Dense, Conv1D, MaxPool1D, Dropout
from keras.utils import plot_model
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score

#### Import Dataset

In [10]:
df = pd.read_csv('(A) Data/PreProcessed_Final Result_News Content_400 Data.csv', usecols=['Detokenized', 'Labelling'], engine='python')
df.head()

Unnamed: 0,Labelling,Detokenized
0,1.0,marak tren bugar yoga barre strength training ...
1,1.0,tenis putri indonesia janice tjen tampil turna...
2,-1.0,pasu dukung cepat rapid support forces rsf sor...
3,1.0,pt pertamina persero program tanggung sosial l...
4,0.0,badan meteorologi klimatologi geofisika bmkg i...


In [11]:
df = df.dropna()

In [12]:
x = df['Detokenized'].copy()
y = df['Labelling'].copy()

#### Feature Extraction Using TF-IDF

In [13]:
%%time
# define tf-idf
tf_idf = TfidfVectorizer(use_idf=True, smooth_idf=False, ngram_range=(1,1), max_features=1000)

# create dataframe using tf-idf
x_uni = pd.DataFrame(tf_idf.fit_transform(x).toarray(), columns=tf_idf.get_feature_names_out())
y_uni = y

x_uni

CPU times: total: 31.2 ms
Wall time: 25 ms


Unnamed: 0,abar,abdul,acara,aceh,acung,adat,adil,adit,adu,advertisement,...,wujud,xii,ya,yesus,yogyakarta,yosodiningrat,yout,youtube,yudhi,zona
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.103229,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.152011,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.212644,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410,0.0,0.205193,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
411,0.0,0.000000,0.0,0.0,0.0,0.0,0.147516,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
412,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
413,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


#### Data Splt ( 70% Train, 30% Test)

In [14]:
# splitting Data to 70:30
X_train, X_test, y_train, y_test = model_selection.train_test_split(x_uni, y_uni, test_size = 0.3, random_state=0)

# reshape the train and test data
X_train = np.array(X_train).reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = np.array(X_test).reshape(X_test.shape[0], 1, X_test.shape[1])

#### CNN Modelling

In [15]:
%%time
# define the CNN Model
cnn_model = tf.keras.models.Sequential() # create sequential model 
cnn_model.add(Conv1D(filters=32, kernel_size=3, padding='same',  activation=tf.keras.layers.LeakyReLU(alpha=0.001), input_shape = (1, X_train.shape[2]))) # first cnn layer with 32 filters, conv window 3, and Leaky relu activation 
cnn_model.add(MaxPool1D(pool_size=3, padding='same')) # second cnn layer using max pooling with pool size 3
cnn_model.add(Dropout(0.2)) # add dropout layer 0.2
cnn_model.add(Flatten()) # flatten the output
cnn_model.add(Dense(units = 1, activation='sigmoid')) # dense as last layer with units=1 and activation sigmoid

# compile the model
cnn_model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy']) 
cnn_model.summary()

# plot model architecture
plot_model(cnn_model, to_file='cnn_model.png', show_shapes=True, show_layer_names=True)

# fit the model on the training data
cnn_model_history = cnn_model.fit(X_train, y_train, epochs=5, batch_size = 64, validation_data = (X_test, y_test))

# evaluate the model and predict
score = cnn_model.evaluate(X_test, y_test, batch_size=64, verbose=0)

print()
print('Validation Accuracy:', score[1])
print('Validation Loss:', score[0])
print()

y_pred = cnn_model.predict(X_test)
y_true = y_test

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 1, 32)             96032     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 1, 32)            0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 1, 32)             0         
                                                                 
 flatten (Flatten)           (None, 32)                0         
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 96,065
Trainable params: 96,065
Non-trainable params: 0
____________________________________________________

In [16]:
# Classification Report
classreport = classification_report(y_true, y_pred.round(), digits=4)
accscore = accuracy_score(y_true, y_pred.round())
precscore = precision_score(y_true, y_pred.round())
recscore = recall_score(y_true, y_pred.round())
f1score = f1_score(y_true, y_pred.round(), average='weighted')
print('Classification Report :')
print(classreport)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].