In [None]:
import re
import math
import numpy as np
import pandas as pd

# PREPROCESSING

In [None]:
from stop_word import stop_words
import nltk
from nltk.stem import PorterStemmer
import re
nltk.download('punkt') # Download the data NLTK needs (run once per installation)

In [None]:
df = pd.read_csv("./dataset/Dataset2.csv") # Load the dataset

In [None]:
# Display the loaded DataFrame as the cell output.
df

In [None]:
df.dropna(inplace=True) # Drop every row that contains a NaN value (mutates df in place)

In [None]:
# Build a new 'Full' column by joining 'Headline' and 'Body' with a single space.
# `.loc[0:2000]` is a label-based, inclusive slice, so only rows whose index label
# lies in 0..2000 get a value; the assignment aligns on the index and leaves NaN
# in 'Full' for every other row.
df['Full'] = df['Headline'].loc[0:2000] + ' ' + df['Body'].loc[0:2000] # Combined headline + body text

In [None]:
# This bare expression is not the last statement of the cell, so it displays nothing.
df['Full']
# Drop every row that contains a NaN value; this also removes the rows whose
# 'Full' value is NaN.
df.dropna(inplace=True) # Mutates df in place

#### PREPROCESSING FUNCTION

In [None]:
def stemming(word):
    """Return the Porter stem of a single token.

    The stemmer is created on the first call and cached on the function
    object, so stemming a whole corpus token by token does not rebuild a
    PorterStemmer for every word.

    Parameters
    ----------
    word : str
        The token to stem.

    Returns
    -------
    str
        The value returned by ``PorterStemmer.stem`` for ``word``.
    """
    stemmer = getattr(stemming, "_stemmer", None)
    if stemmer is None:
        # Lazily build the shared instance; kept on the function so the cell
        # stays self-contained and needs no extra module-level name.
        stemmer = stemming._stemmer = PorterStemmer()
    return stemmer.stem(word)

In [None]:
def stopword_removal_and_filtering(words):
    """Remove stop words, strip dots and digits, and de-duplicate tokens.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document.

    Returns
    -------
    set of str
        The distinct cleaned tokens. Tokens keep their original casing; the
        lower-casing is applied only for the stop-word comparison.
    """
    unique_tokens = set()
    for word in words:
        # Stop-word check is case-insensitive.
        if word.lower() in stop_words:
            continue

        # Strip dots (simple abbreviations such as "U.S.") and digits.
        token = re.sub(r'[.\d]', '', word)

        # A token made only of dots/digits collapses to '' — skip it so an
        # empty string never becomes a token (and later a feature).
        if token:
            unique_tokens.add(token)

    return unique_tokens

In [None]:
def preprocessing(text):
    """Clean one raw document and return it as a space-separated token string.

    Steps: replace punctuation (everything except word characters,
    whitespace and '-') with spaces, split on whitespace, remove stop words
    and dots/digits via ``stopword_removal_and_filtering``, stem every
    remaining token with ``stemming``, and join the result.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The stemmed tokens in sorted order, joined by single spaces.
    """
    # Remove punctuation.
    text = re.sub(r'[^\w\s-]', ' ', text)

    # Tokenize on whitespace.
    words = text.split()

    # Stop-word removal and filtering (returns an unordered set).
    filtered_words = stopword_removal_and_filtering(words)

    # Stem every token and drop empty results. Sorting makes the output
    # independent of set iteration order, which differs between kernel
    # sessions. The previous `[1:]` slice is gone: applied to an unordered
    # collection it discarded an arbitrary token from every document.
    stemmed_words = sorted(stem for stem in map(stemming, filtered_words) if stem)

    return ' '.join(stemmed_words)


In [None]:
X_dataset_fullset = df['Full'].apply(preprocessing) # Run preprocessing on every document

In [None]:
X_dataset_fullset = X_dataset_fullset.values.flatten() # Take the underlying array as a one-dimensional array

In [None]:
X_dataset_fullset = X_dataset_fullset.tolist() # Convert the array to a plain Python list

In [None]:
X_string_fullset = ' '.join(X_dataset_fullset) # Join all documents into one string

In [None]:
# Build the feature vocabulary: every distinct token of the corpus.
# - `split()` (no argument) never yields '' for repeated or leading spaces,
#   so an empty string cannot become a feature.
# - `sorted(...)` fixes the feature order. Plain set iteration order changes
#   between kernel sessions, and this order is what later labels the rows of
#   the cached tf.csv, so it must be reproducible.
# NOTE(review): a tf.csv written with a different feature order or vocabulary
# must be regenerated to match.
X_feature_fullset = sorted(set(X_string_fullset.split()))

In [None]:
X_feature_fullset = pd.Series(data=X_feature_fullset) # Convert the list into a pandas Series

## PERHITUNGAN BOBOT FITUR (TF-IDF)

In [None]:
import re
import math
import numpy as np
import pandas as pd

In [None]:
# Aliases used by the term-frequency computation.
docs = X_dataset_fullset  # list of preprocessed document strings
processed_unique_words = X_feature_fullset  # Series of distinct feature tokens

In [None]:
# Allocate a zero matrix with one row per feature and one column per document.
# The loop that fills it is commented out in the next cells, and the name `tf`
# is rebound later in this notebook by `import tensorflow as tf`.
tf = np.zeros((len(processed_unique_words), len(docs)))

### Menghitung Term Frequency

In [None]:
# for i in range(len(docs)):
#     for x in range(len(processed_unique_words)):
#         count = docs[i].count(processed_unique_words[x])
#         tf[x][i] = 0 if count == 0 else 1 + math.log(1 + count)

In [None]:
# df_tf = pd.DataFrame(tf)

In [None]:
# df_tf.to_csv('tf.csv',index=False) # Menyimpan hasil perhitungan term frequency

## Data Preparation (START RUNNING DARI SINI)

### Load Feature

In [None]:
# Load the cached term-frequency matrix from disk.
tfdf = pd.read_csv("./tf.csv")
# Label each row with its feature token.
# NOTE(review): assumes the rows of tf.csv are in the same order as the current
# `processed_unique_words` — confirm the file was produced from this exact vocabulary.
tfdf.index = processed_unique_words
# Transpose so that rows are documents and columns are features.
tfdf = tfdf.transpose()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# Scaler used below to normalise the feature matrix.
scaler = MinMaxScaler()

In [None]:
# Data splitting: hold out 10% of the documents as the test set.
X_train, X_test, y_train, y_test = train_test_split(tfdf, df['Label'], test_size=0.10, random_state=42)

# Data normalisation.
# The scaler is fitted on the training split only and then reused, unchanged,
# on the test split. Calling fit_transform on X_test would refit the scaler on
# test data, scaling the two splits differently and leaking test-set
# statistics into the evaluation.
norm_X_train = scaler.fit_transform(X_train)
norm_X_test = scaler.transform(X_test)

## Create Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow as tf

### Arsitektur 1

In [None]:
# Architecture 1: 512 -> 256 -> 128 fully connected ReLU layers with dropout
# after the two hidden layers, and a single sigmoid unit for the binary
# (fake or not) output.
model = Sequential([
    Dense(512, input_shape=(norm_X_train.shape[1],), activation='relu'),
    Dense(256, activation='relu'),
    Dropout(0.5),  # regularisation
    Dense(128, activation='relu'),
    Dropout(0.5),  # regularisation
    Dense(1, activation='sigmoid'),
])

# Compile and train; 10% of the training data is held out for validation.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(
    norm_X_train,
    y_train,
    validation_split=0.1,
    batch_size=256,
    epochs=100,
)

### Arsitektur 2

In [None]:
# Architecture 2: 512 -> 256 -> 128 -> 64 fully connected ReLU layers, with
# dropout (rate 0.6) after the 512-unit and 128-unit layers, and a single
# sigmoid output unit.
model = Sequential([
    Dense(512, input_shape=(norm_X_train.shape[1],), activation='relu'),
    Dropout(0.6),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dropout(0.6),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Compile and train; 10% of the training data is held out for validation.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(
    norm_X_train,
    y_train,
    validation_split=0.1,
    batch_size=256,
    epochs=100,
)

### Arsitektur 3

In [None]:
# Architecture 3: 512 -> 256 -> 128 -> 64 -> 32 fully connected ReLU layers,
# each followed by dropout (rate 0.5), and a single sigmoid output unit.
model = Sequential([
    Dense(512, input_shape=(norm_X_train.shape[1],), activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# Compile and train; 10% of the training data is held out for validation.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(
    norm_X_train,
    y_train,
    validation_split=0.1,
    batch_size=256,
    epochs=100,
)

### ARSITEKTUR 4

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Architecture 4: Embedding -> two stacked LSTM layers -> two dense ReLU layers
# with dropout -> single sigmoid output unit.
model = Sequential()

# Embedding layer
# NOTE(review): this model is trained on norm_X_train, the MinMax-scaled
# feature matrix. Embedding layers normally expect integer token indices below
# input_dim (2000 here) — confirm this architecture is intended for these inputs.
model.add(Embedding(input_dim=2000, output_dim=128, input_length=norm_X_train.shape[1],))

# LSTM layer
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(32))

# Dense layers
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))

# Output layer
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train; 10% of the training data is held out for validation.
history = model.fit(norm_X_train, y_train, epochs=100, batch_size=256, validation_split=0.1)

### Arsitektur 5

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

# Architecture 5: 512 -> 256 -> 128 -> 64 fully connected ReLU layers, each
# followed by batch normalisation and dropout (rate 0.5), and a single sigmoid
# output unit.
model = Sequential([
    Dense(512, input_shape=(norm_X_train.shape[1],), activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# Compile and train; 10% of the training data is held out for validation.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(
    norm_X_train,
    y_train,
    validation_split=0.1,
    batch_size=256,
    epochs=100,
)

### Arsitektur 6

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU

# Architecture 6: a 1024-unit ReLU input layer followed by 512 -> 256 -> 128
# -> 64 linear layers with LeakyReLU activations; every stage is followed by
# batch normalisation and dropout (rate 0.5). Single sigmoid output unit.
model = Sequential([
    Dense(1024, input_shape=(norm_X_train.shape[1],), activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(512),
    LeakyReLU(alpha=0.2),
    BatchNormalization(),
    Dropout(0.5),
    Dense(256),
    LeakyReLU(alpha=0.2),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128),
    LeakyReLU(alpha=0.2),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64),
    LeakyReLU(alpha=0.2),
    BatchNormalization(),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# Compile and train; 10% of the training data is held out for validation.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(
    norm_X_train,
    y_train,
    validation_split=0.1,
    batch_size=256,
    epochs=100,
)

In [None]:
# Calls fit again on whichever `model` is currently bound, overwriting `history`.
# NOTE(review): this repeats the fit call that already ends each architecture
# cell — confirm the second training run is intended.
history = model.fit(norm_X_train, y_train, epochs=100, batch_size=256, validation_split=0.1)

In [None]:
# Evaluate the current `model` on the normalised test split.
loss, accuracy = model.evaluate(norm_X_test, y_test)

In [None]:
# Save the current `model` to my_model.h5 in the working directory.
model.save("my_model.h5")

### Model Jurnal Referensi

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Baseline classifiers from the reference journal.
decision_tree = DecisionTreeClassifier(criterion='gini', splitter='best', min_samples_split=2,
                                       min_samples_leaf=1, max_features=None, random_state=42)
# max_features='sqrt' replaces 'auto': for classifiers 'auto' meant sqrt(n_features),
# and the 'auto' value is rejected by scikit-learn 1.3 and later.
random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', max_features='sqrt', random_state=42)
svm_model = SVC(C=1.0, kernel='poly', degree=1, coef0=1.0, gamma=1.0)
gradient_boosted_trees = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

In [None]:
# Fit every reference classifier on the normalised training split.
decision_tree.fit(norm_X_train, y_train)
random_forest.fit(norm_X_train, y_train)
svm_model.fit(norm_X_train, y_train)
gradient_boosted_trees.fit(norm_X_train, y_train)

### Model Testing

In [None]:
def TFIDF(processed_unique_words,docs):
    """Compute a log-scaled term-frequency matrix.

    For each (term, document) pair the raw count is ``document.count(term)``,
    i.e. the number of non-overlapping substring occurrences. The weight is
    0 when the count is 0 and ``1 + log(1 + count)`` otherwise. No inverse
    document frequency factor is applied.

    Parameters
    ----------
    processed_unique_words : indexable sequence of str
        The feature terms; accessed by integer position.
    docs : indexable sequence of str
        The documents; accessed by integer position.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(processed_unique_words), len(docs))``.
    """
    n_terms = len(processed_unique_words)
    n_docs = len(docs)
    weights = np.zeros((n_terms, n_docs))

    for term_idx in range(n_terms):
        term = processed_unique_words[term_idx]
        for doc_idx in range(n_docs):
            occurrences = docs[doc_idx].count(term)
            # Cells with no occurrence keep their initial value of zero.
            if occurrences != 0:
                weights[term_idx][doc_idx] = 1 + math.log(1 + occurrences)

    return weights

In [None]:
# Contoh data teks (dokumen)
# Example text data (documents) used to try the trained model.
documents = [
    "COVID-19 Cure Found! Scientists have discovered a cure for COVID-19",
    "Breaking: UFO Sightings on the Rise! Multiple reports of UFO sightings around the world",
    "New Study: Chocolate is Healthy! Eating chocolate has been proven to improve health",
    "Government Denies Alien Contact Officials deny any contact with extraterrestrial beings",
    "Fake News Alert! Rumors of a zombie apocalypse are false, authorities confirm."
]

# Inference inputs must go through the same pipeline as the training data:
# 1. the same text preprocessing (punctuation/stop-word removal, stemming),
#    otherwise raw tokens do not match the preprocessed feature vocabulary;
# 2. the same term-frequency weighting over the training features;
# 3. the scaler fitted earlier in the notebook (transform only, never refit).
processed_documents = [preprocessing(document) for document in documents]
sample_tf = TFIDF(tfdf.columns, processed_documents).transpose()  # rows = documents

# Wrapped in a DataFrame with the training column names so the scaler sees
# the same feature labels it was fitted with.
# NOTE: the result is bound to `tf` because the prediction cell reads that
# name; this shadows the `import tensorflow as tf` alias.
tf = scaler.transform(pd.DataFrame(sample_tf, columns=tfdf.columns))

In [None]:
# Predict with the current `model`; its final layer is a single sigmoid unit,
# so each prediction is one probability.
predictions = model.predict(tf)

# Threshold the probability at 0.5 to get the class label. The previous
# `prediction.argmax()` on a one-element vector was always 0, so every
# document received the label of the first dataset row regardless of the
# model output.
# NOTE(review): assumes 'Label' is encoded 0/1 with 1 meaning true news,
# matching the `== 1` check below — confirm against the dataset.
predicted_labels = [int(probability >= 0.5) for probability in predictions.ravel()]
print(predicted_labels)
for label in predicted_labels:
    if label == 1:
        print("True")
    else:
        print("Hoax bejir")