In [None]:
import pandas as pd 
from tensorflow.keras.preprocessing.text import Tokenizer 
import numpy as np 
from keras.models import Sequential 
from keras.layers import Dense, Dropout, Activation 
from keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold

In [None]:
# Load the five per-video comment files and stack them into a single frame.
# Paths are built with "/" because the previous "\Y..." spelling is an invalid
# string escape and a backslash separator only resolves on Windows.
DATA_DIR = "Youtube Spam Classification"
DATA_FILES = [
    "Youtube01-Psy.csv",
    "Youtube02-KatyPerry.csv",
    "Youtube03-LMFAO.csv",
    "Youtube04-Eminem.csv",
    "Youtube05-Shakira.csv",
]
SEED = 42

d = pd.concat([pd.read_csv(f"{DATA_DIR}/{file_name}") for file_name in DATA_FILES])

# Shuffle all rows so the per-video files are interleaved; the fixed seed makes
# the row order (and therefore the folds built from it) repeatable across runs.
d = d.sample(frac=1, random_state=SEED)

In [None]:
# Five stratified folds over the shuffled frame, stratified on the CLASS column.
# split() hands back a generator of (train, test) index pairs, so `splits`
# can be walked only once.
kfold = StratifiedKFold(n_splits=5)
splits = kfold.split(X=d, y=d['CLASS'])

In [None]:
# Preview the size of the training part of every fold.
# A fresh kfold.split(...) is iterated here instead of `splits`: that object is
# a one-shot generator, so looping over it would leave it exhausted and a
# second run of this cell would print nothing.
for train, test in kfold.split(d, d['CLASS']):
    print('Split')
    print(np.shape(train))

In [None]:
def train_and_test(train_idx, test_idx, num_words=2000, epochs=10, batch_size=16):
    """Fit the spam classifier on one fold and score it on the held-out rows.

    Reads the module-level frame ``d`` (columns ``CONTENT`` and ``CLASS``).

    Args:
        train_idx: positional row indices of ``d`` used for fitting.
        test_idx: positional row indices of ``d`` used for evaluation.
        num_words: vocabulary size kept by the tokenizer; it is also the width
            of the network's input layer, so the two can never disagree.
        epochs: number of training epochs passed to ``model.fit``.
        batch_size: mini-batch size passed to ``model.fit``.

    Returns:
        A ``(scores, model, tokenizer)`` tuple: the result of
        ``model.evaluate`` on the held-out rows (index 1 is the accuracy
        metric printed below), the trained model, and the tokenizer fitted on
        the training rows.
    """
    train_content = d['CONTENT'].iloc[train_idx]
    test_content = d['CONTENT'].iloc[test_idx]

    tokenizer = Tokenizer(num_words=num_words)

    # Learn the training words (not the testing words!)
    tokenizer.fit_on_texts(train_content)

    # options for mode: binary, freq, tfidf
    d_train_inputs = tokenizer.texts_to_matrix(train_content, mode='tfidf')
    d_test_inputs = tokenizer.texts_to_matrix(test_content, mode='tfidf')

    # Scale and centre BOTH matrices with statistics taken from the training
    # fold only. Normalising the held-out rows by their own max/mean would put
    # them on a different scale than the data the network was fitted on and
    # would let test-set statistics leak into the evaluation.
    scale = np.amax(np.absolute(d_train_inputs))
    if scale == 0:
        # All-zero training matrix: skip scaling rather than divide by zero.
        scale = 1.0
    d_train_inputs = d_train_inputs / scale
    d_test_inputs = d_test_inputs / scale

    # subtract the (training) mean so the values are centred around zero
    center = np.mean(d_train_inputs)
    d_train_inputs = d_train_inputs - center
    d_test_inputs = d_test_inputs - center

    # One-hot encode the labels; num_classes is pinned to 2 so the target width
    # always matches the 2-unit softmax layer, even if a fold lacked a class.
    d_train_outputs = to_categorical(d['CLASS'].iloc[train_idx], num_classes=2)
    d_test_outputs = to_categorical(d['CLASS'].iloc[test_idx], num_classes=2)

    model = Sequential()
    model.add(Dense(512, input_shape=(num_words,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adamax',
                  metrics=['accuracy'])
    model.fit(d_train_inputs, d_train_outputs, epochs=epochs, batch_size=batch_size)

    scores = model.evaluate(d_test_inputs, d_test_outputs)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

    # Hand back the fitted model and tokenizer along with the scores so the
    # caller can reuse them for predictions.
    return scores, model, tokenizer

In [None]:
# Fresh splitter and fold generator for the cross-validation run.
kfold = StratifiedKFold(n_splits=5)
splits = kfold.split(d, d['CLASS'])

# Accuracy of each fold in percent, plus handles to the last fold's artefacts.
cvscores = []
final_model, final_tokenizer = None, None

for train_idx, test_idx in splits:
    scores, model, tokenizer = train_and_test(train_idx, test_idx)

    # Remember the most recently trained model and its tokenizer.
    final_model, final_tokenizer = model, tokenizer

    cvscores.append(scores[1] * 100)

In [None]:
# Summarise cross-validation: mean accuracy and its spread across the folds.
cv_mean = np.mean(cvscores)
cv_std = np.std(cvscores)
print("%.2f%% (+/- %.2f%%)" % (cv_mean, cv_std))

In [None]:
def simple_test(text, model=None, tokenizer=None):
    """Classify a single comment and print the predicted label with its score.

    Args:
        text: the comment to classify.
        model: object with a ``predict`` method; defaults to the module-level
            ``final_model`` saved by the cross-validation loop.
        tokenizer: object with a ``texts_to_matrix`` method; defaults to the
            module-level ``final_tokenizer``.

    Raises:
        RuntimeError: if no model/tokenizer is passed and none has been saved
            yet (the cross-validation cell has not produced one).
    """
    # Use the explicitly saved artefacts rather than loop variables that merely
    # happen to be left over in the notebook namespace.
    if model is None:
        model = final_model
    if tokenizer is None:
        tokenizer = final_tokenizer
    if model is None or tokenizer is None:
        raise RuntimeError(
            "No trained model/tokenizer available; run the cross-validation cell first."
        )

    # Minimal preprocessing (just to match the model's expectations)
    # NOTE(review): train_and_test also divides by a max and subtracts a mean
    # before fitting; those statistics are not available here, so the raw
    # tf-idf values are fed to the model - confirm whether that is acceptable.
    processed_input = tokenizer.texts_to_matrix([text], mode='tfidf')

    # Predict; one text went in, so only the first row of the output is used.
    prediction = model.predict(processed_input)
    probabilities = prediction[0]
    predicted_class = int(np.argmax(probabilities))

    # Map the output
    label_mapping = {0: "Not Spam", 1: "Spam"}
    print(f"Prediction: {label_mapping[predicted_class]} ({probabilities[predicted_class]*100:.2f}%)")

# Try the classifier on two hand-written comments: the first one is a spam
# message, the second one is a normal message.
for sample_comment in (
    "The song is good, but do subscribe my channel coz its lot more better",
    "the song is really great. I loved it! kind of similar to those of my channel's videos",
):
    simple_test(sample_comment)
