# Exploratory analysis of the data

In [None]:
# Loading the necessary module
import pandas as pd

# Load the YouTube spam training set
pre_data = pd.read_csv('train.csv')

# Preview the first 5 rows as plain text
print(pre_data.head(n=5))

In [None]:
# List every column name in the data set
colnam = pre_data.columns

for column_name in colnam:
    print(f'Colume name: {column_name}')

In [None]:
# Show the dimensions of the data as a (number of rows, number of columns) tuple
pre_data.shape

In [None]:
# Tally the missing values (NAs) found in every column
na_counts = pre_data.isna().sum()
print('The number of NAs in the data set categorized by columns are: \n', na_counts)

In [None]:
# Tally the rows that are exact duplicates of an earlier row
duplicate_rows = pre_data.duplicated()
print('The total number of duplicated rows in the data set is:', duplicate_rows.sum())

In [None]:
# Count the number of unique contributors in the data set
# (distinct values in the AUTHOR column)
pre_data['AUTHOR'].nunique()

In [None]:
# Loading the necessary module
from collections import Counter

# Tally comments per author and keep the five most frequent contributors
author_counts = Counter(pre_data['AUTHOR'])
top_5_author = author_counts.most_common(5)

for author, n_comments in top_5_author:
    print(f'{author}: {n_comments}')

In [None]:
# Loading the necessary module
import matplotlib.pyplot as plt 

# Bar chart of the five most frequent contributors
name = [author for author, _ in top_5_author]
count = [n_comments for _, n_comments in top_5_author]

plt.bar(name, count, width=0.4)

plt.title('Top 5 most common contributors')
plt.xlabel('Authors')
plt.ylabel('Number of contributions')

plt.show()


In [None]:
# Keep only the comments labelled as spam (CLASS == 1)
is_spam = pre_data['CLASS'] == 1
filtered_data = pre_data[is_spam]

# Tally spam comments per author and keep the five most frequent spammers
top_5_spammer = Counter(filtered_data['AUTHOR']).most_common(5)

for author, n_spam in top_5_spammer:
    print(f'{author}: {n_spam}')

In [None]:
# Bar chart of the five authors with the most spam comments
name = [author for author, _ in top_5_spammer]
count = [n_spam for _, n_spam in top_5_spammer]

plt.bar(name, count, width=0.4)

plt.title('Top 5 spammer')
plt.xlabel('Authors')
plt.ylabel('Number of contributions')

plt.show()

In [None]:
# Load nessary module
import re

# A function to see if a comment contains URL
# Compiled once so the pattern is not re-parsed for every comment
_URL_PATTERN = re.compile(r'http[s]?://\S+|www\.\S+')


def contains_url(comment):
    """Return True if `comment` contains something that looks like a URL.

    A URL is either ``http://`` / ``https://`` followed by at least one
    non-whitespace character, or ``www.`` followed by at least one
    non-whitespace character. Matching is case-sensitive.

    Parameters
    ----------
    comment : str
        Raw comment text. Non-string values (for example the NaN pandas uses
        for an empty cell) are treated as containing no URL instead of
        raising ``TypeError``.

    Returns
    -------
    bool
        True when at least one URL-like substring is found.
    """
    if not isinstance(comment, str):
        return False
    return _URL_PATTERN.search(comment) is not None

# Flag every comment with the result of contains_url and store it as a new column
url_flags = pre_data['CONTENT'].apply(contains_url)
pre_data['Contains_url'] = url_flags

# Summing the flags counts the flagged comments
num_urls = url_flags.sum()

print(f'The number of comments containing URLs is: {num_urls}')

In [None]:
# Spam comments (CLASS == 1) that contain a URL
url_spam = pre_data[(pre_data['CLASS'] == 1) & pre_data['Contains_url']]

# The printed label talks about *spam comments* that contain a URL, so the
# denominator has to be the number of spam comments. Dividing by the number of
# URL comments (as before) measures something else: how often a URL comment is spam.
num_spam_comments = int((pre_data['CLASS'] == 1).sum())

# Report 0% instead of dividing by zero when there is no spam at all
prop_url_spam = len(url_spam) / num_spam_comments * 100 if num_spam_comments else 0.0

print(f'The proposition of spam comments containing a URL is: {prop_url_spam}%')

We can see that if a comment is spam, it is very likely to contain a URL.

In [None]:
!pip install emoji

In [None]:
# Load necessary module
import emoji

# Function to check if a comment contains emojis
def contains_emoji(comment):
    """Return True if `comment` contains at least one emoji.

    Parameters
    ----------
    comment : str
        Raw comment text. Non-string values (for example the NaN pandas uses
        for an empty cell) are treated as containing no emoji instead of
        being passed on to the `emoji` package.

    Returns
    -------
    bool
        True when `emoji.emoji_count` reports one or more emoji.
    """
    if not isinstance(comment, str):
        return False
    return emoji.emoji_count(comment) > 0

# Flag every comment with the result of contains_emoji and store it as a new column
emoji_flags = pre_data['CONTENT'].apply(contains_emoji)
pre_data['Contains_emo'] = emoji_flags

# Summing the flags counts the flagged comments
num_emo = emoji_flags.sum()

print(f'The number of comments containing emoji is: {num_emo}')

In [None]:
# Spam comments (CLASS == 1) that contain at least one emoji
emo_spam = pre_data[(pre_data['CLASS'] == 1) & pre_data['Contains_emo']]

# The printed label talks about *spam comments* that contain emoji, so the
# denominator has to be the number of spam comments. Dividing by the number of
# emoji comments (as before) measures something else: how often an emoji comment is spam.
num_spam_comments = int((pre_data['CLASS'] == 1).sum())

# Report 0% instead of dividing by zero when there is no spam at all
prop_emo_spam = len(emo_spam) / num_spam_comments * 100 if num_spam_comments else 0.0

print(f'The proposition of spam comments containing emoji is: {prop_emo_spam}%')

We can see that not many spam comments contain emoji.

In [None]:
# Split the comment texts into spam (CLASS == 1) and non-spam (CLASS == 0)
spam_comment = pre_data.loc[pre_data['CLASS'] == 1, 'CONTENT']
non_spam_comment = pre_data.loc[pre_data['CLASS'] == 0, 'CONTENT']

# Matches any single character that is neither a word character nor whitespace
punctuation_pattern = r'[^\w\s]'

# Collect every such character found in the spam comments; the Byte Order Mark
# is stripped first so that it is not counted
punc_list = [
    mark
    for comment in spam_comment
    for mark in re.findall(punctuation_pattern, comment.replace('\ufeff', ''))
]

punctuation_counts = Counter(punc_list)

top_5_punc = punctuation_counts.most_common(5)

for mark, occurrences in top_5_punc:
    print(f'{mark}: {occurrences}')

In [None]:
# Define whether the comment contains top spam punctuations
def contains_punc(comment, top_punc=None):
    """Return True if `comment` contains any of the top spam punctuation marks.

    The previous implementation compared whitespace-separated tokens against
    the ``(mark, count)`` tuples produced by ``Counter.most_common`` and
    therefore never matched. The marks are now extracted from the tuples and
    looked for character by character, which is also how they were counted.

    Parameters
    ----------
    comment : str
        Raw comment text. Non-string values (for example NaN) yield False.
    top_punc : iterable, optional
        Punctuation marks to look for, either as plain one-character strings
        or as ``(mark, count)`` tuples. Defaults to the notebook-level
        ``top_5_punc``.

    Returns
    -------
    bool
        True when at least one of the marks occurs in `comment`.
    """
    if not isinstance(comment, str):
        return False
    if top_punc is None:
        top_punc = top_5_punc
    # most_common() yields (mark, count) pairs; keep only the mark
    marks = {entry[0] if isinstance(entry, tuple) else entry for entry in top_punc}
    return any(character in marks for character in comment)

# Flag every comment with the result of contains_punc and store it as a new column
punc_flags = pre_data['CONTENT'].apply(contains_punc)
pre_data['Contains_punc'] = punc_flags

In [None]:
# loading the necessary modules
import nltk # Natural language toolkit
from nltk.corpus import stopwords # Importing common stopwords
from nltk.tokenize import word_tokenize # Import tokenizer
from nltk.stem import WordNetLemmatizer # Import lemmatizer

# Download necessary module resources
# Newer NLTK releases (3.9+) make word_tokenize look up 'punkt_tab' instead of
# 'punkt'; both are downloaded so tokenization works on either side of that change.
for resource in (
    'stopwords',  # Stopwords database
    'punkt',      # Punkt tokenizer
    'punkt_tab',  # Punkt tokenizer data used by newer NLTK releases
    'wordnet',    # WordNet database, enables the WordNetLemmatizer
    'omw-1.4',    # An additional database
):
    nltk.download(resource)

In [None]:
# Single lemmatizer instance; clean_text reads it as a notebook-level name
lemmatizer = WordNetLemmatizer()

# Building the text cleaning function and tokenization
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return the NLTK English stop words as a set, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text):
    """Normalize a comment and return its lemmatized, stop-word-free tokens.

    Steps, in order: strip URLs, strip emojis, lowercase, strip every
    character that is neither a word character nor whitespace, tokenize,
    drop English stop words, lemmatize.

    Parameters
    ----------
    text : str
        Raw comment text. Non-string values (for example NaN) yield an
        empty token list instead of raising.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    if not isinstance(text, str):
        return []

    # Step 1: Remove URLs
    text = re.sub(r'http[s]?://\S+|www\.\S+', '', text)

    # Step 2: Remove emojis
    text = emoji.replace_emoji(text, replace="")

    # Step 3: Convert to lowercase
    text = text.lower()

    # Step 4: Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Step 5: Tokenize
    token = word_tokenize(text)

    # Step 6: Remove stopwords
    # The stop-word list is fetched once and kept as a set; calling
    # stopwords.words() inside the comprehension re-evaluated it for every token.
    stop_words = _english_stopwords()
    token = [word for word in token if word not in stop_words]

    # Step 7: Lemmatize
    token = [lemmatizer.lemmatize(word) for word in token]

    return token

In [None]:
# Find the 5 words that occur most often in the cleaned spam comments
all_tokens = []
for comment in spam_comment:
    all_tokens.extend(clean_text(comment))

top_5_words = Counter(all_tokens).most_common(5)

for word, occurrences in top_5_words:
    print(f'{word}: {occurrences}')

In [None]:
# Function to check if a comment contain those top used spam words
def contains_words(comment, top_words=None):
    """Return True if the cleaned `comment` contains any of the top spam words.

    The previous implementation tested the ``(word, count)`` tuples produced
    by ``Counter.most_common`` for membership in a list of string tokens and
    therefore never matched. The words are now extracted from the tuples
    before the comparison.

    Parameters
    ----------
    comment : str
        Raw comment text; it is tokenized with ``clean_text``. Non-string
        values (for example NaN) yield False.
    top_words : iterable, optional
        Words to look for, either as plain strings or as ``(word, count)``
        tuples. Defaults to the notebook-level ``top_5_words``.

    Returns
    -------
    bool
        True when at least one of the words is among the comment's tokens.
    """
    if not isinstance(comment, str):
        return False
    if top_words is None:
        top_words = top_5_words
    # most_common() yields (word, count) pairs; keep only the word
    spam_words = {entry[0] if isinstance(entry, tuple) else entry for entry in top_words}
    if not spam_words:
        # Nothing to look for, so skip the tokenization entirely
        return False
    return not spam_words.isdisjoint(clean_text(comment))

# Flag every comment with the result of contains_words and store it as a new column
word_flags = pre_data['CONTENT'].apply(contains_words)
pre_data['Contains_words'] = word_flags

In [None]:
# Count the number of unique videos in the data set
# (distinct values in the VIDEO_NAME column)
pre_data['VIDEO_NAME'].nunique()

In [None]:
# Visualize the composition of comments of the videos
vid_comment = pre_data['VIDEO_NAME'].value_counts()

# Count spam straight from pre_data (instead of relying on `filtered_data` left
# over from a much earlier cell) and give videos without spam a 0 rather than NaN
spam_vid_comment = (
    pre_data.loc[pre_data['CLASS'] == 1, 'VIDEO_NAME']
    .value_counts()
    .reindex(vid_comment.index, fill_value=0)
)

# Stack spam on top of non-spam so that each bar's height equals the video's
# total number of comments. Stacking "total" and "spam" counted every spam
# comment twice.
total_vid_comment = pd.DataFrame({
    'Non-spam comments of a video': vid_comment - spam_vid_comment,
    'Spam comments of a video': spam_vid_comment
})

total_vid_comment.plot(kind='bar', figsize=(12, 6), stacked=True)

plt.title("Number of comments by video")
plt.xlabel("Video name")
plt.ylabel("Comment counts")
plt.show()

### Explore the CLASS column

In [None]:
# Share of all comments that are spam, in percent
num_spam = len(spam_comment)
num_comment = pre_data.shape[0]

prop_spam = num_spam / num_comment * 100

print(f'The proposition of spam comment is: {prop_spam}%')

# Feature selection

In [None]:
# Showing all columns we have in the dataset so far.

colnam = pre_data.columns

for column_name in colnam:
    print(f'Colume name: {column_name}')

In [None]:
# Preview the first 5 rows again, now including the added Contains_* columns
print(pre_data.head())

## Forward stepwise selection 

In [None]:
# Load the necessary modules
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizing the text data in the data set
pre_data['text_data'] = pre_data['CONTENT']

vectorizer = TfidfVectorizer()

# NOTE(review): the vectorizer is fitted on every row, including rows that later
# land in the hold-out split, so hold-out scores may be optimistic.
tfidf_matrix = vectorizer.fit_transform(pre_data['text_data'])

# Reuse pre_data's index so that a later pd.concat(axis=1) with pre_data columns
# lines up row by row even if pre_data's index is not the default 0..n-1
# (a fresh RangeIndex would then misalign and introduce NaN rows).
tfidf_features = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=pre_data.index,
)

In [None]:
# Load the necessary modules
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define all possible predictors, we use dictionary because it is easier to present the final feature
all_predictors = {
    'Content colume': tfidf_features,
    'Contains_url colume': pre_data[['Contains_url']],
    'Contains_emo colume': pre_data[['Contains_emo']],
    'Contains_punc colume': pre_data[['Contains_punc']],
    'Contains_words colume': pre_data[['Contains_words']]
}

# Define the response variable
response = pre_data['CLASS']


def _holdout_accuracy(predictor_names):
    """Fit a random forest on the named predictor groups and return its hold-out accuracy.

    The named groups are concatenated column-wise, split 70/30 with a fixed
    seed, and a seeded RandomForestClassifier is fitted on the 70% part and
    scored on the 30% part. Keeping this in a function stops the per-candidate
    split and model from leaking into the notebook's global namespace.

    Parameters
    ----------
    predictor_names : list of str
        Keys of `all_predictors` to use together.

    Returns
    -------
    float
        Accuracy on the 30% hold-out split.
    """
    predictors = pd.concat([all_predictors[name] for name in predictor_names], axis=1)
    x_train, x_test, y_train, y_test = train_test_split(
        predictors, response, test_size=0.3, random_state=123
    )
    model = RandomForestClassifier(bootstrap=True, random_state=123)
    model.fit(x_train, y_train)
    return accuracy_score(y_test, model.predict(x_test))


# Define unused predictors
unused_predictors = list(all_predictors.keys())

# Initialization
selected_predictors = []
best_accuracy = 0

# The adjusted forward stepwise selection
while unused_predictors:
    temp_best_accuracy = 0
    temp_best_predictor = None

    # Try adding each remaining group to the current selection; the strict
    # comparison keeps the first group on ties
    for candidate in unused_predictors:
        accuracy = _holdout_accuracy(selected_predictors + [candidate])

        if temp_best_accuracy < accuracy:
            temp_best_accuracy = accuracy
            temp_best_predictor = candidate

    # Keep the best candidate only if it strictly improves on the current best; otherwise stop
    if temp_best_predictor is not None and best_accuracy < temp_best_accuracy:
        selected_predictors.append(temp_best_predictor)
        unused_predictors.remove(temp_best_predictor)
        best_accuracy = temp_best_accuracy
    else:
        break

In [None]:
# Predictor groups chosen by the forward stepwise selection, in the order they were added
selected_predictors

In [None]:
# Best hold-out accuracy reached during the forward stepwise selection
best_accuracy

## Investigate the behavior of the model and its relevant statistics

In [None]:
# Refit a model on the selected predictor groups so it can be inspected
pre_model = RandomForestClassifier(bootstrap=True, random_state=123)

chosen_frames = [all_predictors[key] for key in selected_predictors]
final_predictors = pd.concat(chosen_frames, axis=1)

# Same 70/30 split and seed as used during the selection
x_train, x_test, y_train, y_test = train_test_split(
    final_predictors, response, test_size=0.3, random_state=123
)

pre_model.fit(x_train, y_train)

y_prediction = pre_model.predict(x_test)

In [None]:
# Investigate the accuracy score of the preliminary model on the 30% hold-out split
accuracy_score(y_test, y_prediction)

In [None]:
# Load the necessary package
from sklearn.metrics import classification_report

# Investigate F1 score via the text classification report
report = classification_report(y_test, y_prediction)
print(report)

In [None]:
# Load the necessary module
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# Compute the confusion matrix of the hold-out predictions and draw it
conf_matrix = confusion_matrix(y_test, y_prediction)

cm_display = metrics.ConfusionMatrixDisplay(conf_matrix)
cm_display.plot()

plt.show()

In [None]:
# Load the necessary package
from sklearn.metrics import roc_curve, auc

# ROC and AUC
# Scores are the second column of predict_proba
y_pred_prob = pre_model.predict_proba(x_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'ROC curve with AUC = {roc_auc:.3f}')
# Dashed diagonal as a visual reference line
ax.plot([0, 1], [0, 1], linestyle='--')
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('ROC curve')
ax.legend(loc='lower right')
plt.show()

# Final model construction

In [None]:
# Build the final model from the predictor groups selected above
final_model = RandomForestClassifier(bootstrap=True, random_state=123)

final_predictors = pd.concat(
    [all_predictors[key] for key in selected_predictors],
    axis=1,
)

# Fit on all rows (no hold-out); kept as the last expression so the estimator is displayed
final_model.fit(final_predictors, response)

In [None]:
# Predict the test set with the final model and write the Kaggle submission file
test_data = pd.read_csv('test.csv')

# Recreate the flag columns on the test comments with the same functions used for training
for column, flag_fn in (
    ('Contains_url', contains_url),
    ('Contains_punc', contains_punc),
    ('Contains_emo', contains_emoji),
    ('Contains_words', contains_words),
):
    test_data[column] = test_data['CONTENT'].apply(flag_fn)

# transform (not fit_transform): reuse the vectorizer already fitted on the training comments
test_tfidf_matrix = vectorizer.transform(test_data['CONTENT'])
test_tfidf_features = pd.DataFrame(
    test_tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Same keys as all_predictors so that selected_predictors can index both
all_test_predictors = {
    'Content colume': test_tfidf_features,
    'Contains_url colume': test_data[['Contains_url']],
    'Contains_emo colume': test_data[['Contains_emo']],
    'Contains_punc colume': test_data[['Contains_punc']],
    'Contains_words colume': test_data[['Contains_words']]
}

chosen_test_frames = [all_test_predictors[key] for key in selected_predictors]
final_test_predictors = pd.concat(chosen_test_frames, axis=1)

new_predictions = final_model.predict(final_test_predictors)

output = pd.DataFrame({
    'COMMENT_ID': test_data['COMMENT_ID'],
    'CLASS': new_predictions
})

output.to_csv('predictions_test_final.csv', index=False)