In [1]:
# Importing necessary libraries
import csv
import re
from collections import Counter

import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from wordcloud import WordCloud

In [2]:
# Download necessary nltk data
# word_tokenize (used in the tokenization cell) needs the Punkt sentence models.
# Recent NLTK releases look them up under 'punkt_tab', older ones under 'punkt';
# requesting both keeps the notebook runnable on either. nltk.download reports a
# failure for a name it does not know instead of raising, so the extra request is safe.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
from nltk.tokenize import word_tokenize

ParseError: no element found: line 1, column 0 (<string>)

In [None]:
# Load the dataset
url = 'https://raw.githubusercontent.com/siddhantbhattarai/AI-DataScience-BootCamp/main/SMSSpamCollection'
# Each line is "<label>\t<message>" with no header row, so column names are supplied.
# NOTE(review): the file appears to be raw tab-separated text rather than quoted CSV.
# With the default quoting, a message containing an unmatched double quote makes the
# parser treat following lines as part of the same field and silently merges rows.
# QUOTE_NONE reads every line as exactly one record.
df = pd.read_csv(url, sep='\t', names=['label', 'message'], quoting=csv.QUOTE_NONE)
df.head()

In [None]:
# Number of distinct values in the label column (later cells filter on 'ham' and 'spam').
df['label'].nunique()

In [None]:
# Missing values per column; isna() is the canonical name for isnull().
df.isna().sum()

In [None]:
# Count rows that are exact repeats of an earlier row (same label and message).
df.duplicated().sum()

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# .copy() makes clean_df an independent frame: later cells add and overwrite
# columns on it, and without the copy pandas emits SettingWithCopyWarning
# because the result is still tracked as derived from df.
clean_df = df.drop_duplicates().copy()

In [None]:
# Sanity check: after drop_duplicates() this count should be 0.
clean_df.duplicated().sum()

In [None]:
# Basic summary statistics of the de-duplicated dataset
clean_df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the de-duplicated frame.
clean_df.info()

In [None]:
# Count the spam vs ham messages (rows per label value, most frequent first)
clean_df['label'].value_counts()

In [None]:
# Visualization: how many messages carry each label
label_axes = sns.countplot(data=clean_df, x='label')
label_axes.set_title('Distribution of spam vs ham message')
plt.show()

In [None]:
# Visualize the length of messages, one overlaid histogram per label
clean_df['message_length'] = clean_df['message'].map(len)
plt.figure(figsize=(12, 6))
# (label value in the data, legend text, bar colour)
for label_value, legend_text, bar_colour in (('ham', 'Ham', 'blue'), ('spam', 'Spam', 'red')):
    lengths = clean_df.loc[clean_df['label'] == label_value, 'message_length']
    sns.histplot(lengths, label=legend_text, color=bar_colour, bins=50, kde=True)
plt.title('Message Length Distribution')
plt.legend()
plt.show()

In [None]:
# Visualization: Word clouds for spam and ham messages
# Concatenate every message of a class into one space-separated string,
# which is the input format WordCloud.generate() is given in the next cells.
spam_words = ' '.join(clean_df.loc[clean_df['label'] == 'spam', 'message'])
ham_words = ' '.join(clean_df.loc[clean_df['label'] == 'ham', 'message'])

In [None]:
# Build and show the word cloud for the concatenated spam messages.
spam_wordCloud = WordCloud(width=800, height=400, background_color='white').generate(spam_words)
plt.figure(figsize=(10,5))
plt.imshow(spam_wordCloud, interpolation='bilinear')
# Title spelling fixed (was "Mesages").
plt.title('Spam Messages Word Cloud')
# The image has no meaningful axes, so hide ticks and frame.
plt.axis('off')
plt.show()

In [None]:
# Build and show the word cloud for the concatenated ham messages.
ham_wordCloud = WordCloud(width=800, height=400, background_color='white').generate(ham_words)
plt.figure(figsize=(10,5))
plt.imshow(ham_wordCloud, interpolation='bilinear')
# Title spelling fixed (was "Mesages").
plt.title('Ham Messages Word Cloud')
# The image has no meaningful axes, so hide ticks and frame.
plt.axis('off')
plt.show()

In [None]:
# EDA: Tokenization and common word analysis
# One token list per message, stored alongside the message itself.
clean_df['tokens'] = clean_df['message'].map(word_tokenize)
# Flatten the per-message token lists into one long list per class.
spam_tokens = [
    token
    for message_tokens in clean_df.loc[clean_df['label'] == 'spam', 'tokens']
    for token in message_tokens
]
ham_tokens = [
    token
    for message_tokens in clean_df.loc[clean_df['label'] == 'ham', 'tokens']
    for token in message_tokens
]

In [None]:
# The 20 most frequent tokens of each class as (token, count) pairs.
spam_common_words, ham_common_words = (
    Counter(class_tokens).most_common(20)
    for class_tokens in (spam_tokens, ham_tokens)
)

In [None]:
# Turn the (token, count) pairs into two-column frames for plotting.
common_word_columns = ['word', 'count']
spam_common_df = pd.DataFrame(data=spam_common_words, columns=common_word_columns)
ham_common_df = pd.DataFrame(data=ham_common_words, columns=common_word_columns)

In [None]:
# Horizontal bar chart of the most frequent spam tokens.
spam_fig, spam_ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=spam_common_df, x='count', y='word', color='red', ax=spam_ax)
spam_ax.set_title('Most Common words in Spam Message')
plt.show()

In [None]:
# Horizontal bar chart of the most frequent ham tokens.
plt.figure(figsize=(12, 6))
# Ham is drawn in blue to match the message-length histogram (ham=blue, spam=red);
# the previous red made this chart look like the spam one.
sns.barplot(x='count', y='word', data = ham_common_df, color='blue')
plt.title('Most Common words in Ham Message')
plt.show()

In [None]:
# Data cleaning
def clean_text(text):
    """Normalise a message to lowercase ASCII letters separated by single spaces.

    The text is lowercased, every character that is not a-z or whitespace is
    removed (not replaced, so "don't" becomes "dont"), and runs of whitespace
    are collapsed while leading/trailing whitespace is dropped.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return ' '.join(letters_only.split())
# Overwrite the message column with its normalised form; the models train on this text.
clean_df['message'] = clean_df['message'].map(clean_text)

In [None]:
# Data Preprocessing
X = clean_df['message']
y = clean_df['label']
# Hold out 20% for testing with a fixed seed. stratify=y keeps the spam/ham
# proportions the same in both parts; a purely random split can leave the test
# set with a different class mix and make the scores unrepresentative.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Shapes of the four split parts, in the order X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Feature engineering: Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
# Fit on the training messages only, then reuse the fitted vectorizer for the
# test messages so nothing from the test set influences the features.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Model Selection and Training
# Multinomial Naive Bayes
# fit() returns the estimator itself, so construction and training chain into one step.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
nb_model

In [None]:
# Linear SVC
# dual=False selects the primal formulation; fit() returns the estimator itself.
svc_model = LinearSVC(dual=False).fit(X_train_tfidf, y_train)
svc_model

In [None]:
# Random Forest Classifier
# The forest is built with random sampling; fixing random_state (same seed as the
# train/test split) makes the fitted model and its score repeat on every run.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_tfidf, y_train)

In [None]:
# Model Evaluation
# Predict labels for the held-out TF-IDF features with each fitted model.
nb_predictions = nb_model.predict(X_test_tfidf)
svc_predictions = svc_model.predict(X_test_tfidf)
rf_predictions = rf_model.predict(X_test_tfidf)

# Accuracy computation (accuracy_score is imported in the top import cell)
nb_accuracy = accuracy_score(y_test, nb_predictions)
svc_accuracy = accuracy_score(y_test, svc_predictions)
rf_accuracy = accuracy_score(y_test, rf_predictions)

# Print the accuracy
print(f'Multinomial Naive Bayes Accuracy: {nb_accuracy*100:.2f}%')
print(f'Support Vector Classifier Accuracy: {svc_accuracy*100:.2f}%')
print(f'Random Forest Classifier Accuracy: {rf_accuracy*100:.2f}%')

# Accuracy hides per-class errors when the label counts are unequal, so also
# show precision, recall and F1 for each label.
for model_name, model_predictions in (
    ('Multinomial Naive Bayes', nb_predictions),
    ('Support Vector Classifier', svc_predictions),
    ('Random Forest Classifier', rf_predictions),
):
    print(f'\n{model_name} classification report')
    print(classification_report(y_test, model_predictions))