# SMS Spam Detection Data Cleaning and EDA

## Importing the Libraries
We start by importing the libraries we will use and downloading the NLTK resources (`stopwords` and `punkt`) that the text preprocessing needs.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import nltk
nltk.download('stopwords')
nltk.download("punkt")
from nltk.corpus import stopwords
stopwords.words("english")
import string
string.punctuation
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import timeit
from wordcloud import WordCloud
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
import pickle

## Create some functions that we will use.

#### Explanation of the `transform_text` & `transform_text_2` Functions

The `transform_text` and `transform_text_2` functions both perform the following steps:

Converts text to lowercase.

Tokenizes the text into words.

Removes non-alphanumeric characters.

Filters out stopwords and punctuation.

Stems the remaining words.

Returns the cleaned and stemmed text as a single string.

In [None]:
def transform_text(text):
    """Normalize a raw message into a string of stemmed tokens.

    The text is lowercased, tokenized with NLTK, reduced to alphanumeric
    tokens, stripped of English stopwords, and Porter-stemmed.

    Note: this function reads the module-level ``ps`` (a ``PorterStemmer``
    instance), which must be defined before the function is called.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces.
    """
    # Build the stopword set once per call. The previous version called
    # stopwords.words("english") for every token and scanned the resulting
    # list linearly on each membership test.
    stop_words = set(stopwords.words("english"))

    y = []
    for token in nltk.word_tokenize(text.lower()):
        # isalnum() already rejects pure-punctuation tokens, so no separate
        # string.punctuation check is required.
        if token.isalnum() and token not in stop_words:
            y.append(ps.stem(token))

    return " ".join(y)

In [None]:
def transform_text_2(text):
    """Comprehension-based variant of ``transform_text``.

    Lowercases the text, tokenizes it, keeps alphanumeric tokens that are not
    English stopwords, and Porter-stems what remains.

    Note: this function reads the module-level ``ps`` (a ``PorterStemmer``
    instance), which must be defined before the function is called.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces.
    """
    # Look the stopwords up once per call instead of once per token, and use
    # a set so each membership test is constant time.
    stop_words = set(stopwords.words("english"))

    tokens = word_tokenize(text.lower())

    final_tokens = [
        word for word in tokens
        if word.isalnum() and word not in stop_words
    ]

    return " ".join(ps.stem(word) for word in final_tokens)

## Importing the Dataset
We begin by loading the dataset and displaying its first few rows:

In [None]:
df = pd.read_csv("spam.csv",encoding="latin1")
df.head()

In [None]:
df.shape

#### Output:
- The dataset consists of **5,572** rows and **5** columns.

## Data Cleaning:
* Checking for Missing Values

In [None]:
df.isnull().sum()

#### Observations:
- The dataset contains five columns: `v1`, `v2`, `Unnamed: 2`, `Unnamed: 3`, and `Unnamed: 4`.
- The last three columns have many missing values and are unnecessary, so we drop them.

#### Dropping Unnecessary Columns

In [None]:
# Drop cols ["Unnamed: 2" , "Unnamed: 3" , "Unnamed: 4"]
df.drop(columns=["Unnamed: 2","Unnamed: 3","Unnamed: 4"],inplace=True)

In [None]:
df.head(1)

#### Output:
- The dataset now consists of two columns: `v1` (label) and `v2` (message text).

#### Renaming Columns

- `v1` is renamed to `target`.
- `v2` is renamed to `text`.

In [None]:
df.rename(columns={"v1":"target","v2":"text"},inplace=True)

#### Encoding Target Variable
We convert the `target` column to numerical values where:
- **0** represents `ham` (not spam)
- **1** represents `spam`

In [None]:
# LabelEncoder assigns integer codes in sorted class order, so "ham" -> 0 and
# "spam" -> 1. One fit_transform pass is enough; the earlier extra call
# discarded its result.
encoder = LabelEncoder()
df["target"] = encoder.fit_transform(df["target"])

In [None]:
df["target"].dtypes

In [None]:
df.head()

#### Removing Duplicates

In [None]:
print("Duplicate values before removal:", df.duplicated().sum())
df = df.drop_duplicates(keep="first")
print("Duplicate values after removal:", df.duplicated().sum())

#### Observations:
- There were **403** duplicate rows in the dataset, which were removed.
- The dataset now has **5,169** unique rows.

## Exploratory Data Analysis (EDA)

#### Checking Class Distribution

In [None]:
df["target"].value_counts()

#### Output:
- **4,516** messages are `ham` (non-spam).
- **653** messages are `spam`.

#### Visualizing Class Distribution

In [None]:
plt.style.use('dark_background')
explode = (0.1, 0)
plt.pie(
    df["target"].value_counts(), 
    labels=["ham", "spam"], 
    autopct="%0.2f", 
    colors=sns.color_palette("coolwarm"),
    explode=explode, 
    shadow=True,
    startangle=90
)
plt.title("Distribution of Spam vs Ham Messages", fontsize=16, color='cyan', fontweight='bold')
plt.savefig("spam_ham_distribution.png", dpi=300, bbox_inches='tight')
plt.show()

#### Observations:
- The dataset is **imbalanced**, with a majority of messages being `ham` (not spam).
- This will be important when training a machine learning model, as class imbalance can impact performance.

#### Creating a "characters_num" Column:

* counts the number of characters in each SMS message.

* saves the character count for each row.

In [None]:
# Add a column holding the number of characters (len) of each message
df["characters_num"] = df["text"].apply(len)

#### Creating a "words_num" Column  

1. **Tokenize each message** → Splits the text into individual words.  
2. **Count the number of words** → Calculates the total words in each message.  
3. **Store the result in a new column** → Saves the word count for each row.  

Now, the dataset has a new column that shows the number of words in each SMS message.


In [None]:
# Add a column holding the number of NLTK word tokens in each message.
# The dataset is tokenized only once: the two preliminary passes computed
# the same values and threw them away.
df["words_num"] = df["text"].apply(lambda x: len(nltk.word_tokenize(x)))

#### Creating a "sentences_num" Column  

1. **Tokenize each message into sentences** → Splits the text into individual sentences.  
2. **Count the number of sentences** → Calculates the total sentences in each message.  
3. **Store the result in a new column** → Saves the sentence count for each row.  

Now, the dataset has a new column that shows the number of sentences in each SMS message.


In [None]:
# Add a column holding the number of NLTK sentence tokens in each message.
# The dataset is tokenized only once: the two preliminary passes computed
# the same values and threw them away.
df["sentences_num"] = df["text"].apply(lambda x: len(nltk.sent_tokenize(x)))

#### Descriptive Statistics for Character, Word, and Sentence Counts 

In [None]:
# descriptive statistics of columns["characters_num","words_num","sentences_num"]
df[["characters_num","words_num","sentences_num"]].describe()

#### Descriptive Statistics for Ham Messages  

In [None]:
# ham descriptive statistics of columns["characters_num","words_num","sentences_num"]
df[df["target"]==0][["characters_num","words_num","sentences_num"]].describe()

#### Descriptive Statistics for Spam Messages  

In [None]:
# spam descriptive statistics of columns["characters_num","words_num","sentences_num"]
df[df["target"]==1][["characters_num","words_num","sentences_num"]].describe()

#### Histogram of SMS Messages by Number of Characters 

In [None]:
plt.style.use("dark_background")

fig = plt.figure(figsize=(10, 5), dpi=150, facecolor='black')
ax = plt.gca()
ax.set_facecolor('black')  

sns.histplot(df[df["target"] == 0]["characters_num"], color="#00FFFF", edgecolor="#00FFFF", linewidth=1.2)  # Neon Cyan
sns.histplot(df[df["target"] == 1]["characters_num"], color="#FF00FF", edgecolor="#FF00FF", linewidth=1.2, alpha=0.5)  # Neon Magenta

plt.legend(title="Target", labels=["Ham", "Spam"], facecolor="black", edgecolor="#00FFFF", fontsize=10, title_fontsize=12)
plt.grid(True, linestyle="--", linewidth=0.5, color="#444444")

ax.spines['bottom'].set_color('#00FFFF')
ax.spines['left'].set_color('#FF00FF')
plt.title("Distribution of SMS Messages by Number of Characters", color="white", fontsize=14)
ax.xaxis.label.set_color('#00FFFF')
ax.yaxis.label.set_color('#FF00FF')
ax.tick_params(axis='x', colors='#00FFFF')
ax.tick_params(axis='y', colors='#FF00FF')

plt.savefig("sms_length_distribution.png", dpi=300, bbox_inches='tight')
plt.show()

#### Density Plot (KDE) of SMS Messages by Number of Words

In [None]:
plt.style.use("dark_background")

fig, ax = plt.subplots(figsize=(10, 5), dpi=150, facecolor='black')

sns.kdeplot(df[df["target"] == 0]["words_num"], 
            color="#00FFFF", linewidth=2, fill=True, alpha=0.3, label="Ham")

sns.kdeplot(df[df["target"] == 1]["words_num"], 
            color="#FF00FF", linewidth=2, fill=True, alpha=0.3, label="Spam")

plt.legend(title="Target", facecolor="black", edgecolor="white", fontsize=10, title_fontsize=12)
plt.grid(True, linestyle="--", linewidth=0.5, color="#444444")
ax.spines['bottom'].set_color('#00FFFF')
ax.spines['left'].set_color('#FF00FF')
ax.xaxis.label.set_color('#00FFFF')
ax.yaxis.label.set_color('#FF00FF')
ax.tick_params(axis='x', colors='#00FFFF')
ax.tick_params(axis='y', colors='#FF00FF')
plt.title("Distribution of SMS Messages by Word Count", color="white", fontsize=14)
plt.xlabel("Number of Words", fontsize=12)
plt.ylabel("Density", fontsize=12)
plt.savefig("sms_word_count_distribution.png", dpi=300, bbox_inches='tight')
plt.show()

#### Histogram of SMS Messages by Number of Sentences 

In [None]:
plt.style.use("dark_background")

fig, ax = plt.subplots(figsize=(10, 5), dpi=150, facecolor='black')

sns.histplot(df[df["target"] == 0]["sentences_num"], color="#00FFFF", edgecolor="#00FFFF", bins=30, alpha=0.5, linewidth=1.5, kde=True, label="Ham")
sns.histplot(df[df["target"] == 1]["sentences_num"], color="#FF00FF", edgecolor="#FF00FF", bins=30, alpha=0.5, linewidth=1.5, kde=True, label="Spam")

plt.legend(title="Target", facecolor="black", edgecolor="white", fontsize=10, title_fontsize=12)
plt.grid(True, linestyle="--", linewidth=0.5, color="#444444")
ax.spines['bottom'].set_color('#00FFFF')
ax.spines['left'].set_color('#FF00FF')
ax.xaxis.label.set_color('#00FFFF')
ax.yaxis.label.set_color('#FF00FF')
ax.tick_params(axis='x', colors='#00FFFF')
ax.tick_params(axis='y', colors='#FF00FF')
plt.title("Distribution of SMS Messages by Number of Sentences", color="white", fontsize=14)
plt.xlabel("Number of Sentences", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.savefig("sms_sentence_count_distribution.png", dpi=300, bbox_inches='tight')
plt.show()

#### Pairplot of Features Colored by `Target`
- **Creates a pairplot** to visualize relationships between all pairs of features in the DataFrame.

In [None]:
sns.pairplot(df,hue="target",palette="Set2")
plt.savefig("feature_pairplot_by_target.png", dpi=300, bbox_inches='tight')

#### Correlation of Features in the DataFrame

In [None]:
# correlation
df.corr(numeric_only=True)

#### Heatmap Showing Correlation Between Features

In [None]:
plt.style.use('dark_background')
plt.figure(figsize=(12, 8), dpi=150)
sns.heatmap(
    df.corr(numeric_only=True),
    cmap='Spectral',annot=True,fmt='.2f',
    annot_kws={"size": 12, "weight": "bold", "color": "white"},
    linewidths=1.5,linecolor='cyan',
    cbar_kws={"shrink": 0.8, "orientation": "vertical", "ticks": [0, 0.5, 1]},
    square=True
)
plt.title("Correlation Heatmap", fontsize=20, color='cyan', fontweight='bold', loc='center')
plt.xticks(fontsize=14, color='lightgreen', fontweight='light')
plt.yticks(fontsize=14, color='lightgreen', fontweight='light')
plt.tight_layout()
plt.savefig("correlation_heatmap.png", dpi=300, bbox_inches='tight')
plt.show()

---
## Data Preprocessing
---

In [None]:
ps = PorterStemmer()

#### Measuring the Runtime of Two Text Transformation Functions (`transform_text`,`transform_text_2`)

In [None]:
import timeit

time1 = timeit.timeit(lambda: df["text"].apply(transform_text),number=1)

time2 = timeit.timeit(lambda: df["text"].apply(transform_text_2),number=1)

print(f"Runtime for transform_text: {time1:.5f} seconds")
print(f"Runtime for transform_text_2: {time2:.5f} seconds")

In [None]:
df["transformed_text"] = df["text"].apply(transform_text_2)

#### Creating a WordCloud Object

In [None]:
wc = WordCloud(width=500,height=500,min_font_size=10,background_color="white")

#### Generating and Displaying a WordCloud for Spam Text

In [None]:
spam_wc = WordCloud(
    background_color="black",
    colormap="coolwarm",
    contour_color="cyan",
    contour_width=1,
    width=800,
    height=400,
    max_words=200,
    random_state=42
).generate(df[df["target"]==1]["transformed_text"].str.cat(sep=" "))
plt.figure(figsize=(10, 6))
plt.imshow(spam_wc, interpolation="bilinear")
plt.axis("off")
plt.title("Word Cloud - Spam Messages", fontsize=16, color='cyan', fontweight='bold')
plt.savefig("spam_word_cloud.png", dpi=300, bbox_inches='tight')
plt.show()

#### Generating and Displaying a WordCloud for Ham Text

In [None]:
ham_wc = WordCloud(
    background_color="darkslategray",
    colormap="plasma",
    contour_color="white",
    contour_width=2,
    width=900,
    height=450,
    max_words=150,
    random_state=42,
    min_font_size=10
).generate(df[df["target"]==0]["transformed_text"].str.cat(sep=" "))

plt.figure(figsize=(12, 7))
plt.imshow(ham_wc, interpolation="bilinear")
plt.axis("off")
plt.title("Word Cloud - Ham Messages", fontsize=18, color='lightblue', fontweight='bold')
plt.savefig("ham_word_cloud.png", dpi=300, bbox_inches='tight')
plt.show()

#### Creating a Corpus of Words from Spam Texts

In [None]:
# Flatten every transformed spam message (target == 1) into one list of tokens
spam_corpus = [
    token
    for message in df[df["target"]==1]["transformed_text"].tolist()
    for token in message.split()
]

In [None]:
len(spam_corpus)

In [None]:
Counter(spam_corpus).most_common(30)

#### Displaying the 30 Most Common Words in Spam Texts

In [None]:
# convert spam_corpus to Dataframe
pd.DataFrame(Counter(spam_corpus).most_common(30))

#### Bar Graph for the Most Popular Words in Spam Corpus

In [None]:
plt.style.use('dark_background')
plt.figure(figsize=(10, 6), dpi=150)

# Build the top-30 table once; column 0 holds the word, column 1 its count.
spam_top_words = pd.DataFrame(Counter(spam_corpus).most_common(30))

sns.barplot(
    x=spam_top_words[0],
    y=spam_top_words[1],
    palette="coolwarm", linewidth=1.5, edgecolor='black'
)
plt.xlabel("Words", fontsize=14, color='cyan', fontweight='bold')
plt.ylabel("Popularity", fontsize=14, color='cyan', fontweight='bold')
plt.title("Most Popular Words in Spam Corpus", fontsize=16, color='cyan', fontweight='bold')
plt.xticks(rotation=90, color='lightgreen', fontsize=12, fontweight='light')
plt.grid(True, color='gray', linestyle='--', linewidth=0.5, alpha=0.5)
plt.gca().patch.set_facecolor('#121212')

# Save before show(): showing the figure first can leave an empty canvas to save.
plt.savefig("spam_most_common_words.png", dpi=300, bbox_inches='tight')
plt.show()

#### Creating a Corpus of Words from Ham Texts

In [None]:
# Flatten every transformed ham message (target == 0) into one list of tokens
ham_corpus = [
    token
    for message in df[df["target"]==0]["transformed_text"].tolist()
    for token in message.split()
]

In [None]:
len(ham_corpus)

#### Displaying the 30 Most Common Words in Ham Texts

In [None]:
pd.DataFrame(Counter(ham_corpus).most_common(30))

#### Bar Graph for the Most Popular Words in Ham Corpus

In [None]:
plt.style.use('dark_background')
plt.figure(figsize=(10, 6), dpi=150)

# Build the top-30 table once; column 0 holds the word, column 1 its count.
ham_top_words = pd.DataFrame(Counter(ham_corpus).most_common(30))

sns.barplot(
    x=ham_top_words[0],
    y=ham_top_words[1],
    palette="coolwarm", linewidth=1.5, edgecolor='black'
)
plt.xlabel("Words", fontsize=14, color='cyan', fontweight='bold')
plt.ylabel("Popularity", fontsize=14, color='cyan', fontweight='bold')
plt.title("Most Popular Words in Ham Corpus", fontsize=16, color='cyan', fontweight='bold')
plt.xticks(rotation=90, color='lightgreen', fontsize=12, fontweight='light')
plt.grid(True, color='gray', linestyle='--', linewidth=0.5, alpha=0.5)
plt.gca().patch.set_facecolor('#121212')
# Save before show(): showing the figure first can leave an empty canvas to save.
plt.savefig("ham_most_common_words.png", dpi=300, bbox_inches='tight')
plt.show()

---
## Model Building 
---

#### Initializing a CountVectorizer
transform raw text data into numerical features

In [None]:
cv = CountVectorizer()

#### Transforming Text Data into Numerical Features
transforms the text data into a matrix of word counts and shows the dimensions of the resulting feature matrix

In [None]:
X = cv.fit_transform(df["transformed_text"]).toarray()
X.shape

In [None]:
y = df["target"].values
y.shape

#### Splitting Data into Training and Test Sets

In [None]:
# Train , Split the Data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

#### Training and Evaluating a Gaussian Naive Bayes Classifier

In [None]:
# GaussianNB
gnb = GaussianNB()
gnb.fit(X_train,y_train)
y_pred_gnb = gnb.predict(X_test)
print(accuracy_score(y_test,y_pred_gnb))
print(confusion_matrix(y_test,y_pred_gnb))
print(precision_score(y_test,y_pred_gnb))

#### Training and Evaluating a Multinomial Naive Bayes Classifier

In [None]:
# MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train,y_train)
y_pred_mnb = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred_mnb))
print(confusion_matrix(y_test,y_pred_mnb))
print(precision_score(y_test,y_pred_mnb))

#### Training and Evaluating a Bernoulli Naive Bayes Classifier

In [None]:
# BernoulliNB
bnb = BernoulliNB()
bnb.fit(X_train,y_train)
# Predict with the Bernoulli model itself. The previous code called
# mnb.predict here, so the metrics below were MultinomialNB's results.
y_pred_bnb = bnb.predict(X_test)
print(accuracy_score(y_test,y_pred_bnb))
print(confusion_matrix(y_test,y_pred_bnb))
print(precision_score(y_test,y_pred_bnb))

#### Initializing a Term Frequency-Inverse Document Frequency (TF-IDF) Vectorizer (TfidfVectorizer)

In [None]:
tfidf = TfidfVectorizer()

#### Transforming Text Data into TF-IDF Features

In [None]:
X = tfidf.fit_transform(df["transformed_text"]).toarray()
X.shape

In [None]:
y = df["target"].values
y.shape

In [None]:
# Train , Split the Data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
# GaussianNB
gnb = GaussianNB()
gnb.fit(X_train,y_train)
y_pred_gnb = gnb.predict(X_test)
print(accuracy_score(y_test,y_pred_gnb))
print(confusion_matrix(y_test,y_pred_gnb))
print(precision_score(y_test,y_pred_gnb))

In [None]:
# MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train,y_train)
y_pred_mnb = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred_mnb))
print(confusion_matrix(y_test,y_pred_mnb))
print(precision_score(y_test,y_pred_mnb))

In [None]:
# BernoulliNB
bnb = BernoulliNB()
bnb.fit(X_train,y_train)
y_pred_bnb = bnb.predict(X_test)
print(accuracy_score(y_test,y_pred_bnb))
print(confusion_matrix(y_test,y_pred_bnb))
print(precision_score(y_test,y_pred_bnb))

#### Initializing Various Machine Learning Classifiers

In [None]:
svc = SVC(kernel="sigmoid",gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver="liblinear",penalty="l1")
rfc = RandomForestClassifier(n_estimators=50,random_state=2)
abc = AdaBoostClassifier(n_estimators=50,random_state=2,algorithm="SAMME")
bc = BaggingClassifier(n_estimators=50,random_state=2)
etc = ExtraTreesClassifier(n_estimators=50,random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)
xgb = XGBClassifier(n_estimators=50,random_state=2)

#### Storing Classifiers in a Dictionary

In [None]:
clfs = {
    "SVC":svc,
    "KN" :knc,
    "NB" :mnb,
    "DT" :dtc,
    "LR" :lrc,
    "RF" :rfc,
    "AdaBoost":abc,
    "BgC":bc,
    "ETC":etc,
    "GBDT":gbdt,
    "xgb":xgb
}

#### Training a Classifier

#### Explanation of the `train_classifier` Function
The `train_classifier` function performs the following steps:

Trains a classifier on the training data.

Predicts labels for the test data.

Evaluates the classifier's performance using accuracy and precision.

Returns the calculated metrics.

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training features and labels
    X_test, y_test : held-out features and labels

    Returns
    -------
    tuple
        ``(accuracy, precision)``. Precision is the class-weighted average
        (``average='weighted'``) with ``zero_division=1``, so it is not
        directly comparable to a ``precision_score`` call that uses the
        default averaging.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions, average='weighted', zero_division=1),
    )

In [None]:
# train classfier
train_classifier(svc,X_train,y_train,X_test,y_test)

#### Evaluating Multiple Classifiers
This loop evaluates the performance of multiple classifiers stored in the `clfs` dictionary, prints their accuracy and precision scores, and stores them in lists for later comparison.

In [None]:
accuracy_scores = []
precision_scores = []
for name,clf in clfs.items():
    current_accuracy,current_precision = train_classifier(clf,X_train,y_train,X_test,y_test)
    print("For: ",name)
    print("accuracy: ",current_accuracy)
    print("precision: ",current_precision)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

#### Creating a DataFrame for Classifier Performance

In [None]:
# create a dataframe for algorithms performace
performance_df = pd.DataFrame({"Algorithm":clfs.keys(),"Accuracy":accuracy_scores,"Precision":precision_scores})

In [None]:
performance_df

In [None]:
performance_df1 = pd.melt(performance_df,id_vars="Algorithm")

In [None]:
performance_df1

#### Visualizing Classifier Performance (Accuracy and Precision)

In [None]:
plt.style.use('dark_background')
plt.figure(figsize=(12, 6), dpi=150)
sns.barplot(data=performance_df1, x="Algorithm", y="value", hue="variable", palette="coolwarm")
plt.xlabel("Algorithm", fontsize=14, color='cyan', fontweight='bold')
plt.ylabel("Score", fontsize=14, color='cyan', fontweight='bold')
plt.title("Algorithm Performance (Accuracy & Precision)", fontsize=16, color='cyan', fontweight='bold')
plt.legend(loc=(1.1, 0.5), fontsize=12, title='Metrics', title_fontsize=14, labelspacing=1.2, borderpad=1)
plt.ylim(0.5, 1.0)
plt.tight_layout()
plt.xticks(rotation=90, color='lightgreen', fontsize=12, fontweight='light')
plt.savefig("algorithm_performance.png", dpi=300, bbox_inches='tight')
plt.show()

---
## Model Improvement
---

#### Changing `max_features` in TF-IDF Vectorizer

In [None]:
# Change max_features parameter of TfIdf
tfidf = TfidfVectorizer(max_features=3000)

#### Creating a DataFrame for Classifier Performance with `max_features=3000`

In [None]:
# Re-vectorize with the 3000-feature TF-IDF and retrain every classifier
# before recording scores. Without this step the lists still hold the
# baseline results, so the "max_ft_3000" columns would merely duplicate the
# Accuracy/Precision columns of performance_df.
X = tfidf.fit_transform(df["transformed_text"]).toarray()
y = df["target"].values
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

accuracy_scores = []
precision_scores = []
for name,clf in clfs.items():
    current_accuracy,current_precision = train_classifier(clf,X_train,y_train,X_test,y_test)
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

temp_df = pd.DataFrame({"Algorithm": list(clfs), "Accuracy_max_ft_3000": accuracy_scores, "Precision_max_ft_3000": precision_scores})

In [None]:
temp_df

#### Merging DataFrames to Combine Results

In [None]:
# merge Dataframes
new_df_max_ft = performance_df.merge(temp_df,on="Algorithm")
new_df_max_ft

#### Applying TF-IDF Vectorization to the Text Data

In [None]:
X = tfidf.fit_transform(df["transformed_text"]).toarray()

#### Initializing Min-Max Scaler

In [None]:
scaler = MinMaxScaler()

#### Scaling Features and Extracting Target Labels

In [None]:
X = scaler.fit_transform(X)
y = df["target"].values

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
# GaussianNB
gnb = GaussianNB()
gnb.fit(X_train,y_train)
y_pred_gnb = gnb.predict(X_test)
print(accuracy_score(y_test,y_pred_gnb))
print(confusion_matrix(y_test,y_pred_gnb))
print(precision_score(y_test,y_pred_gnb))

In [None]:
# MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train,y_train)
y_pred_mnb = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred_mnb))
print(confusion_matrix(y_test,y_pred_mnb))
print(precision_score(y_test,y_pred_mnb))

In [None]:
# BernoulliNB
bnb = BernoulliNB()
bnb.fit(X_train,y_train)
y_pred_bnb = bnb.predict(X_test)
print(accuracy_score(y_test,y_pred_bnb))
print(confusion_matrix(y_test,y_pred_bnb))
print(precision_score(y_test,y_pred_bnb))

In [None]:
accuracy_scores = []
precision_scores = []
for name,clf in clfs.items():
    current_accuracy,current_precision = train_classifier(clf,X_train,y_train,X_test,y_test)
    print("For: ",name)
    print("accuracy: ",current_accuracy)
    print("precision: ",current_precision)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

#### Creating a DataFrame for Scaled Classifier Performance

In [None]:
temp_df_scaled = pd.DataFrame({"Algorithm":clfs.keys(),"Accuracy_scaled":accuracy_scores,"Precision_scaled":precision_scores})

In [None]:
temp_df_scaled

#### Merging DataFrames to Combine Results with Scaled Features

In [None]:
# Merge Dataframes
new_df_scaled = new_df_max_ft.merge(temp_df_scaled,on="Algorithm")

In [None]:
new_df_scaled

#### Adding the Number of Characters as an Additional Feature

In [None]:
# Append the per-message character count as one extra feature column.
# NOTE(review): X was min-max scaled earlier, but characters_num is appended
# in its raw units — confirm this scale mismatch is intended.
X = np.hstack((X,df["characters_num"].values.reshape(-1,1)))
X.shape

In [None]:
y = df["target"].values
y.shape

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
# GaussianNB
gnb = GaussianNB()
gnb.fit(X_train,y_train)
y_pred_gnb = gnb.predict(X_test)
print(accuracy_score(y_test,y_pred_gnb))
print(confusion_matrix(y_test,y_pred_gnb))
print(precision_score(y_test,y_pred_gnb))

In [None]:
# MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train,y_train)
y_pred_mnb = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred_mnb))
print(confusion_matrix(y_test,y_pred_mnb))
print(precision_score(y_test,y_pred_mnb))

In [None]:
# BernoulliNB
bnb = BernoulliNB()
bnb.fit(X_train,y_train)
y_pred_bnb = bnb.predict(X_test)
print(accuracy_score(y_test,y_pred_bnb))
print(confusion_matrix(y_test,y_pred_bnb))
print(precision_score(y_test,y_pred_bnb))

In [None]:
accuracy_scores = []
precision_scores = []
for name,clf in clfs.items():
    current_accuracy,current_precision = train_classifier(clf,X_train,y_train,X_test,y_test)
    print("For: ",name)
    print("accuracy: ",current_accuracy)
    print("precision: ",current_precision)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

#### Creating a DataFrame for Classifier Performance with Number of Characters Feature

In [None]:
df_num_chars = pd.DataFrame({"Algorithm":clfs.keys(),"Accuracy_num_chars":accuracy_scores,"Precision_num_chars":precision_scores})

In [None]:
df_num_chars

#### Merging DataFrames to Combine Results with Number of Characters Feature

In [None]:
# Merge Dataframes
new_df_scaled_chars = new_df_scaled.merge(df_num_chars,on="Algorithm")

In [None]:
new_df_scaled_chars

#### Initializing Classifiers for Voting Classifier
These three classifiers will be used in a **Voting Classifier** to combine their predictions for improved accuracy.

In [None]:
# Voting Classifier
svc = SVC(kernel="sigmoid",gamma=1.0,probability=True)
mnb = MultinomialNB()
etc = ExtraTreesClassifier(n_estimators=50,random_state=2)

#### Initializing Voting Classifier

In [None]:
voting = VotingClassifier(estimators=[("svm",svc),("nb",mnb),("et",etc)],voting="soft")

In [None]:
voting.fit(X_train,y_train)

#### Making Predictions with the Voting Classifier

In [None]:
y_pred = voting.predict(X_test)

In [None]:
print("accuracy: ",accuracy_score(y_test,y_pred))
print("precision: ",precision_score(y_test,y_pred))

#### Initializing Estimators for Stacking

In [None]:
# Applying stacking: named base estimators (SVC, MultinomialNB, ExtraTrees)
estimators = [("svm",svc),("nb",mnb),("et",etc)]

#### Initializing the Final Estimator for Stacking

In [None]:
final_estimator = RandomForestClassifier()

#### Applying Stacking Classifier

In [None]:
clf = StackingClassifier(estimators=estimators,final_estimator=final_estimator)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [None]:
print("accuracy: ",accuracy_score(y_test,y_pred))
print("precision: ",precision_score(y_test,y_pred))

#### Saving the Model and Vectorizer using Pickle

In [None]:
# Persist a vectorizer/model pair that can be used together at inference time.
# `mnb` was re-created for the ensembles, and VotingClassifier /
# StackingClassifier fit clones of their estimators, so `mnb` itself is still
# unfitted at this point. Fit it on plain TF-IDF features (no scaling, no
# extra character-count column) so that it accepts exactly what
# `tfidf.transform(...)` produces.
X_tfidf = tfidf.transform(df["transformed_text"])
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    X_tfidf, df["target"].values, test_size=0.2, random_state=2
)
mnb.fit(X_train_tfidf, y_train_tfidf)

# Context managers close the files even if pickling raises.
with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open("model.pkl", "wb") as model_file:
    pickle.dump(mnb, model_file)