In [166]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec,KeyedVectors
import gensim.downloader as api
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [167]:
df=pd.read_csv('spam_dataset.csv')

In [168]:
df.head()


Unnamed: 0,title,text,type
0,?? the secrets to SUCCESS,"Hi James,\n\nHave you claim your complimentary...",spam
1,?? You Earned 500 GCLoot Points,"\nalt_text\nCongratulations, you just earned\n...",not spam
2,?? Your GitHub launch code,"Here's your GitHub launch code, @Mortyj420!\n ...",not spam
3,[The Virtual Reward Center] Re: ** Clarifications,"Hello,\n \nThank you for contacting the Virtua...",not spam
4,"10-1 MLB Expert Inside, Plus Everything You Ne...","Hey Prachanda Rawal,\n\nToday's newsletter is ...",spam


In [169]:
df.shape

(84, 3)

## REMOVING SPECIAL CHARACTER


In [170]:
lemmatizer=WordNetLemmatizer()


In [171]:
titles = []
for i in range(len(df['title'])):
    review = re.sub('[^a-zA-Z]', ' ', df['title'][i])  # Fixed regex: 'A-z' to 'A-Z'
    review = review.lower()
    words = review.split()
    review = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]
    review = ' '.join(review)
    titles.append(review)


In [172]:
df['title']=titles

In [173]:
text = []
for i in range(len(df['text'])):
    review = re.sub('[^a-zA-Z]', ' ', df['text'][i])  # Fixed regex: 'A-z' to 'A-Z'
    review = review.lower()
    words = review.split()
    review = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]
    review = ' '.join(review)
    text.append(review)


In [174]:
df['text']=text

## Dependent and independent variables splitting

In [175]:
X=df.iloc[:,:-1]

In [176]:
y=df['type']

## Encoding Dependent variables

In [177]:
y=pd.get_dummies(df['type']).astype(int)

In [178]:
y=y.iloc[:,0]

## Train Test Split

In [179]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)


## Vectorization of Data using word2vec

In [180]:
from sklearn.feature_extraction.text import CountVectorizer

In [181]:
print(type(X_train))

<class 'pandas.core.frame.DataFrame'>


In [182]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv_title = TfidfVectorizer(max_features=2500, ngram_range=(1,2))
titles_train = tv_title.fit_transform(X_train['title']).toarray()
titles_test = tv_title.transform(X_test['title']).toarray()

tv_text = TfidfVectorizer(max_features=2500, ngram_range=(1,2))
text_train = tv_text.fit_transform(X_train['text']).toarray()
text_test = tv_text.transform(X_test['text']).toarray()




In [183]:
combined_train = np.hstack([titles_train, text_train])
combined_test = np.hstack([titles_test, text_test])

In [184]:
combined_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [185]:
combined_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [186]:
from sklearn.naive_bayes import MultinomialNB

In [187]:
baye=MultinomialNB()

In [188]:
baye.fit(combined_train,y_train)

In [189]:
combined_test.shape

(26, 2851)

In [190]:
print(combined_test)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [191]:
y_pred=baye.predict(combined_test)

In [192]:
# importing Libraries
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

In [193]:
cm = confusion_matrix(y_test, y_pred)
# Accuracy
accuracy = accuracy_score(y_test, y_pred)
# Precision
precision = precision_score(y_test, y_pred)
# Recall
recall = recall_score(y_test, y_pred)
# F1-Score
f1 = f1_score(y_test, y_pred)
# ROC Curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

print("Confusion Matrix:",cm)
print("Accuracy", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("ROC AUC:", roc_auc)

Confusion Matrix: [[ 2  7]
 [ 0 17]]
Accuracy 0.7307692307692307
Precision: 0.7083333333333334
Recall: 1.0
F1-Score: 0.8292682926829268
ROC AUC: 0.6111111111111112


In [194]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve

In [201]:
models={
    "Logisitic Regression":LogisticRegression(),
    "Decision Tree":DecisionTreeClassifier(),
    "Random Forest":RandomForestClassifier(),
    "Gradient Boost":GradientBoostingClassifier(),
    "Adaboost":AdaBoostClassifier(),
    "Xgboost":XGBClassifier(),
    "Naive_bayes":MultinomialNB()
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(combined_train,y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(combined_train)
    y_test_pred = model.predict(combined_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
    model_train_precision = precision_score(y_train, y_train_pred) # Calculate Precision
    model_train_recall = recall_score(y_train, y_train_pred) # Calculate Recall
    model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)


    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
    model_test_precision = precision_score(y_test, y_test_pred) # Calculate Precision
    model_test_recall = recall_score(y_test, y_test_pred) # Calculate Recall
    model_test_rocauc_score = roc_auc_score(y_test, y_test_pred) #Calculate Roc


    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))

    
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))

    
    print('='*35)
    print('\n')
    

Logisitic Regression
Model performance for Training set
- Accuracy: 0.9655
- F1 score: 0.9649
- Precision: 0.9535
- Recall: 1.0000
- Roc Auc Score: 0.9412
----------------------------------
Model performance for Test set
- Accuracy: 0.6923
- F1 score: 0.5985
- Precision: 0.6800
- Recall: 1.0000
- Roc Auc Score: 0.5556


Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.6923
- F1 score: 0.6816
- Precision: 0.7368
- Recall: 0.8235
- Roc Auc Score: 0.6340


Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.7308
- F1 score: 0.6681
- Precision: 0.7083
- Recall: 1.0000
- Roc Auc Score: 0.6111


Gradient Boost
Model performance for Training se

In [202]:
## Hyperparameter Training
rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, "auto", 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}
xgboost_params = {"learning_rate": [0.1, 0.01],
                  "max_depth": [5, 8, 12, 20, 30],
                  "n_estimators": [100, 200, 300],
                  "colsample_bytree": [0.5, 0.8, 1, 0.3, 0.4]}

In [203]:
# Models list for Hyperparameter tuning
randomcv_models = [
                   ("RF", RandomForestClassifier(), rf_params),
    ("Xgboost", XGBClassifier(), xgboost_params)
                   
                   ]

In [205]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1)
    random.fit(combined_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits


75 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
56 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Projects\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Projects\venv\lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Projects\venv\lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "c:\Projects\venv\lib\site-packages\sklearn\utils\_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_val

Fitting 3 folds for each of 100 candidates, totalling 300 fits
---------------- Best Params for RF -------------------
{'n_estimators': 100, 'min_samples_split': 20, 'max_features': 5, 'max_depth': 10}
---------------- Best Params for Xgboost -------------------
{'n_estimators': 100, 'max_depth': 20, 'learning_rate': 0.01, 'colsample_bytree': 1}


In [207]:
models={
    
    "Random Forest":RandomForestClassifier(n_estimators=100,min_samples_split=20,
                                          max_features=5,max_depth=10),
    "Xgboost":XGBClassifier(n_estimators=100,max_depth=20,learning_rate=0.01,
                           colsample_bytree=1)
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(combined_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(combined_train)
    y_test_pred = model.predict(combined_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
    model_train_precision = precision_score(y_train, y_train_pred) # Calculate Precision
    model_train_recall = recall_score(y_train, y_train_pred) # Calculate Recall
    model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)


    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
    model_test_precision = precision_score(y_test, y_test_pred) # Calculate Precision
    model_test_recall = recall_score(y_test, y_test_pred) # Calculate Recall
    model_test_rocauc_score = roc_auc_score(y_test, y_test_pred) #Calculate Roc


    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))

    
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))

    
    print('='*35)
    print('\n')

Random Forest
Model performance for Training set
- Accuracy: 0.8793
- F1 score: 0.8684
- Precision: 0.8542
- Recall: 1.0000
- Roc Auc Score: 0.7941
----------------------------------
Model performance for Test set
- Accuracy: 0.6923
- F1 score: 0.5985
- Precision: 0.6800
- Recall: 1.0000
- Roc Auc Score: 0.5556


Xgboost
Model performance for Training set
- Accuracy: 0.8103
- F1 score: 0.7762
- Precision: 0.7885
- Recall: 1.0000
- Roc Auc Score: 0.6765
----------------------------------
Model performance for Test set
- Accuracy: 0.6154
- F1 score: 0.5481
- Precision: 0.6522
- Recall: 0.8824
- Roc Auc Score: 0.4967


