# Phishing Detection Using NLP Project

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

### Data Import

In [None]:
# Load the raw spam dataset; the file is decoded as ISO-8859-1.
csv_path = "./data/spam.csv"
df = pd.read_csv(csv_path, encoding='ISO-8859-1')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Discard the three unnamed columns, then give the two remaining columns
# descriptive names: v1 -> labels, v2 -> text.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df = df.drop(columns=unused_columns)
df = df.rename(columns={'v1': 'labels', 'v2': 'text'})

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove duplicate rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# (rows, columns) remaining after de-duplication.
df.shape

### Distribution Visualization

In [None]:
# Pie chart of how many messages carry each label, annotated with whole percents.
label_counts = df['labels'].value_counts()
label_counts.plot(kind='pie', autopct='%.0f%%')

### Word Cloud

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
from helper_func import TextClean

# Run every message through the project's TextClean transformer. The 'text'
# column is overwritten, so all later cells see the cleaned text.
tc = TextClean()
df['text'] = tc.fit_transform(df['text'])

# Join all tokens longer than two characters into one space-separated string
# for the word cloud.
corpus = ' '.join(
    token
    for message in df['text'].values
    for token in message.split(' ')
    if len(token) > 2
)

In [None]:
# Report the corpus size in space-separated words and in characters.
print(f"Size of corpus is {len(corpus.split(' '))} words and {len(corpus)} characters")

In [None]:
# Configure a 1200x1200 cloud on a white background and render it from the corpus.
wc = WordCloud(
    width=1200,
    height=1200,
    background_color='white',
    min_font_size=10,
)
word_cloud = wc.generate(corpus)

In [None]:
# Draw the rendered cloud on a large square canvas without axes or padding.
canvas_size = (15, 15)
plt.figure(figsize=canvas_size)
plt.title("Word Cloud for Text", fontsize=20)
plt.imshow(word_cloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

### Baseline Model

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the messages, stratified on the label so both splits keep
# the same class balance. The seed is fixed so the baseline numbers below are
# reproducible from one run of the notebook to the next.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['labels'],
    stratify=df['labels'],
    test_size=.2,
    random_state=42,
)

In [None]:
# Sanity-check the sizes of the four split parts.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

**Text Preprocessing**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import LabelEncoder

# Encode the string labels as integers; the mapping is learned on the
# training labels and reused for the test labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Token counts followed by tf-idf weighting. Both transformers are fitted on
# the training text only and then applied to the test text.
count = CountVectorizer()
tfidf = TfidfTransformer()
X_train = tfidf.fit_transform(count.fit_transform(X_train))
X_test = tfidf.transform(count.transform(X_test))


**Baseline Model Training**

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a default multinomial Naive Bayes on the tf-idf features and predict
# the held-out messages. fit() returns the estimator itself.
nb = MultinomialNB().fit(X_train, y_train)
y_pred = nb.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, f1_score, precision_score, accuracy_score, recall_score
# Per-class precision / recall / F1 of the baseline on the test split.
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

**Creating and setting up an MLflow experiment**

In [None]:
import mlflow

# Activate the 'Phishing_Detection' experiment, creating it on first use.
#
# The previous delete-then-create sequence only works once: delete_experiment
# is a soft delete, and MLflow refuses to create a new experiment whose name
# is still held by a deleted one, so re-running the cell raised an error.
# set_experiment is safe to re-run; runs from earlier sessions are kept.
experiment = mlflow.set_experiment(experiment_name='Phishing_Detection').experiment_id

In [None]:
# Encode the whole label column as integers so the split below yields
# numeric targets directly.
le = LabelEncoder().fit(df['labels'])
y = le.transform(df['labels'])

In [None]:
# Stratified 80/20 split of the text and the encoded labels. The seed is
# fixed so every hyperopt trial, and every re-run of the notebook, is scored
# on the same held-out messages.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    y,
    stratify=df['labels'],
    test_size=.2,
    random_state=42,
)

**Creating Text Preprocessing Pipeline**

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Text preprocessing chain: project-specific cleaning, token counts, then
# tf-idf weighting.
text_steps = [
    ('textprep', TextClean()),
    ('cv', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
]
preprocessor = Pipeline(text_steps)

**Hyper-parameter optimization with hyperopt**

In [None]:
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK
from hyperopt.pyll import scope

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import ColSpec, Schema


In [None]:
def objective(params):
    """Train, score and log one hyperopt candidate in a nested MLflow run.

    Args:
        params: Configuration sampled from the search space. ``params['type']``
            ('gb', 'rf' or 'nb') picks the classifier; every other entry is
            forwarded to that classifier's constructor.

    Returns:
        ``{'loss': -test_f1, 'status': STATUS_OK}`` for a recognised type
        (the F1 is negated because hyperopt minimises the loss), or ``0``
        when the type is not recognised.
    """
    with mlflow.start_run(nested=True):
        # Copy before removing 'type' so the caller's dict is not mutated.
        model_params = dict(params)
        classifier_type = model_params.pop('type')

        if classifier_type == 'gb':
            clf = GradientBoostingClassifier(**model_params)
        elif classifier_type == 'rf':
            clf = RandomForestClassifier(**model_params)
        elif classifier_type == 'nb':
            clf = MultinomialNB(**model_params)
        else:
            return 0

        # Preprocessing and classifier are fitted together so the logged
        # model accepts raw text.
        pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', clf)])
        pipeline.fit(X_train, y_train)

        predictions = pipeline.predict(X_test)

        # NOTE(review): trials are ranked on the X_test / y_test split, so
        # scores later reported on that same split are optimistic. Consider a
        # separate validation split or cross-validation.
        train_accuracy_score = pipeline.score(X_train, y_train)
        test_accuracy_score = accuracy_score(y_test, predictions)
        test_precision_score = precision_score(y_test, predictions)
        test_recall_score = recall_score(y_test, predictions)
        test_f1_score = f1_score(y_test, predictions)

        metrics = {
            'Train_accuracy_score': train_accuracy_score,
            'Test_accuracy_score': test_accuracy_score,
            'Test_precision_score': test_precision_score,
            'Test_recall_score': test_recall_score,
            'Test_f1_score': test_f1_score,
        }
        mlflow.log_metrics(metrics)

        # Named *_schema so the built-in input() is not shadowed.
        input_schema = Schema([ColSpec('string', 'text')])
        output_schema = Schema([ColSpec('integer')])
        signature = ModelSignature(inputs=input_schema, outputs=output_schema)
        mlflow.sklearn.log_model(pipeline,
                                 f'clf_hpo_{classifier_type}',
                                 signature=signature)
        mlflow.set_tags({'model': f'clf_hpo_{classifier_type}',
                         'ac': test_accuracy_score})

        return {'loss': -test_f1_score, 'status': STATUS_OK}


In [None]:
# One sub-space per classifier family. The 'type' entry tells objective()
# which estimator to build; the remaining entries are constructor arguments.
gb_space = {
    'type': 'gb',
    'n_estimators': scope.int(hp.quniform('n_estimators_gb', 100, 500, 50)),
    'loss': hp.choice('loss', ['log_loss', 'exponential']),
    'criterion': hp.choice('criterion', ['friedman_mse', 'squared_error']),
    'max_depth': scope.int(hp.quniform('max_depth', 4, 15, 1)),
    'min_samples_leaf': scope.int(hp.uniform('min_samples_leaf_gb', 1, 5)),
    'min_samples_split': scope.int(hp.uniform('min_samples_split_gb', 2, 6)),
}

rf_space = {
    'type': 'rf',
    'n_estimators': scope.int(hp.quniform('n_estimators_rf', 100, 500, 50)),
    'max_depth': scope.int(hp.quniform('max_depth_rf', 4, 15, 1)),
    'min_samples_leaf': scope.int(hp.uniform('min_samples_leaf_rf', 1, 5)),
    'min_samples_split': scope.int(hp.uniform('min_samples_split_rf', 2, 6)),
}

nb_space = {
    'type': 'nb',
    'alpha': hp.lognormal('alpha', 0, 1.0),
    'force_alpha': hp.choice('force_alpha', [True, False]),
}

# hyperopt first picks one family, then samples that family's parameters.
search_space = hp.choice('classifier_type', [gb_space, rf_space, nb_space])

In [None]:
# Tree-structured Parzen Estimator as the suggestion algorithm.
algo = tpe.suggest

# The outer run is the parent of the nested runs that objective() opens,
# one per trial; 32 candidates are evaluated in total.
with mlflow.start_run():
    best_result = fmin(fn=objective, space=search_space, algo=algo, max_evals=32)

In [None]:
import hyperopt
# Translate fmin's result back into concrete search-space values.
best_config = hyperopt.space_eval(search_space, best_result)
print(best_config)

In [None]:
# Same lookup as the previous cell, shown as rich cell output instead of printed.
hyperopt.space_eval(search_space, best_result)

In [None]:
# Keep only the estimator keyword arguments of the best configuration by
# removing the 'type' selector key.
params=hyperopt.space_eval(search_space, best_result)
del params['type']

**Registering best model**

In [None]:
import json

In [None]:
# Rank the experiment's runs by test F1, best first. Runs without the metric
# or without a logged model (e.g. the parent run that only groups the trials)
# are dropped.
experiment_id = mlflow.get_experiment_by_name('Phishing_Detection').experiment_id
ranked_runs = (
    mlflow.search_runs([experiment_id])
    .dropna(subset=['metrics.Test_f1_score', 'tags.mlflow.log-model.history'])
    .sort_values('metrics.Test_f1_score', ascending=False)
)

# .iloc[0] takes the first row *by position* after sorting. Plain [0] is a
# label lookup and returned whichever run had index label 0 before sorting,
# not the best-scoring one.
history = ranked_runs['tags.mlflow.log-model.history'].iloc[0]

# The tag is a JSON list with one entry per logged model; parse it as JSON
# instead of slicing the brackets off the string.
runs = json.loads(history)[0]
model_name = runs['artifact_path']
run_id = runs['run_id']

In [None]:
# Artifact path of the selected model; also used below as the registry name.
model_name

**Getting Prediction from best model**

In [None]:
# Build the runs:/ URI of the selected artifact and load it back as a
# scikit-learn object.
model_uri = f"runs:/{run_id}/{model_name}"
loaded_model = mlflow.sklearn.load_model(model_uri)

In [None]:


# NOTE(review): this refits the in-memory copy only. The artifact behind
# model_uri, which is what gets registered below, is not updated, so the
# report that follows may describe a different fit than the registered model.
# Confirm whether the refit is intended.
loaded_model.fit(X_train,y_train)

In [None]:
# Predict the held-out messages with the loaded pipeline.
y_pred=loaded_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, f1_score, precision_score, accuracy_score, recall_score
# Per-class precision / recall / F1 of the loaded model on the test split.
best_model_report = classification_report(y_test, y_pred)
print(best_model_report)

In [None]:
# Register the logged artifact in the model registry under model_name.
mlflow.register_model(model_uri, model_name)

In [None]:
# Client used by the following cells to edit the registered model version.
client = mlflow.MlflowClient()

In [None]:
# Describe the newest registered version rather than assuming it is version 1,
# which only holds the first time this model name is registered.
latest_version = max(
    int(mv.version)
    for mv in client.search_model_versions(f"name='{model_name}'")
)

# The best run is chosen by Test_f1_score, so the description says F1
# (it previously said accuracy).
client.update_model_version(
    name=model_name,
    version=latest_version,
    description='This model had the best test F1 score',
)

In [None]:
# Promote the newest registered version rather than assuming it is version 1,
# which only holds the first time this model name is registered.
latest_version = max(
    int(mv.version)
    for mv in client.search_model_versions(f"name='{model_name}'")
)

# NOTE(review): recent MLflow releases deprecate registry stages in favour of
# aliases - confirm against the installed MLflow version.
client.transition_model_version_stage(
    name=model_name,
    version=latest_version,
    stage='Production',
)

- Serve the model locally (replace `<model_uri>` with the URI of the selected model, e.g. `runs:/<run_id>/<model_name>`)

`mlflow models serve -m <model_uri> --env-manager local --host 127.0.0.1 --port 1234`

- Now open up a new tab

`curl -X POST -H "Content-Type:application/json" --data '{"dataframe_split": {"columns":["text"],"data":[["Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."]]}}' http://127.0.0.1:1234/invocations`