
**Deploying Machine Learning Model on Streamlit Cloud**

1.   Saving and Loading Sklearn Pipeline (Or Model)
2.   Streamlit Introduction
3.   Integrate Sklearn Model Into Streamlit
4.   Push Inetegated Code to Github Repo
5.   Deploy Github Repo into Streamlit Cloud






**Streamlit Code**
https://github.com/PradipNichite/sklearn_streamlit

In [1]:
!python -m spacy download en_core_web_sm

^C


In [2]:
import numpy as np
import pandas as pd 

# ## For tokenization and data pre-processing 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
# ## fro cleaning the data 
import string
import spacy
from sklearn import metrics

In [3]:
np.random.seed(42)

In [7]:
data = pd.read_csv("C:\\Users\\DSSGayathreeDevi\\Desktop\\surge_notebook\\NLP Module\\Spam Detector Component_1\\SMSSpamCollection.txt", sep = "\t", names=["label", "message"])
data['label'] = data['label'].map({'ham':0,'spam':1})
data_1 = data[data['label']==1]
data_0 = data[data['label']==0]
sample_count = len(data[data['label']==1])
data_0 = data_0.sample(sample_count, replace=True, random_state=1)
data_balanced = pd.concat([data_0,data_1], axis=0)
data = data_balanced
data.head()

Unnamed: 0,label,message
278,0,"Awesome, I'll see you in a bit"
4603,0,"Hey j! r u feeling any better, hopeSo hunny. i..."
1066,0,No my mum went 2 dentist.
3195,0,And you! Will expect you whenever you text! Ho...
3341,0,Like I made him throw up when we were smoking ...


In [8]:
nlp = spacy.load("en_core_web_sm")

In [9]:
stop_words = nlp.Defaults.stop_words
print(stop_words)
punctuations = string.punctuation
punctuations

{'by', 'sixty', 'formerly', 'at', 'afterwards', 'without', 'beside', 'hence', 'neither', 're', '’m', 'show', 'give', 'elsewhere', 'although', 'hers', 'which', 'been', 'nothing', 'and', 'yet', '‘ve', 'one', 'or', 'please', 'hereafter', 'it', 'since', 'nor', '‘re', 'often', 'more', 'herself', 'becomes', 'whereupon', 'my', 'four', 'though', 'becoming', 'anything', "'d", 'thus', 'you', 'latterly', 'unless', 'to', 'not', 'further', 'anyway', 'between', 'may', 'her', 'side', 'eleven', '‘ll', 'almost', 'yourself', 'part', 'name', 'using', 'can', 'be', 'really', 'due', 'other', 'anyhow', 'itself', 'has', 'just', 'nobody', 'had', 'seeming', 'make', 'once', 'through', 'see', 'therein', 'no', 'became', 'above', 'six', 'top', 'whereafter', 'yours', 'any', 'ten', 'others', 'go', 'herein', 'are', 'third', 'against', '‘s', 'but', 'twenty', 'mine', 'except', 'as', 'then', 'nevertheless', 'should', 'someone', 'whole', 'always', 'on', 'up', 'sometime', 'per', "n't", 'beyond', 'sometimes', 'your', 'there

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # print(doc)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() for word in doc ]

    # print(mytokens)

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [11]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [12]:
from sklearn.model_selection import train_test_split

X = data['message'] # the features we want to analyze
ylabels = data['label'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2)

In [13]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()

In [14]:
pipe = Pipeline([ ('vectorizer', tfidf_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x00000200862F8280>)),
                ('classifier', RandomForestClassifier())])

In [15]:
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.9364548494983278
Logistic Regression Precision: 0.9927007299270073
Logistic Regression Recall: 0.8831168831168831


In [16]:
pipe.predict(['Free entry pass'])

array([0], dtype=int64)

In [17]:
pipe.predict(['win lottery'])

array([1], dtype=int64)

In [18]:
pipe.predict(['I am waiting to get response from you'])

array([0], dtype=int64)

**Save and Load Sklearn Pipeline (Or Model)**

In [19]:
from joblib import dump, load

In [20]:
dump(pipe, 'pipeline.joblib') 

['pipeline.joblib']

In [21]:
pipeline = load('pipeline.joblib')

In [22]:
pipeline

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x00000200862F8280>)),
                ('classifier', RandomForestClassifier())])

In [23]:
pipeline.predict(['win lottery'])

array([1], dtype=int64)