# `SkLearn Basics: using Logistic Regression`

# <font color=red>Mr Fugu Data Science</font>
    
# (◕‿◕✿)

# Purpose & Outcome:

+ Use Binary Classification and go through an example to see how sklearn is used

+ Show Logistic regression, NLP and working with spam or no spam messages
    + Print metrics such as F1-score, precision, recall
    + show 3 ways to get accuracy

In [3]:
%matplotlib inline 

# For file operations
import requests
import pprint # for pretty printing
import os # listing and managing file path
import zipfile # for zip and unzip utilities
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # for plotting

In [11]:
# NLP
import string
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Training model:
# for converting documents in word count
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

from joblib import dump, load # save model and load them

In [4]:
# get our dataset of Ham/Spam:

data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
r = requests.get(data_url)

In [5]:
# Download our dataset:
sms_zip_file = 'smsspamcollection.zip'

with open(sms_zip_file, 'wb') as out_file:
    out_file.write(r.content)

In [6]:
# verify the download:
!ls -lth *.zip

-rw-r--r--  1 zatoichi59  staff   199K Oct 31 10:03 smsspamcollection.zip


In [7]:
with zipfile.ZipFile(sms_zip_file,"r") as zip_ref:
    zip_ref.extractall("data")

In [8]:
! ls data

SMSSpamCollection readme


In [10]:
!head -n 10 data/SMSSpamCollection

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham	U dun say so early hor... U c already then say...
ham	Nah I don't think he goes to usf, he lives around here though
spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
ham	Even my brother is not like to speak with me. They treat me like aids patent.
ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only

In [12]:
messages_df = pd.read_csv('./data/SMSSpamCollection', sep='\t',
                           names=["label", "message"])

messages_df.head(7)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...


# `Clean up our messages with some NLP before modeling`

In [13]:
english_stopwords = set(stopwords.words('english'))

In [17]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [18]:
def clean_messages(msg):
    msg = msg.lower()
    msg_tokens = nltk.word_tokenize(msg)
    #remove stopwords
    clean_msg_tokens = [w for w in msg_tokens if w not in english_stopwords]
    #remove puncuations
    clean_msg_tokens_puct = [ w for w in clean_msg_tokens if w not in string.punctuation]
    # lemmatize: deal with endings
    lemmatized_token = [wordnet_lemmatizer.lemmatize(w) for w in  clean_msg_tokens_puct]
    return lemmatized_token

In [19]:
messages_df['clean_message'] = messages_df.message.apply(clean_messages)

In [20]:
messages_df.head()

Unnamed: 0,label,message,clean_message
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy.., available, bugis,..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, ..., joking, wif, u, oni, ...]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, ..., u, c, already, ..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, n't, think, go, usf, life, around, though]"


In [22]:
# if/else converting labels to binary
y = np.where(messages_df.label == 'ham', 1, 0)

In [25]:
X_string_messages = messages_df.clean_message.apply(' '.join).values

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# `Train/Test split:`

+ If you don't have enough data this will be a problem with your accuracy

+ Leaving to little data left for the test set, you will also have problems with your accuracy as well. 

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_string_messages, y, test_size = 0.3,
    random_state = 37)
print ("X_train: ", len(X_train))
print("X_test: ", len(X_test))
print("y_train: ", len(y_train))
print("y_test: ", len(y_test))

X_train:  3900
X_test:  1672
y_train:  3900
y_test:  1672


# `Count Vectorizer:`

+ Convert a list/array of text documents into matrix of tokens. The dimensions will be the same as your vocabulary and keep a count of each word

In [33]:
cv = CountVectorizer()

cv.fit(X_train)


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=1500, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [35]:
X_train_cv = cv.transform(X_train)
X_train_cv

<3900x1500 sparse matrix of type '<class 'numpy.int64'>'
	with 25944 stored elements in Compressed Sparse Row format>

In [36]:
X_test_cv = cv.transform(X_test)
X_test_cv

<1672x1500 sparse matrix of type '<class 'numpy.int64'>'
	with 10963 stored elements in Compressed Sparse Row format>

# `Explain what is going on below:`

we are creating a variable `clf` which is storing our `Logistic Regression` model that will be used to make predictions on our data. 

+ `cv=5`: will be our cross validation (k-folds), here I chose 5.

Why use this `Cross Validation` anyway?

Well, it is useful for a few reasons: consider if you were splitting your data into `train/validate/split` this can drastically reduce the amount of data you have and if you don't have a heaping gluttonous amount of data then maybe you would need to remove the validation step.

You can use the `CV` with the train/test model and still be alright. But, you have to choose what are called `folds` or more commonly `k-folds`. This is the amount of sets you will create by splitting up your data or (`k-smaller`) sets of data.

+ The data are trained on **(k-1)** folds
+ Then the resulting data are checked for instance on accuracy

+ The result will be a performance of how your averaged `CV` was scored for that loop.

**Basically, cv=5 splits the data 5 times and each run will split a different percent train/test and in the end give back an averaged result.** That result can be your accuracy for instance. 


[sklearn doc for clarity](https://scikit-learn.org/stable/modules/cross_validation.html)

Now the model will try to take our training data and generate a fit based on what we provide. 

+ After this step we have to taken unseen data `test set` that was with held and see how it predicts from what we initially learned. The result will be our accuracy in this case, which will show that we have a 98.5% chance of test data correctly labeling Ham/Spam.

+ **Always, watchout for over/underfitting NO matter what you are doing! Good Life lesson ok---**

In [40]:
# convert/cast type as int
y_train=y_train.astype('int')
y_test=y_test.astype('int')

# Cross Val. with Log Regr.
clf=LogisticRegressionCV(cv=5,max_iter=200,scoring='accuracy')

print(clf.fit(X_train_cv,y_train))
print("Train Scoring ('Accuracy'): ",clf.score(X_train_cv,y_train))

LogisticRegressionCV()

LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=200, multi_class='auto', n_jobs=None,
                     penalty='l2', random_state=None, refit=True,
                     scoring='accuracy', solver='lbfgs', tol=0.0001, verbose=0)
Train Scoring:  0.9992307692307693


# `3 ways to do the same thing: `

+ **Predict Ham/spam: using Logistic Regression**

+ `Accuracy:` number_correctly_labeled_oberservations / total_oberservations

Can think of it like this:

`Accuracy` = (TP+TN) / (TP+FP+FN+TN)




|        	|          	| **Predicted**      	| Predicted      	|
|--------	|----------	|----------------	|----------------	|
|        	|          	| `Negative`       	| `Positive`       	|
| **Actual** 	| `Positive` 	| FN 	| TP  	|
    | **Actual** 	| `Negative` 	| TN  	| FP 	|
    
`----------------------`

**`A side not:`** if the amount of false negative and false positive are fairly close, than you have a good chance that `accuracy` will be a good call. But, if they are very different when compared to each other consider using F1-score.

In [46]:

clf.score(X_test_cv,y_test)

0.9856459330143541

In [50]:
from sklearn.metrics import accuracy_score
accuracy_score(clf.predict(X_test_cv),y_test)

0.9856459330143541

In [52]:
# By Hand:
np.sum(np.argmax(clf.predict_proba(X_test_cv),axis=1)==y_test)/len(y_test)

0.9856459330143541

# `Describe our Metrics:`

+ `Precision:` ratio predicted positive observations to total positive predicted observations

`Precision = TP / (TP + FP)`

+ `Recall:` sensitivity, correctly predicted positive observations to all observations in actual class.

`Recall = TP/(TP+FN)`, where the true positive = (TP+FP)

I need to explain two valid and important points here:

1.) If you have a bank that is looking at fraud detection and predicted to be negative, well that is a problems when indeed it was fraud. 

2.) Suppose you have a really bad virus, well like we have currently and you accidently predict that a sick person is not sick from some given test. Saying this would result in a False Negative and can be detrimental if you have a highly infectious disease.


+  `F1-score:` weighted average of recall and precission, use when there are unbalanced class distribution. For example,  a large amount of true negatives: TN 

`F1-Score = 2*(Recall * Precision) / (Recall + Precision)`


+ `Support:` frequency of items from all items that occur

In [53]:
from sklearn.metrics import classification_report

print(classification_report(y_test,clf.predict(X_test_cv)))

# Ham=1 , spam=0

              precision    recall  f1-score   support

           0       0.99      0.91      0.95       240
           1       0.99      1.00      0.99      1432

    accuracy                           0.99      1672
   macro avg       0.99      0.96      0.97      1672
weighted avg       0.99      0.99      0.99      1672



**`Outcome:`** 

+ We have a 98.5% `accuracy` meaning that from our data, we correctly predicted 98.5% of predicted observations/total observations 

+ We have a `precision` of 99% for both classes meaning that: we correctly predicted the positive observations to the total predicted positive observations.

+ `Recall`, was 91% predicted for spam meaning that the true positives were correctly predicted 91% of the time using Logistic regression.

+ `F1-score:` showed that the weighted ratio of precision and recall showed that 95% of the time spam was correctly labeled given we take into account false positive and false negatives.

Overall, we can see that the f1-score would be something we should focus on and see what other models may reflect these data better, if possible. Depeding on what threshold you would like to meet.

In [93]:
# ham=1, spam=0
ham_spam=pd.DataFrame(list(zip(y,  clf.predict(X_test_cv))),columns=['Actual','Predicted'])

ham_spam.head(10)

actual=np.where(ham_spam.Actual == 1, 'ham','spam')
predicted=np.where(ham_spam.Predicted == 1, 'ham','spam')
ham_spam_updt=pd.DataFrame(list(zip(actual,predicted)),columns=['Acutal','Predicted'])

pd.concat([ham_spam_updt,messages_df.iloc[:1672,1]],axis=1).head(10)



Unnamed: 0,Acutal,Predicted,message
0,ham,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,ham,Ok lar... Joking wif u oni...
2,spam,ham,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,ham,U dun say so early hor... U c already then say...
4,ham,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,ham,FreeMsg Hey there darling it's been 3 week's n...
6,ham,ham,Even my brother is not like to speak with me. ...
7,ham,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,ham,WINNER!! As a valued network customer you have...
9,spam,ham,Had your mobile 11 months or more? U R entitle...


# <font color=red>Like</font>, Share &

# <font color=red>SUB</font>scribe

# Citations & Help:

# ◔̯◔

https://machinelearningmastery.com/a-gentle-introduction-to-scikit-learn-a-python-machine-learning-library/

https://towardsdatascience.com/spam-classifier-in-python-from-scratch-27a98ddd8e73

https://www.kaggle.com/snehithatiger/spam-or-ham-classification

https://adataanalyst.com/scikit-learn/countvectorizer-sklearn-example/

https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/

https://www.youtube.com/watch?v=RZYjsw6P4nI

https://towardsdatascience.com/natural-language-processing-count-vectorization-with-scikit-learn-e7804269bb5e

https://blog.exsilio.com/all/accuracy-precision-recall-f1-score-interpretation-of-performance-measures/

https://towardsdatascience.com/accuracy-precision-recall-or-f1-331fb37c5cb9

https://towardsdatascience.com/association-rules-2-aa9a77241654