In [1]:
import os

In [2]:
# imports
import pandas as pd
import numpy as np

In [3]:
# get test and train dataframes
rawTextData = os.path.join('dataset', 'SPAM text message 20170820 - Data.csv')
print(rawTextData)
text_df = pd.read_csv(rawTextData, index_col=False)

dataset\SPAM text message 20170820 - Data.csv


In [4]:
# check data type 
type(text_df)

pandas.core.frame.DataFrame

In [5]:
# use .info() to get the basic information about the data frame
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
Category    5572 non-null object
Message     5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [6]:
text_df.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [7]:
text_df['Message'][100]

"Please don't text me anymore. I have nothing else to say."

In [8]:
### Encoding ham to 0 and spam to 1
# Read both columns positionally so the encoding does not depend on the
# DataFrame's index labels: text_df['Message'][i] is a label lookup and only
# lines up with enumerate() while the index is exactly 0..n-1.
texts = text_df['Message'].tolist()
# 'ham' -> 0, every other category -> 1
labels = [0 if label == 'ham' else 1 for label in text_df['Category']]

print(labels[0:20])

# show the first five messages (slicing copes with shorter frames)
for message in texts[:5]:
    print(message)
        

[0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1]
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Ok lar... Joking wif u oni...
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
U dun say so early hor... U c already then say...
Nah I don't think he goes to usf, he lives around here though


In [9]:
type(texts)

list

In [10]:
texts = np.asarray(texts)
labels = np.asarray(labels)

In [None]:
type(texts)

In [None]:
print(texts[0])

## Data analysis

### Important to note: this is imbalanced data

###### There are more ham than spam mails

###### Ratio -- ham : spam = 6.5 : 1

In [None]:
print(np.unique(labels))
print(np.bincount(labels))

In [None]:
# ham-to-spam ratio taken from the encoded labels (0 = ham, 1 = spam) rather
# than hand-typed counts: the typed 4825 and 745 add up to 5570, not the 5572
# rows that text_df.info() reports.
ham_count, spam_count = np.bincount(labels, minlength=2)
ham_count / spam_count

#### Shuffle data and split 80% data for train and 20% for test

In [None]:
# Fixed seed so the permutation -- and therefore the later train/test split --
# is the same on every run.
np.random.seed(42)

# One random permutation of the row positions, applied to both arrays so that
# every message stays paired with its label.
indices = np.arange(len(text_df))
np.random.shuffle(indices)
texts, labels = texts[indices], labels[indices]

In [None]:
split_percent = 0.8

In [None]:
# Split the shuffled data: the first split_percent of the rows go to training,
# the remainder to test.
training_samples = int(len(texts) * split_percent)

train_data_X, test_data_x = texts[:training_samples], texts[training_samples:]
train_data_Y, test_data_y = labels[:training_samples], labels[training_samples:]

# Sanity-check the split. Assertions fail loudly, whereas the previous `if`
# could only skip the report silently.
assert len(train_data_X) + len(test_data_x) == len(texts), "split lost or duplicated rows"
assert len(train_data_X) == len(train_data_Y), "train texts/labels differ in length"
assert len(test_data_x) == len(test_data_y), "test texts/labels differ in length"

# The percentages are formatted from split_percent so the message stays true
# if the ratio is changed.
print("Split Success, {:.0%} training data, {:.0%} test data".format(split_percent, 1 - split_percent))
print("train_X data size: {},   test_x data size: {}".format(len(train_data_X), len(test_data_x)))
print("train_Y data size: {},   test_y data size: {}".format(len(train_data_Y), len(test_data_y)))

#### Bag of words

In [None]:
'''
BAG OF WORDS MODEL TO SPAM HAM train_data_X
'''

from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training messages only and build their
# document-term count matrix in the same pass.
countVect = CountVectorizer()
X_train = countVect.fit_transform(train_data_X)
print(repr(X_train))

In [None]:
type(X_train)

In [None]:
X_train.shape

In [None]:
print(len(X_train.toarray()[0]))

In [None]:
X_test = countVect.transform(test_data_x)

In [None]:
np.bincount(train_data_Y)

#### Logistic Regression

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

In [None]:
logreg = LogisticRegression()
param_grid = {'C': [0.01, 0.1, 1, 10, 20, 50, 100]}

grid = GridSearchCV(logreg, param_grid, cv=5)
logreg_train = grid.fit(X_train, train_data_Y)

In [None]:
print(grid.best_estimator_)

In [None]:
# logreg = LogisticRegression(C=0.1).fit(X_train, y_train)
pred_logreg = logreg_train.predict(X_test)
print("accuracy is: ", grid.score(X_test, test_data_y))

## Note: this is an imbalanced dataset, so plain accuracy is not reliable (always predicting the majority class already results in high accuracy)

## So it is important to check the confusion matrix!

In [None]:
confusion = confusion_matrix(test_data_y, pred_logreg)
print(confusion)

## How to improve?

## Check the vocabulary

In [None]:
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# get_feature_names_out() (an ndarray, converted to a plain list so the slices
# below print as before) and fall back only on releases that predate it.
if hasattr(countVect, "get_feature_names_out"):
    features_names = countVect.get_feature_names_out().tolist()
else:
    features_names = countVect.get_feature_names()
print(len(features_names))
print("\n")
# print first 20 features
print(features_names[:20])
print("\n")
# print last 20 features
print(features_names[-20:])
print("\n")
# print every 400th word
print(features_names[::400])

## As we saw above, there are many uninformative features (words), such as numbers
## We will use only the words that appear in at least 3 emails (documents) -- in other words, frequent words which are also likely to be in the test set

In [None]:
# min_df controls this condition(min_df=3 means pick up words which appear
# at least 3 documents)
vect = CountVectorizer(min_df=3).fit(train_data_X)

X_train = vect.transform(train_data_X)
X_test = vect.transform(test_data_x)
print(repr(X_train))

In [None]:
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# get_feature_names_out() (an ndarray, converted to a plain list so the slices
# below print as before) and fall back only on releases that predate it.
if hasattr(vect, "get_feature_names_out"):
    features_names = vect.get_feature_names_out().tolist()
else:
    features_names = vect.get_feature_names()
print(len(features_names))
print("\n")
# print first 20 features
print(features_names[:20])
print("\n")
# print last 20 features
print(features_names[-20:])
print("\n")
# print every 400th word
print(features_names[::400])

In [None]:
logreg = LogisticRegression()
param_grid = {'C': [0.01, 0.1, 1, 10, 20, 100]}

grid = GridSearchCV(logreg, param_grid, cv=5)
logreg_train = grid.fit(X_train, train_data_Y)

In [None]:
# logreg = LogisticRegression(C=0.1).fit(X_train, y_train)
pred_logreg = logreg_train.predict(X_test)
print("accuracy is: ", grid.score(X_test, test_data_y))

## False positive rate and false negative rate decreased by 2

In [None]:
confusion = confusion_matrix(test_data_y, pred_logreg)
print(confusion)

## Another way to improve this accuracy is to remove stop words

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print("Number of stop words is :", len(ENGLISH_STOP_WORDS), "\n")
print("Examples: ", list(ENGLISH_STOP_WORDS)[::10])

In [None]:
vect = CountVectorizer(min_df=3, stop_words='english').fit(train_data_X)
X_train = vect.transform(train_data_X)
X_test = vect.transform(test_data_x)
print(repr(X_train))

## Now there are 226 (2404 - 2178) fewer features

In [None]:
logreg = LogisticRegression()
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

grid = GridSearchCV(logreg, param_grid, cv=5)
logreg_train = grid.fit(X_train, train_data_Y)

In [None]:
# logreg = LogisticRegression(C=0.1).fit(X_train, y_train)
pred_logreg = logreg_train.predict(X_test)
print("accuracy is: ", grid.score(X_test, test_data_y))
confusion = confusion_matrix(test_data_y, pred_logreg)
print("confusion matrix \n", confusion)

## 2 ) TFIDF
Difference from bag of words -- TF-IDF gives high weight to words that appear often in a particular document but in few documents overall. In other words, it recognizes the words that characterize a specific document as important (since these are the distinguishing features of that document).

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

In [None]:
logreg = LogisticRegression()

In [None]:
pipe = make_pipeline(TfidfVectorizer(min_df=3, norm=None, stop_words='english'), logreg)
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}

grid = GridSearchCV(pipe, param_grid, cv=5)
logreg_train = grid.fit(train_data_X, train_data_Y)

In [None]:
grid

In [None]:
logreg.get_params().keys()

In [None]:
print(grid.best_estimator_)

In [None]:
# logreg = LogisticRegression(C=0.1).fit(X_train, y_train)
pred_logreg = logreg_train.predict(test_data_x)
print("accuracy is: ", grid.score(test_data_x, test_data_y))
confusion = confusion_matrix(test_data_y, pred_logreg)
print("confusion matrix \n", confusion)

## Check which words are considered to be low tfidf (widely used words across many emails) and high tfidf (used only in a few emails)

In [None]:
# Re-use the TF-IDF step of the best pipeline found by the grid search and
# apply it to the training messages.
best_pipeline_steps = grid.best_estimator_.named_steps
vectorizer = best_pipeline_steps["tfidfvectorizer"]
X_train = vectorizer.transform(train_data_X)

# Largest TF-IDF value each feature reaches in any training message (flattened
# to 1-D), then the feature indices ordered from smallest to largest maximum.
max_value = X_train.max(axis=0).toarray().ravel()
sorted_by_tfidf = np.argsort(max_value)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
# Either way the result is an ndarray so it can be fancy-indexed below.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = np.array(vectorizer.get_feature_names_out())
else:
    feature_names = np.array(vectorizer.get_feature_names())

In [None]:
print("features with lowest tfidf")
print(feature_names[sorted_by_tfidf[:20]], '\n')

print("featues with hightest tfidf")
print(feature_names[sorted_by_tfidf[-20:]])

In [None]:
print(sorted_by_tfidf)

## Examine the low inverse document frequency (words used in many emails)

In [None]:
sorted_by_idf = np.argsort(vectorizer.idf_)
print("features with lowest idf")
print(feature_names[sorted_by_idf[:100]])

## Check which words are most important by looking at the coefficients learned. Red words are for ham, blue words are for spam.

In [None]:
!pip install mglearn

In [None]:
import mglearn
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
mglearn.tools.visualize_coefficients(grid.best_estimator_.named_steps['logisticregression'].coef_, 
                                   feature_names, n_top_features=40)
plt.title("tfidf-cofficient")

## 3 ) N-grams
-- a single word is often a misleading predictor. For example, "like" and "don't like" have opposite meanings

In [None]:
texts_test = test_data_x
y_test = test_data_y
texts_train = train_data_X
y_train = train_data_Y

In [None]:
pipe = make_pipeline(TfidfVectorizer(min_df=3, stop_words='english'), logreg)
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100], 
              'tfidfvectorizer__ngram_range': [(1,1), (1,2), (1,3)]}

grid = GridSearchCV(pipe, param_grid, cv=5)
logreg_train = grid.fit(texts_train, y_train)

# logreg = LogisticRegression(C=0.1).fit(X_train, y_train)
pred_logreg = logreg_train.predict(texts_test)
print("accuracy is: ", grid.score(texts_test, y_test))
confusion = confusion_matrix(y_test, pred_logreg)
print("confusion matrix \n", confusion)

### Bigram is the best parameter(predictor)

In [None]:
print(grid.best_estimator_)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Mean cross-validated score of every parameter combination, in the order the
# grid search evaluated them.
scores = grid.cv_results_['mean_test_score']

# The grid is enumerated with its keys sorted and the last key varying fastest,
# so consecutive entries step through ngram_range for a fixed C. Reshape to
# (n_C, n_ngram_ranges) and transpose to get one row per ngram_range and one
# column per C. The column count comes from the grid itself rather than a
# hard-coded 3, so the plot stays correct if the grid is edited.
n_ngram_ranges = len(param_grid['tfidfvectorizer__ngram_range'])
scores = np.array(scores).reshape(-1, n_ngram_ranges).T

heatmap = mglearn.tools.heatmap(scores, xlabel="C", ylabel="ngram_range", 
                                xticklabels=param_grid['logisticregression__C'], 
                                yticklabels=param_grid['tfidfvectorizer__ngram_range'], 
                                cmap='viridis', fmt="%.3f")
plt.colorbar(heatmap);

In [None]:
# Pull the two fitted steps out of the best pipeline found by the grid search.
best_steps = grid.best_estimator_.named_steps
tfidf_step = best_steps['tfidfvectorizer']

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(tfidf_step, "get_feature_names_out"):
    feature_names = np.array(tfidf_step.get_feature_names_out())
else:
    feature_names = np.array(tfidf_step.get_feature_names())
coef = best_steps['logisticregression'].coef_
mglearn.tools.visualize_coefficients(coef,feature_names, n_top_features=40)
plt.title("tfidf-cofficient")

## 4 ) Stemming and lemmatization

In [None]:
import spacy
import nltk

#### Move down 4 - grid search

In [None]:
type(logreg)

In [None]:
logreg

In [None]:
# A bare LogisticRegression is searched here, so it gets its own grid. The
# notebook-level param_grid was last assigned for the TF-IDF pipeline and holds
# 'logisticregression__C' / 'tfidfvectorizer__ngram_range', names that a plain
# LogisticRegression does not accept as parameters.
grid = GridSearchCV(logreg, {'C': [0.01, 0.1, 1, 10, 100]}, cv=5)

In [None]:
type(grid)

In [None]:
logreg_train = grid.fit(X_train, train_data_Y)

In [None]:
#logreg_train = logreg.fit(X_train, train_data_Y)

In [None]:
# logreg = LogisticRegression(C=0.1).fit(X_train, y_train)
pred_logreg = logreg_train.predict(X_test)
print("accuracy is: ", grid.score(X_test, test_data_y))

In [None]:
confusion = confusion_matrix(test_data_y, pred_logreg)
print(confusion)

#### Move Down 3  -- BAG OF WORDS

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
countVect = CountVectorizer()

In [None]:
# Example
toy_samples = ["It is sunny today and I like it, ", 
               "she does not like hamburger"]

# tokenize document in words
countVect.fit(toy_samples)
print("tokenization")
print(countVect.vocabulary_, "\n")

In [None]:
# transform document into a matrix(the number indicates the number of words showing up in the document)
bag_of_words = countVect.transform(toy_samples)

In [None]:
type(bag_of_words)

In [None]:
print(bag_of_words)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() (converted to a list so it prints as before) and
# fall back only on releases that predate it.
if hasattr(countVect, "get_feature_names_out"):
    print(countVect.get_feature_names_out().tolist())
else:
    print(countVect.get_feature_names())

In [None]:
print("Transformed sparse matrix is: ")
print(bag_of_words.toarray())

In [None]:
'''
Extra practice on bag of words model
'''

from sklearn.feature_extraction.text import CountVectorizer
countVect = CountVectorizer()

toy_samples = ['This is the first document.',
               'This document is the second document.',
              'And this is the third one.',
              'Is this the first document?',]


# tokenize document in words
countVect.fit(toy_samples)
print("tokenization")
print(countVect.vocabulary_, "\n")

# transform document into a matrix(the number indicates the number of words showing up in the document)
bag_of_words = countVect.transform(toy_samples)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() (converted to a list so it prints as before) and
# fall back only on releases that predate it.
if hasattr(countVect, "get_feature_names_out"):
    print(countVect.get_feature_names_out().tolist())
else:
    print(countVect.get_feature_names())

print("Transformed sparse matrix is: ")
print(bag_of_words.toarray())

#### Move down 2

In [None]:
test_data_x[0]

In [None]:
if test_data_x[0] in train_data_X:
    print("Index: ", np.where(train_data_X == test_data_x[0]))
    print("Found")

In [None]:
# Small demo array containing the value 15 at several positions
arr = np.array([11, 12, 13, 14, 15, 16, 17, 15, 11, 12, 14, 15, 16, 17])

# Build the boolean mask first, then ask for the positions where it is True;
# the result is a one-element tuple holding the index array.
is_fifteen = (arr == 15)
result = np.nonzero(is_fifteen)

print(result)

In [None]:
# Split the space-separated alphabet into a list of single letters
listVar = 'A B C D E F G H I J K L M N O P Q R S T U V W X Y Z'.split()

letter_count = len(listVar)          # how many letters were produced
position_of_c = listVar.index('C')   # zero-based position of 'C'
last_five = listVar[-5:]             # tail of the list

print(letter_count)
print(position_of_c)
print(last_five)

In [None]:
print(len(indices))
print(training_samples+2)
# `test_samples` is not defined anywhere in the notebook, so printing it raised
# NameError. The number of test samples is the size of the test slice.
print(len(test_data_x))

In [None]:
print(train_data_X[-3:])
print(test_data_x[0])
print(train_data_Y[-5:])
print(test_data_y[:5])

print(np.unique(test_data_y))
print(np.bincount(test_data_y))

In [None]:
print(texts[:5])

In [None]:
print(labels)

In [None]:
print(texts[:5])

### Move down 1

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer(max_df=0.5,min_df=2,stop_words='english')

In [None]:
X = vectorizer.fit_transform(text_df['Message'])
X

In [None]:
X[0]

In [None]:
print(X[0])

In [None]:
from sklearn.cluster import KMeans

In [None]:
km = KMeans(n_clusters = 3, init = 'k-means++', max_iter = 100, n_init = 1, verbose = True)

In [None]:
km.fit(X)

In [None]:
import numpy as np
np.unique(km.labels_, return_counts=True)

In [None]:
# Collect the messages of each cluster, then join them with a space.
# Appending messages back-to-back with no separator fused the last word of one
# message with the first word of the next, corrupting the tokens counted later.
# Building a list and joining once also avoids repeated string re-allocation.
docs_by_cluster = {}
for cluster, oneDocument in zip(km.labels_, text_df['Message']):
    docs_by_cluster.setdefault(cluster, []).append(oneDocument)

# cluster id -> one string containing all of that cluster's messages
text = {cluster: ' '.join(docs) for cluster, docs in docs_by_cluster.items()}

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import defaultdict
from string import punctuation
from heapq import nlargest
import nltk 

In [None]:
_stopwords = set(stopwords.words('english') + list(punctuation)+["million","billion","year","millions","billions","y/y","'s","''","``"])
 

In [None]:
# Per cluster: the 100 most frequent non-stopword tokens (keywords) and the
# full token frequency table (counts).
keywords = {}
counts = {}
for cluster in range(3):
    # lower-case the cluster's text, tokenize it, and drop stopwords/punctuation
    tokens = [token
              for token in word_tokenize(text[cluster].lower())
              if token not in _stopwords]
    freq = FreqDist(tokens)
    counts[cluster] = freq
    keywords[cluster] = nlargest(100, freq, key=freq.get)

In [None]:
# For every cluster keep the 10 most frequent keywords that appear in no other
# cluster's keyword list. The "other clusters" set is built by looping over all
# clusters present in `keywords`, so this works for any number of clusters
# rather than exactly three.
unique_keys = {}
for cluster in keywords:
    keys_other_clusters = set()
    for other_cluster, other_words in keywords.items():
        if other_cluster != cluster:
            keys_other_clusters.update(other_words)
    unique = set(keywords[cluster]) - keys_other_clusters
    # rank the cluster-specific words by their frequency within this cluster
    unique_keys[cluster] = nlargest(10, unique, key=counts[cluster].get)

In [None]:
unique_keys

In [None]:
article = "Facebook Inc. has been giving advertisers an inflated metric for the average time users spent watching a video, a measurement that may have helped boost marketer spending on one of Facebook’s most popular ad products. The company, owner of the world’s largest social network, only counts a video as viewed if it has been seen for more than 3 seconds. The metric it gave advertisers for their average video view time incorporated only the people who had watched the video long enough to count as a view in the first place, inflating the metric because it didn’t count anyone who didn’t watch, or watched for a shorter time. Facebook’s stock fell more than 1.5 percent in extended trading after the miscalculation was earlier reported in the Wall Street Journal. Facebook had disclosed the mistake in a posting on its advertiser help center web page several weeks ago. Big advertising buyers and marketers are upset about the inflated metric, and asked the company for more details, according to the report in the Journal, citing unidentified people familiar with the situation. The Menlo Park, California-based company has kept revenue surging in part because of enthusiasm for its video ads, which advertisers compare in performance to those on Twitter, YouTube and around the web."

In [None]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=3)
classifier.fit(X,km.labels_)

In [None]:
# Python 3 str has no .decode(); the original Python 2 idiom raised
# AttributeError. Strip the non-ASCII characters (e.g. the curly apostrophes in
# the article) by encoding to ASCII with errors ignored, then decode back to
# str for the vectorizer.
ascii_article = article.encode('ascii', errors='ignore').decode('ascii')
test=vectorizer.transform([ascii_article])

In [None]:
classifier.predict(test)

In [None]:
from dotenv import load_dotenv, find_dotenv

In [None]:
import requests
from requests import session
import os

In [None]:
# payload
# Credentials must never be stored in the notebook source. They are read from
# the environment, optionally populated from a local .env file via the
# python-dotenv helpers imported in an earlier cell.
# NOTE(review): the username/password previously committed in this cell should
# be treated as leaked and rotated.
load_dotenv(find_dotenv())
payload = {
    'action'   : 'login',
    'username' : os.environ['KAGGLE_USERNAME'],
    'password' : os.environ['KAGGLE_PASSWORD']
}

url = 'https://www.kaggle.com/team-ai/spam-text-message-classification/downloads/SPAM%20text%20message%2020170820%20-%20Data.csv/1'

# Log in and download within one session so the login cookies are re-used.
with session() as c:
    c.post('https://www.kaggle.com/account/login', data=payload)
    response = c.get(url)
    print(response.text)

#### Move down - Initial Data extraction 

In [None]:
#!kaggle datasets list -s spam

In [None]:
#!kaggle datasets download -d team-ai/spam-text-message-classification

In [None]:
!ls -ltr

In [None]:
#'''
#import zipfile
#with zipfile.ZipFile("spam-text-message-classification.zip","r") as zip_ref:
#    zip_ref.extractall("dataset")
#'''

In [None]:
!ls -ltr dataset