#### Import modules etc.

In [1]:
import pandas as pd
import nltk # Library is used for natural language processing
nltk.download('stopwords')
nltk.download('punkt')
import string # This module will help you quickly access some string constants.

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Алексей\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Алексей\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Load spam collection.

In [2]:
df = pd.read_csv('SMSSpamCollection.txt', sep='\t', header=None, names=['type', 'message'])

In [3]:
df.head(5)

Unnamed: 0,type,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Stop words are words which are filtered out before or after processing of natural language data (text).

In [4]:
stopwords_en = nltk.corpus.stopwords.words('english')
print(stopwords_en[:5])

['i', 'me', 'my', 'myself', 'we']


#### Punctuation.

In [5]:
punctuation = string.punctuation
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


#### Function for removing stopwords and punctuation.

In [6]:
def pre_process(message):
    # Remove punctuation and make text lowercase.
    string_text = "".join([char.lower() for char in message if char not in punctuation])
    tokenize = nltk.tokenize.word_tokenize(string_text)
    string_text = [word for word in tokenize if word not in stopwords_en]
    return string_text

#### Creating new column in dataframe with processed text.

In [7]:
df['processed_message'] = df['message'].apply(lambda x: pre_process(x))

In [8]:
df.head(5)

Unnamed: 0,type,message,processed_message
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


#### Now categorizing & Counting Tokens

In [9]:
def categorize_words():
    spam_words, ham_words = [], []
    
    # Put spam associated words to list.
    for message in df['processed_message'][df['type']== 'spam']:
        for word in message:
            spam_words.append(word)
            
    # Put ham associated words to list.
    for message in df['processed_message'][df['type']== 'ham']:
        for word in message:
            ham_words.append(word)
            
    return spam_words, ham_words

In [10]:
spam_words, ham_words = categorize_words()

In [11]:
spam_words[:5]

['free', 'entry', '2', 'wkly', 'comp']

In [12]:
ham_words[:5]

['go', 'jurong', 'point', 'crazy', 'available']

#### Building predict function, that excepts user generated text and returns the verdict - spam or not.

In [13]:
def predict(user_input):
    
    spam_counter, ham_counter = 0, 0
    
    for word in user_input:
        spam_counter += spam_words.count(word)
        ham_counter += ham_words.count(word)
    
    print('*' * 20 + 'RESULTS' + '*' * 20) 
    print('Ham counter:', ham_counter)
    print('Spam counter:', spam_counter)
    
    if spam_counter != 0:
        ham_spam_ratio = ham_counter / spam_counter
        print('Ham/Spam ratio is :', '{:.1%}'.format(ham_spam_ratio))
    
    if ham_counter + spam_counter != 0:
        ham_part = ham_counter / (ham_counter + spam_counter)
        spam_part = spam_counter / (ham_counter + spam_counter)
        print('Ham part is :', '{:.1%}'.format(ham_part))
        print('Spam part is :', '{:.1%}'.format(spam_part))
    
    print('\n')

examples:

Had your mobile 11 months or more? U R entitle...<br>
I'm gonna be home soon and i don't want to tal...<br>
SIX chances to win CASH! From 100 to 20,000 po..<br>
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's<br>

#### Collection user input

In [14]:
while 1:
    user_input = input('Please type a spam or ham message (type EXIT to end work):\n')
    
    # Check if user wants to escape.
    if user_input == 'EXIT':
        break
    
    # Process user input.
    processed_input = pre_process(user_input)
    
    # Function returns results by printing.
    predict(processed_input)

Please type a spam or ham message (type EXIT to end work):
Had your mobile 11 months or more? U R entitle...
********************RESULTS********************
Ham counter: 1144
Spam counter: 302
Ham/Spam ratio is : 378.8%
Ham part is : 79.1%
Spam part is : 20.9%


Please type a spam or ham message (type EXIT to end work):
I'm gonna be home soon and i don't want to tal...
********************RESULTS********************
Ham counter: 1251
Spam counter: 79
Ham/Spam ratio is : 1583.5%
Ham part is : 94.1%
Spam part is : 5.9%


Please type a spam or ham message (type EXIT to end work):
SIX chances to win CASH! From 100 to 20,000 po..
********************RESULTS********************
Ham counter: 29
Spam counter: 176
Ham/Spam ratio is : 16.5%
Ham part is : 14.1%
Spam part is : 85.9%


Please type a spam or ham message (type EXIT to end work):
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
*