In [1]:
import pandas as pd

# Load the SMS spam data set from a tab-separated file. Column names are
# supplied explicitly, so the first line of the file is read as data.
messages = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
)
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
import nltk
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance, reused for every token when the corpus is built.
# NOTE(review): presumably requires the NLTK 'wordnet' and 'stopwords' data to be
# installed beforehand (e.g. via nltk.download) — confirm the environment setup.
wordnet_lemmatizer = WordNetLemmatizer()

In [5]:
# Build the cleaned corpus: one normalised, space-joined string per SMS message.

# Materialise the English stop-word list once as a set. Calling
# stopwords.words('english') for every token builds a fresh list each time and
# makes every membership test a linear scan.
english_stopwords = set(stopwords.words('english'))

# Strict ASCII-letter class. The previous range 'A-z' also covered the
# punctuation that sits between 'Z' and 'a' in ASCII ([ \ ] ^ _ `), so those
# characters were not stripped and leaked into the tokens.
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the column values directly rather than looking rows up by
# integer label, which only works when the frame has the default 0..n-1 index.
for message in messages['message']:
    # Replace every non-letter with a space, lower-case, split on whitespace.
    words = non_letters.sub(' ', message).lower().split()
    # Stop words are filtered on the raw token; survivors are lemmatized.
    lemmas = [
        wordnet_lemmatizer.lemmatize(word)
        for word in words
        if word not in english_stopwords
    ]
    corpus.append(' '.join(lemmas))

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the cleaned corpus, capping the vocabulary at
# 100 features, then convert the resulting matrix to a dense array.
tfidf = TfidfVectorizer(max_features=100)
X = tfidf.fit_transform(corpus)
X = X.toarray()

# Show the mapping from each retained term to its column index.
tfidf.vocabulary_

{'go': 22,
 'great': 26,
 'got': 25,
 'wat': 89,
 'ok': 58,
 'free': 18,
 'win': 93,
 'text': 77,
 'txt': 85,
 'say': 68,
 'already': 0,
 'think': 80,
 'life': 38,
 'hey': 29,
 'week': 91,
 'back': 5,
 'like': 39,
 'still': 73,
 'send': 70,
 'friend': 19,
 'prize': 63,
 'claim': 9,
 'call': 6,
 'mobile': 48,
 'co': 10,
 'home': 31,
 'want': 88,
 'today': 82,
 'cash': 8,
 'day': 14,
 'reply': 65,
 'www': 96,
 'right': 66,
 'take': 75,
 'time': 81,
 'message': 45,
 'com': 11,
 'oh': 57,
 'yes': 99,
 'make': 43,
 'way': 90,
 'dont': 16,
 'miss': 47,
 'ur': 87,
 'going': 23,
 'da': 13,
 'lor': 40,
 'meet': 44,
 'really': 64,
 'know': 34,
 'love': 41,
 'amp': 1,
 'let': 37,
 'work': 94,
 'yeah': 97,
 'tell': 76,
 'anything': 2,
 'thanks': 78,
 'uk': 86,
 'please': 61,
 'msg': 50,
 'see': 69,
 'pls': 62,
 'need': 52,
 'nokia': 55,
 'tomorrow': 83,
 'hope': 32,
 'well': 92,
 'lt': 42,
 'gt': 27,
 'get': 20,
 'ask': 3,
 'morning': 49,
 'happy': 28,
 'sorry': 72,
 'give': 21,
 'new': 53,
 'find

In [7]:
import numpy as np

def _three_significant_digits(value):
    """Format a float with at most three significant digits (e.g. 0.434)."""
    return "%.3g" % value


# Print 30 edge items per axis on very long lines, with compact float
# formatting, so wide arrays are shown without wrapping.
np.set_printoptions(
    edgeitems=30,
    linewidth=100000,
    formatter={"float": _three_significant_digits},
)

In [9]:
# Display the dense TF-IDF matrix: one row per entry of `corpus`, one column
# per vocabulary term; each value is that term's weight in that message.
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.434, 0, 0, 0.461, 0.544, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.456, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0.473, 0, 0, 0, 0, 0, 0, 0, 0.492, 0, 0, 0, 0, 0, 0, 0, 0.571, 0, 0, 0, 0, 0, 0],
       [0.465, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.486, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.574, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 