# Reading a text-based dataset into pandas

In [2]:
import pandas as pd
# relative location of the tab-separated SMS file; it has no header row,
# so the two column names are supplied explicitly
path = 'data/sms.tsv'
sms = pd.read_csv(
    path,
    sep='\t',
    header=None,
    names=['label', 'message'],
)

In [5]:
# examine the shape of the dataset as a (rows, columns) tuple
sms.shape

(4367, 2)

In [6]:
# examine the first 10 rows of the dataset
sms.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [7]:
# examine the class distribution: count how many messages carry each label
sms['label'].value_counts()

ham     3775
spam     592
Name: label, dtype: int64

In [8]:
# encode the text label as a number (ham -> 0, spam -> 1) in a new 'label_num' column
sms['label_num'] = sms['label'].map({'ham': 0, 'spam': 1})

In [10]:
# check whether the conversion worked: the first 10 rows should now show label_num
sms.head(10)

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1


In [18]:
# define X (the message text) and y (the numeric label) from the SMS data
# --> each is a single column, so both are 1-D VECTORS, not 2-D MATRICES
X = sms['message']
y = sms['label_num']
# show both shapes, one per line
print(X.shape, y.shape, sep='\n')

(4367,)
(4367,)


In [15]:
# split X and y into training and testing sets (fixed random_state so the split is repeatable)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=7,
)
# report the shape of each of the four pieces, one per line
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(3275,)
(1092,)
(3275,)
(1092,)


# Vectorizing the dataset

In [16]:
# instantiate the vectorizer; no arguments are passed, so it uses the library defaults
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [None]:
# learn training data vocabulary, then use it to create
vect.fit(X_train)