In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('titanic.csv')

In [4]:
df.head(2)

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1


In [6]:
df = df[['Pclass','Sex','Age','Fare','Survived']]  # or df.drop(['',''.., axis='columns',inplace=True])

In [7]:
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [8]:
target = df.Survived
inputs = df.drop('Survived',axis='columns')

In [9]:
dummies = pd.get_dummies(inputs.Sex)
dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [10]:
inputs = pd.concat([inputs,dummies],axis='columns')
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0
3,1,female,35.0,53.1,1,0
4,3,male,35.0,8.05,0,1


In [12]:
inputs.drop('Sex',axis='columns',inplace=True)
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


In [13]:
inputs.columns[inputs.isna().any()]

Index(['Age'], dtype='object')

In [14]:
inputs.Age = inputs.Age.fillna(inputs.Age.mean())

In [15]:
inputs.columns[inputs.isna().any()]

Index([], dtype='object')

In [19]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(inputs,target,test_size=0.2)

In [20]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [21]:
model.fit(X_train,y_train)

GaussianNB()

In [22]:
model.score(X_test,y_test)

0.776536312849162

In [23]:
X_test[:10]

Unnamed: 0,Pclass,Age,Fare,female,male
291,1,19.0,91.0792,1,0
321,3,27.0,7.8958,0,1
367,3,29.699118,7.2292,1,0
438,1,64.0,263.0,0,1
535,2,7.0,26.25,1,0
269,1,35.0,135.6333,1,0
594,2,37.0,26.0,0,1
690,1,31.0,57.0,0,1
338,3,45.0,8.05,0,1
357,2,38.0,13.0,1,0


In [24]:
y_test[:10]

291    1
321    0
367    1
438    0
535    1
269    1
594    0
690    1
338    1
357    0
Name: Survived, dtype: int64

In [25]:
model.predict(X_test)[:10]

array([1, 0, 1, 1, 1, 1, 0, 0, 0, 1], dtype=int64)

In [26]:
model.predict_proba(X_test[:10]) # probability of not survived/survived, use higher value

array([[4.46171768e-04, 9.99553828e-01],
       [9.91409843e-01, 8.59015740e-03],
       [7.47847123e-02, 9.25215288e-01],
       [2.55813960e-12, 1.00000000e+00],
       [1.95988326e-02, 9.80401167e-01],
       [8.96632919e-06, 9.99991034e-01],
       [9.79405016e-01, 2.05949842e-02],
       [8.29362630e-01, 1.70637370e-01],
       [9.91093315e-01, 8.90668466e-03],
       [3.32754850e-02, 9.66724515e-01]])

## Spam

In [27]:
df2 = pd.read_csv('spam.csv')

In [28]:
df2.head(3)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [31]:
df2.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [33]:
df2['spam'] = df2['Category'].apply(lambda x: 1 if x=='spam' else 0)
df2.head(2)

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0


In [34]:
X_train2,X_test2,y_train2,y_test2 = train_test_split(df2.Message,df2.spam,test_size=0.25)

### convert message into numerical array

In [None]:
# find unique words, each word as a column, row value show the count of each word in the sentence

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
X_train2_count = v.fit_transform(X_train2.values)
X_train2_count.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [38]:
from sklearn.naive_bayes import MultinomialNB
model2 = MultinomialNB()
model2.fit(X_train2_count,y_train2)

MultinomialNB()

In [40]:
emails =[
    'Hey you, can we get together to watch football game tomorrow',
    'Upto 2 discount on parking, exclusive offer just for you, donnot miss this reward'
]
emails_count = v.transform(emails)
model2.predict(emails_count)

array([0, 1], dtype=int64)

In [49]:
X_test2_count = v.transform(X_test2)

In [51]:
model2.score(X_test2_count,y_test2)

ValueError: dimension mismatch

In [52]:
from sklearn.pipeline import Pipeline

In [53]:
clf = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('nb',MultinomialNB())
])

In [54]:
clf.fit(X_train2,y_train2)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [55]:
clf.score(X_test2,y_test2)

0.9842067480258435