# Using Titanic data to predict whether a passenger survived
## We will use Gaussian Naive Bayes because the features here are not occurrence counts

In [15]:
import pandas as pd
import numpy as np

In [16]:
# Read the Titanic CSV into a DataFrame and show it as the cell output.
data = pd.read_csv(filepath_or_buffer='data/titanic.csv')
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [17]:
# Discard the columns that are not used as model features.
data = data.drop(
    columns=['PassengerId', 'Name', 'SibSp', 'Ticket', 'Fare', 'Cabin', 'Embarked']
)

In [18]:
# Display the frame after the column drop to confirm what remains.
data

Unnamed: 0,Survived,Pclass,Sex,Age,Parch
0,0,3,male,34.5,0
1,1,3,female,47.0,0
2,0,2,male,62.0,0
3,0,3,male,27.0,0
4,1,3,female,22.0,1
...,...,...,...,...,...
413,0,3,male,,0
414,1,1,female,39.0,0
415,0,3,male,38.5,0
416,0,3,male,,0


In [19]:
# One-hot encoding of the remaining non-numeric column(s) (here: Sex).
# dtype=int pins the indicator columns to 0/1 integers; newer pandas releases
# default to booleans, which would no longer match the 0/1 values displayed
# in the following cells.
data = pd.get_dummies(data, dtype=int)

In [20]:
# Display the frame after one-hot encoding (Sex is now Sex_female / Sex_male).
data

Unnamed: 0,Survived,Pclass,Age,Parch,Sex_female,Sex_male
0,0,3,34.5,0,0,1
1,1,3,47.0,0,1,0
2,0,2,62.0,0,0,1
3,0,3,27.0,0,0,1
4,1,3,22.0,1,1,0
...,...,...,...,...,...,...
413,0,3,,0,0,1
414,1,1,39.0,0,1,0
415,0,3,38.5,0,0,1
416,0,3,,0,0,1


In [21]:
# Number of missing values in each column.
data.isna().sum(axis='index')

Survived       0
Pclass         0
Age           86
Parch          0
Sex_female     0
Sex_male       0
dtype: int64

In [22]:
# Replace missing ages with the column mean, then re-check the missing counts.
# NOTE: the mean is taken over every row of `data`, i.e. before any
# train/test split is made.
mean_age = data['Age'].mean()
data['Age'] = data['Age'].fillna(mean_age)
data.isna().sum()

Survived      0
Pclass        0
Age           0
Parch         0
Sex_female    0
Sex_male      0
dtype: int64

In [23]:
# Feature matrix: every column except the target.
X = data.drop(columns='Survived')
X

Unnamed: 0,Pclass,Age,Parch,Sex_female,Sex_male
0,3,34.50000,0,0,1
1,3,47.00000,0,1,0
2,2,62.00000,0,0,1
3,3,27.00000,0,0,1
4,3,22.00000,1,1,0
...,...,...,...,...,...
413,3,30.27259,0,0,1
414,1,39.00000,0,1,0
415,3,38.50000,0,0,1
416,3,30.27259,0,0,1


In [24]:
# Target vector: the Survived column.
Y = data.loc[:, 'Survived']
Y

0      0
1      1
2      0
3      0
4      1
      ..
413    0
414    1
415    0
416    0
417    0
Name: Survived, Length: 418, dtype: int64

In [25]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# split, and therefore the reported score, reproducible across runs.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=42)

In [26]:
from sklearn.naive_bayes import GaussianNB

# Create the Gaussian Naive Bayes classifier with its default settings.
model = GaussianNB()

In [27]:
# Train the classifier on the training portion of the split.
model.fit(X=Xtrain, y=Ytrain)

In [28]:
# Accuracy on the held-out rows.
# NOTE(review): in every row displayed earlier, Survived equals Sex_female, so
# a perfect score may only reflect labels derived from sex — confirm the
# provenance of the Survived column before trusting this result.
model.score(Xtest, Ytest)

1.0

# Email Spam/Not Spam Classification by text analysis
## We will use Multinomial Naive Bayes, since the features are word-occurrence counts

In [29]:
# Read the spam/ham message CSV into a DataFrame and show it.
# This rebinds `data`, which previously held the Titanic frame.
data = pd.read_csv(filepath_or_buffer='data/spam.csv')
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [30]:
# Encode the label as an integer: 1 for 'spam', 0 for anything else.
# Caution: running this cell a second time would set every label to 0,
# because the already-encoded integers never compare equal to 'spam'.
data['Category'] = (data['Category'] == 'spam').astype('int64')

In [31]:
# Display the frame to confirm Category now holds 0/1 instead of ham/spam.
data

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [32]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from all messages and turn each message into a
# vector of token counts.
messages = data['Message']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(messages)

In [33]:
# Print the count matrix as a dense array.
# NOTE(review): toarray() converts the entire matrix to a dense array only to
# print a truncated preview; presumably X is a large sparse matrix, so slicing
# a few rows first (e.g. X[:5].toarray()) would be much cheaper — confirm.
print(X.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [34]:
# Target vector: the 0/1 spam label.
Y = data['Category']
# Show the vocabulary tokens that correspond to the columns of X.
print(vectorizer.get_feature_names_out())

['00' '000' '000pes' ... 'èn' 'ú1' '〨ud']


In [35]:
# Hold out 30% of the messages for evaluation. A fixed random_state makes the
# split, and therefore the reported score, reproducible across runs.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=42)

In [36]:
from sklearn.naive_bayes import MultinomialNB
# Create a Multinomial Naive Bayes classifier with default settings and
# train it on the count vectors of the training split.
model = MultinomialNB()
model.fit(X=Xtrain, y=Ytrain)

In [37]:
# Accuracy of the spam classifier on the held-out messages.
model.score(X=Xtest, y=Ytest)

0.979066985645933

In [38]:
# Two hand-written sample messages to classify.
email = [
    'Free entry in 2 a wkly comp to win FA Cup finals, Discount upto 20%',
    'Hello Davis, Do you wanna hang out today? Regards, Jane',
]

# Reuse the already-fitted vectorizer (transform only, no refit) so the
# samples are encoded with the same vocabulary the model was trained on.
email_counts = vectorizer.transform(email)
model.predict(email_counts)

array([1, 0], dtype=int64)

# Pipeline: here we perform the same steps as above, with a little more convenience

In [39]:
from sklearn.pipeline import Pipeline
# Chain vectorization and classification so raw text can be passed directly
# to fit / score / predict.
clf = Pipeline([('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

# The pipeline consumes the raw message strings, not pre-computed counts.
X = data['Message']
Y = data['Category']

# Hold out 30% of the messages. A fixed random_state makes the split, and
# therefore the reported score, reproducible across runs.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=42)

In [40]:
# Fit the whole pipeline on the training messages and labels.
clf.fit(X=Xtrain, y=Ytrain)

In [41]:
# Accuracy of the pipeline on the held-out messages.
clf.score(X=Xtest, y=Ytest)

0.9880382775119617

In [42]:
# Classify the sample messages; the pipeline vectorizes the raw strings itself.
clf.predict(X=email)

array([1, 0], dtype=int64)

# Exercise

In [43]:
from sklearn.datasets import load_wine

# Load scikit-learn's bundled wine dataset and list the attributes it exposes.
# Note: this rebinds `data`, which held the spam DataFrame in earlier cells.
data = load_wine()
dir(data)

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']

In [44]:
# Inspect the class labels of the wine samples.
data.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [45]:
# Inspect the feature values of the first sample.
data.data[0]

array([1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
       3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
       1.065e+03])

In [46]:
# Names of the feature columns.
data.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

In [47]:
# Sanity-check dimensions: label shape, feature-matrix shape, and the number
# of feature names, printed one per line.
for size in (data.target.shape, data.data.shape, len(data.feature_names)):
    print(size)

(178,)
(178, 13)
13


In [48]:
# Feature matrix and class labels of the wine dataset.
X, Y = data.data, data.target

In [49]:
# Hold out 20% of the samples for evaluation. A fixed random_state makes the
# split reproducible, so the two models fitted on it can be compared on
# identical data across runs.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

In [50]:
# First candidate: Gaussian Naive Bayes with default settings, trained on
# the wine training split.
model = GaussianNB()
model.fit(X=Xtrain, y=Ytrain)

In [51]:
# Accuracy of the Gaussian model on the held-out wine samples.
model.score(X=Xtest, y=Ytest)

1.0

In [52]:
# Second candidate: Multinomial Naive Bayes, trained on the same split so the
# two scores can be compared. This rebinds `model`.
model = MultinomialNB()
model.fit(X=Xtrain, y=Ytrain)

In [53]:
# Accuracy of the Multinomial model on the held-out wine samples.
model.score(X=Xtest, y=Ytest)

0.8888888888888888