# Naive Bayes

In [26]:
# If additional packages are needed but are not installed by default, uncomment the last two lines of this cell
# and replace <package list> with a list of additional packages.
# This will ensure the notebook has all the dependencies and works everywhere

#import sys
#!{sys.executable} -m pip install <package list>

In [1]:
# Libraries
import pandas as pd
import numpy as np
import sklearn
import string

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

pd.set_option("display.max_columns", 101)
pd.set_option('display.max_colwidth', 100)

## Data Description

Column | Description
:---|:---
`text` | Text which needs to be classified
`label`  | Label ( 1 = spam, 0 = not spam)

In [2]:
# The information dataset for the training set is already loaded below
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,text,label
0,No prob. I will send to your email.,0
1,"I do know what u mean, is the king of not havin credit! I'm goin2bed now. Night night sweet! On...",0
2,Ok... I din get ur msg...,0
3,Check mail.i have mailed varma and kept copy to you regarding membership.take care.insha allah.,0
4,Then what about further plan?,0


In [3]:
data.label.value_counts()

0    3377
1     523
Name: label, dtype: int64

In [4]:
without_pun = []
string_ = ''
for i in data.text.to_list():
    new = i.translate(str.maketrans('', '', string.punctuation))
    string_ += new
    without_pun.append(new)

In [5]:
data['without_pun'] = without_pun

In [6]:
vectorizer = CountVectorizer()
vectorizer.fit(without_pun)
df = vectorizer.transform(without_pun)
print(df.shape)

(3900, 7706)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [7]:
df = pd.DataFrame(df.toarray())

In [8]:
df['label'] = data.label

In [9]:
df[45].value_counts()

0    3898
1       2
Name: 45, dtype: int64

## Machine Learning

Build a Naive Bayes model that can predict the label.
- **The model's performance will be evaluated on the basis of F1 Score.**

In [11]:
rs = 44
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = ['label']), df.label, test_size=0.5, random_state=rs)

In [12]:
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

In [13]:
y_test

854     0
2777    0
2117    1
2227    0
1468    0
       ..
2991    0
1926    0
2347    0
344     0
1414    0
Name: label, Length: 1950, dtype: int64

In [14]:
f1_score(y_test, y_pred, average='micro')

0.9087179487179488

> #### Task:
- **Submit the predictions on the test dataset using your optimized model** <br/>
    Submit a CSV file with a header row plus each of the test entries, each on its own line. 

The file (`submissions.csv`) should have exactly 1 column:

Column | Description
:---|:---
`label`  | Label

In [114]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,text
0,From next month get upto 50% More Calls 4 Ur standard network charge 2 activate Call 9061100010 ...
1,"Ooh, 4got, i'm gonna start belly dancing in moseley weds 6.30 if u want 2 join me, they have a c..."
2,Gud mrng dear hav a nice day
3,Living is very simple.. Loving is also simple.. Laughing is too simple.. Winning is tooo simple....
4,On the way to office da..


In [115]:
vectorizer.transform(test.text.to_list()).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [117]:
gnb.predict(vectorizer.transform(test.text.to_list()).toarray())

array([1, 0, 0, ..., 0, 1, 0])

In [None]:
submission_df = pd.DataFrame(gnb.predict(vectorizer.transform(test.text.to_list()).toarray()))
submission_df

In [None]:
#Submission
submission_df.to_csv('submissions.csv', index=False)