# Mini Project 3

## Fetch the data

In [180]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
import spacy

import sklearn 

In [181]:
# Load the labelled messages from the CSV file into a DataFrame.
# NOTE(review): the 'unicode_escape' codec also interprets backslash escape
# sequences found in the text; if the file is plain Latin-1/cp1252, naming
# that encoding explicitly may be safer -- confirm against the raw file.
df = pd.read_csv(
    'email_train.csv',
    encoding='unicode_escape',
)

In [182]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,S. No.,Message_body,Label
0,1,Rofl. Its true to its name,Non-Spam
1,2,The guy did some bitching but I acted like i'd...,Non-Spam
2,3,"Pity, * was in mood for that. So...any other s...",Non-Spam
3,4,Will ü b going to esplanade fr home?,Non-Spam
4,5,This is the 2nd time we have tried 2 contact u...,Spam


In [183]:
# Count missing values per column (isnull is the pandas alias of isna).
df.isnull().sum()

S. No.          0
Message_body    0
Label           0
dtype: int64

In [184]:
# check data types
# info() prints the per-column dtype and non-null count for the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957 entries, 0 to 956
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   S. No.        957 non-null    int64 
 1   Message_body  957 non-null    object
 2   Label         957 non-null    object
dtypes: int64(1), object(2)
memory usage: 22.6+ KB


In [185]:
# Store the message text with pandas' dedicated string dtype instead of
# the generic object dtype.
df['Message_body'] = df['Message_body'].astype(pd.StringDtype())

In [186]:
# Re-inspect the dtypes to confirm Message_body is now a string column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957 entries, 0 to 956
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   S. No.        957 non-null    int64 
 1   Message_body  957 non-null    string
 2   Label         957 non-null    object
dtypes: int64(1), object(1), string(1)
memory usage: 22.6+ KB


In [187]:
# Discard the serial-number column in place.
df.drop('S. No.', axis='columns', inplace=True)

In [188]:
# change label column to 0s and 1s
# Assign the result back instead of calling replace(..., inplace=True) on
# df['Label']: the in-place call acts on an intermediate Series, which
# pandas 2.x warns about and which leaves df unchanged under Copy-on-Write.
df['Label'] = df['Label'].replace({'Non-Spam': 0, 'Spam': 1})

In [189]:
# Count missing values per column again, after the column drop and relabel.
df.isnull().sum()

Message_body    0
Label           0
dtype: int64

In [190]:
# Preview the frame after dropping 'S. No.' and encoding the labels.
df.head()

Unnamed: 0,Message_body,Label
0,Rofl. Its true to its name,0
1,The guy did some bitching but I acted like i'd...,0
2,"Pity, * was in mood for that. So...any other s...",0
3,Will ü b going to esplanade fr home?,0
4,This is the 2nd time we have tried 2 contact u...,1


## Clean the data

In [191]:
def clean_text(text):
    """Strip punctuation from a message and replace 'ü' with 'u'.

    Parameters
    ----------
    text : str
        Message text to clean.

    Returns
    -------
    str
        ``text`` with full stops and the listed punctuation characters
        deleted (nothing is inserted in their place) and every 'ü'
        replaced by 'u'.
    """
    # remove full stops
    text = re.sub(r'\.', '', text)

    # remove punctuation
    # The hyphen is placed last in the class so it is a literal '-'.
    # Written as ':-;' it is a range covering only ':' and ';', which
    # leaves hyphens in the text.
    text = re.sub(r'[!?*,()/:;+=#&^@<>£\"\'-]', '', text)
    text = re.sub(r'ü', 'u', text)

    return text

In [192]:
# Run clean_text over every message and write the result back.
cleaned_messages = df['Message_body'].apply(clean_text)
df['Message_body'] = cleaned_messages

In [193]:
# Spot-check the cleaned messages on a random sample of 50 rows.
# random_state pins the draw so a re-run displays the same rows.
df.sample(50, random_state=42)

Unnamed: 0,Message_body,Label
333,Am on the uworld site Am i buying the qbank on...,0
307,Did you see that film,0
401,Oh did you charge camera,0
907,I cant pick the phone right now Pls send a mes...,0
407,Lol no I just need to cash in my nitros Hurry ...,0
949,Good do you think you could send me some pix I...,0
81,Oh thanks a loti already bought 2 eggs,0
507,I think i am disturbing her da,0
265,Pls ask macho how much is budget for bb bold 2...,0
553,Uncle G just checking up on you Do have a rewa...,0


In [201]:
# Load the spaCy pipeline named 'en_core_web_md'; the model package must
# already be installed in the kernel's environment.
nlp = spacy.load('en_core_web_md')

In [203]:
# Run the spaCy pipeline on the message stored at index label 0.
doc = nlp(df.loc[0, 'Message_body'])

In [205]:
# Display the parsed document (rendered as its text).
doc

Rofl Its true to its name

In [206]:
# Print the text of each token, one per line, to inspect the tokenisation.
for token in doc:
    print(token.text)

Rofl
Its
true
to
its
name


In [209]:
# Print each token with its 1-based position (right-aligned to width 2),
# stopping after the first token whose text is a full stop.
# NOTE(review): clean_text deletes every '.', so for a doc built from a
# cleaned message this break looks unreachable -- confirm whether the loop
# was meant to run on the raw text.
for i, t in enumerate(doc): # t is token, i is index
    print('%2d| %r' % (i+1, t.text))
    if t.text == '.':
        break

 1| 'Rofl'
 2| 'Its'
 3| 'true'
 4| 'to'
 5| 'its'
 6| 'name'
