# Mail Spam Detection

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [24]:
data = pd.read_csv('spam.csv',encoding='latin1',usecols=['v1','v2'])

In [25]:
new_column_names = {'v1': 'Target', 'v2': 'Mail'}
df = data.rename(columns=new_column_names)


In [26]:
df.isna().sum()

Target    0
Mail      0
dtype: int64

## Text Processing

#### Removing Special Characters

In [27]:
import re

def remove_special_characters(text):
    # Remove special characters and punctuation
    clean_text = re.sub('[^A-Za-z\s]', '', text)
    return clean_text
df['Mail'] = df['Mail'].apply(remove_special_characters)

#### Removing Symbols

In [28]:
import re

def remove_symbols(text):
    # Remove symbols using regular expression
    cleaned_text = re.sub(r'[^\w\s]', '', text)
    return cleaned_text
df['Mail'] = df['Mail'].apply(remove_symbols)

#### Removing Numeric digits

In [29]:
def remove_numeric(text):
    # Remove numeric values
    clean_text = re.sub(r'\d+', '', text)
    return clean_text
df['Mail'] = df['Mail'].apply(remove_numeric)

#### Converting to Lower Case

In [30]:
def convert_to_lowercase(text):
    # Convert text to lowercase
    lowercase_text = text.lower()
    return lowercase_text
df['Mail'] = df['Mail'].apply(convert_to_lowercase)

#### Lemmatization 

In [31]:
import nltk

# Specify the data path where NLTK should look for resources
nltk.data.path.append("/path/to/nltk_data")

# Download the "wordnet" resource
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in text.split()]
    return ' '.join(lemmatized_text)
df['Mail'] = df['Mail'].apply(lemmatize_text)

[nltk_data] Error loading wordnet: <urlopen error [WinError 10061] No
[nltk_data]     connection could be made because the target machine
[nltk_data]     actively refused it>


#### Removing Stop words

In [33]:
import nltk
from nltk.corpus import stopwords

def remove_stopwords(text):
    x = []
    for i in text.split():
        if i not in stopwords.words('english'):
            x.append(i)
    return x
df['Mail'] = df['Mail'].apply(remove_stopwords)

In [32]:
df

Unnamed: 0,Target,Mail
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final ...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he go to usf he life around h...
...,...,...
5567,spam,this is the nd time we have tried contact u u ...
5568,ham,will b going to esplanade fr home
5569,ham,pity wa in mood for that soany other suggestion
5570,ham,the guy did some bitching but i acted like id ...


### Data Split

In [11]:
X = df[['Mail']]
y = df['Target']

In [21]:
from sklearn.model_selection import train_test_split

# Split the data into training (80%) and temporary (20%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the temporary data into training (80%) and validation (20%)
X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size=0.15, random_state=42)

In [22]:
print(X_train.shape,y_train.shape,X_valid.shape,y_valid.shape,X_test.shape,y_test.shape)

(3788, 1) (3788,) (669, 1) (669,) (1115, 1) (1115,)
