In [5]:
# Import required libraries
import pandas as pd

# Read the SMS corpus from disk, decoding the bytes as latin-1
df = pd.read_csv("spam.csv", encoding='latin-1')

# Quick look at the raw frame: leading rows, dimensions and column labels
print("First 5 rows of the dataset:")
print(df.head())
print("\nDataset shape:", df.shape)
print("\nColumn Names:", df.columns)

# Keep only the two populated columns (the 'Unnamed: N' ones are dropped)
# and give them descriptive names in the same step
df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'text'})

# Count how many messages fall into each class (ham vs spam)
print("\nClass distribution:")
print(df['label'].value_counts())


First 5 rows of the dataset:
     v1                                                 v2 Unnamed: 2   
0   ham  Go until jurong point, crazy.. Available only ...        NaN  \
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  

Dataset shape: (5572, 5)

Column Names: Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

Class distribution:
label
ham     4825
spam     747
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import re

# Encode the target as integers: 'ham' -> 0 and 'spam' -> 1.
# A plain `.map(dict)` turns every value that is not a dictionary key into NaN,
# so re-running this cell after the labels were already encoded would wipe the
# whole column. Falling back to the current value keeps the cell safe to re-run.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(lambda value: label_codes.get(value, value))

# Clean the text (replace non-word characters with spaces, lowercase, collapse and trim whitespace)
def clean_text(text):
    """Normalise a raw message string before vectorisation.

    Steps, in order:
      1. Replace every non-word character (anything other than letters,
         digits and the underscore) with a space. Digits are kept.
      2. Convert the text to lowercase.
      3. Collapse each run of whitespace into a single space.
      4. Strip the leading/trailing space that step 1 leaves behind when the
         message starts or ends with punctuation.

    Args:
        text: The message to clean; must be a ``str``.

    Returns:
        The cleaned, lower-cased string with single spaces between tokens.
    """
    text = re.sub(r'\W', ' ', text)  # Punctuation/symbols become spaces
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
    return text.strip()  # Drop the edge spaces created by the substitutions

# Normalise every message with clean_text before it is vectorised
df['text'] = df['text'].apply(clean_text)

# Hold out 20% of the messages for testing. The classes are imbalanced
# (4825 ham vs 747 spam), so stratify on the label to keep the same ham/spam
# ratio in both splits; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Turn the cleaned messages into TF-IDF feature vectors (vocabulary capped at 5000 terms)
tfidf = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the training split only, then
# project both splits through the fitted vectorizer and densify the results
tfidf.fit(X_train)
X_train_tfidf, X_test_tfidf = (
    tfidf.transform(split).toarray() for split in (X_train, X_test)
)

# Report the dimensions of the resulting feature matrices
print("\nShape of TF-IDF transformed training data:", X_train_tfidf.shape)
print("Shape of TF-IDF transformed test data:", X_test_tfidf.shape)




Shape of TF-IDF transformed training data: (4457, 5000)
Shape of TF-IDF transformed test data: (1115, 5000)


In [None]:
import tensorflow as tf
