In [7]:
# import
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
# WordNet-based lemmatizer instance.
# NOTE(review): not referenced in the cells visible here; the cleaning cell
# builds its own `lemmatizer` - confirm whether this one is still needed.
lemmer = WordNetLemmatizer()
# TF-IDF vectorizer configured with max_features=100 and n-grams of 1 to 3 words.
# NOTE(review): not used in the cells visible here, which vectorize with Word2Vec.
tfIdfVectorizer = TfidfVectorizer(max_features=100, ngram_range=(1,3))

In [8]:
# Load the SMS spam dataset with explicit column names. Because `names` is
# given without `header=0`, the first line of the file is read as a data row
# (presumably the file's own header line - verify against spam.csv).
df = pd.read_csv('spam.csv', encoding='latin1', names=['label', 'message', 'test', 'test1', 'test2'])
# Drop that first row and the three trailing columns (`test`, `test1`, `test2`).
# `.copy()` makes `df` an independent frame: an `iloc` slice is otherwise
# tracked as a possible view of the frame it came from, and adding columns to
# it later triggers pandas' SettingWithCopyWarning.
df = df.iloc[1:, :-3].copy()
df.head()

Unnamed: 0,label,message
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
5,ham,"Nah I don't think he goes to usf, he lives aro..."


To perform data cleaning on the `df` DataFrame and then apply Word2Vec for converting text to vectors, follow these steps:

1. **Clean the Text Data**: Remove punctuation and stopwords, then lemmatize the remaining words.
2. **Vectorize the Cleaned Text**: Train a Word2Vec model on the cleaned messages and represent each message by the average of its word vectors.

Here are the code snippets you should add in your Jupyter notebook:

### Markdown Cell


## Data Cleaning and Word2Vec Implementation

1. **Text Cleaning**:
   - Remove punctuation.
   - Remove stopwords.
   - Lemmatize words.

2. **Vectorization using Word2Vec**:
   - Train a Word2Vec model on the cleaned messages, then convert each message to a single vector by averaging the vectors of its words.




### Python Code Cell for Text Cleaning


In [9]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Lemmatizer that `clean_text` applies to every word it keeps.
# NOTE(review): this and the stopword list below need the NLTK `wordnet` and
# `stopwords` corpora to be installed; no `nltk.download(...)` call is visible
# in this notebook chunk - confirm the environment provides them.
lemmatizer = WordNetLemmatizer()
# NLTK's English stopword list, held as a set for the membership test in `clean_text`.
stop_words = set(stopwords.words('english'))


def clean_text(text):
    """Normalize one raw message into a space-separated string of lemmas.

    Every character that is neither a word character nor whitespace is
    deleted, the result is split on whitespace, words whose lower-cased form
    is in ``stop_words`` are dropped, and each remaining word is passed
    through ``lemmatizer.lemmatize``. Words are not lower-cased before
    lemmatization.

    Parameters
    ----------
    text : str or scalar
        Raw message. A missing value (``None``, ``NaN``, ``pd.NA``), as pandas
        produces for an empty CSV cell, is treated as an empty message
        instead of raising ``TypeError`` inside ``re.sub``. Any other
        non-string scalar is converted with ``str`` first.

    Returns
    -------
    str
        The kept, lemmatized words joined by single spaces; ``''`` when
        nothing is left after cleaning.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Punctuation is removed before splitting, so "crazy.." becomes "crazy".
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    kept_words = (
        word for word in without_punctuation.split()
        if word.lower() not in stop_words
    )
    return ' '.join(lemmatizer.lemmatize(word) for word in kept_words)


# Add a `clean_message` column holding the cleaned form of every raw message
df['clean_message'] = df['message'].apply(clean_text)



### Python Code Cell for Vectorization using Word2Vec


In [10]:
from gensim.models import Word2Vec

# Tokenize the cleaned messages: one list of whitespace-separated tokens per message
sentences = [sentence.split() for sentence in df['clean_message']]

# Train a Word2Vec model on the tokenized messages.
#   vector_size=100  -> each word is embedded as a 100-dimensional vector
#   window=5         -> context window of 5 tokens
#   min_count=1      -> no token is discarded for being rare
#   seed=42, workers=1 -> Word2Vec training is stochastic; a fixed seed plus a
#       single worker thread makes repeated runs produce the same vectors.
#       Per the gensim docs, identical results across interpreter launches
#       additionally require a fixed PYTHONHASHSEED environment variable.
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=1, seed=42)


# Look up the trained embedding of a single token
def get_word_vector(word):
    """Return the vector ``word2vec_model.wv`` holds for ``word``, or ``None`` if it has none."""
    keyed_vectors = word2vec_model.wv
    if word not in keyed_vectors:
        return None
    return keyed_vectors[word]


# Represent each message by the mean of its words' vectors and store it in a new column
def _mean_word_vector(text):
    """Average the vectors of the words in ``text`` that ``get_word_vector`` knows.

    Words for which ``get_word_vector`` returns ``None`` are skipped, so a
    single unknown word no longer puts ``None`` into the sum (which raised
    ``TypeError``). Each word is looked up once.

    Parameters
    ----------
    text : str
        A cleaned message; it is split on whitespace.

    Returns
    -------
    The element-wise mean of the known word vectors, or ``None`` when the
    message has no words or none of them has a vector.
    """
    vectors = [vec for vec in map(get_word_vector, text.split()) if vec is not None]
    if not vectors:
        return None
    return sum(vectors) / len(vectors)


df['word_vector'] = df['clean_message'].apply(_mean_word_vector)



### Python Code Cell to Display the Result


In [11]:
# Preview the first rows, now including the `clean_message` and `word_vector` columns
df.head()

Unnamed: 0,label,message,clean_message,word_vector
1,ham,"Go until jurong point, crazy.. Available only ...",Go jurong point crazy Available bugis n great ...,"[-0.04204016, 0.063034534, 0.027055165, 0.0120..."
2,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[-0.06485312, 0.0966891, 0.04034244, 0.0195779..."
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry 2 wkly comp win FA Cup final tkts 2...,"[-0.038558878, 0.05551032, 0.026199903, 0.0095..."
4,ham,U dun say so early hor... U c already then say...,U dun say early hor U c already say,"[-0.10374182, 0.15123588, 0.06457387, 0.029691..."
5,ham,"Nah I don't think he goes to usf, he lives aro...",Nah dont think go usf life around though,"[-0.080117665, 0.11535187, 0.052478477, 0.0196..."




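This gives you a DataFrame with two additional columns: `clean_message`, the message with punctuation and stopwords removed and the remaining words lemmatized, and `word_vector`, the average Word2Vec vector of the words in that cleaned message (`None` when no words remain after cleaning).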
