In [1]:
import polars as pl 
from polars import col
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import matplotlib.pyplot as plt
import altair as alt

In [2]:
# Read data
data = pl.read_csv("spam_dataset.csv")

In [3]:
data.head()

Unnamed: 0_level_0,label,text,label_num
i64,str,str,i64
605,"""ham""","""Subject: enron methanol ; mete…",0
2349,"""ham""","""Subject: hpl nom for january 9…",0
3624,"""ham""","""Subject: neon retreat ho ho h…",0
4685,"""spam""","""Subject: photoshop , windows ,…",1
2030,"""ham""","""Subject: re : indian springs …",0


In [4]:
# Check for null values
data.null_count()

Unnamed: 0_level_0,label,text,label_num
u32,u32,u32,u32
0,0,0,0


In [5]:
#data.select(col("label").value_counts()).unnest('label')
label_count = data.group_by('label').count().with_columns((pl.col('count')/pl.sum('count')).alias('percentage'))
label_count

  label_count = data.group_by('label').count().with_columns((pl.col('count')/pl.sum('count')).alias('percentage'))


label,count,percentage
str,u32,f64
"""spam""",1499,0.289886
"""ham""",3672,0.710114


In [6]:
plt.figure(figsize=(10, 6))
alt.Chart(label_count).mark_bar().encode(
    x='label',
    y='percentage'
).properties(
    width=400,
    height=400, 
    title='Spam vs Ham'
)


<Figure size 1000x600 with 0 Axes>

In [11]:
import re
ps = PorterStemmer()
def string_preprocessing(message:str): 
    '''Removes special characters, numbers, and stopwords from the message'''
    review = re.sub('[^a-zA-Z]', ' ', message)
    review = review.lower()
    review = review.split()

    # Drop stopwords 
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    return review


In [42]:
data = data.with_columns(text=col('text').str.replace('Subject:', '').str.strip_chars())

In [43]:
# Apply the function to the text column
data = data.with_columns(  
        col('text')
        .map_elements(string_preprocessing, return_dtype=pl.String)
        .alias('preprocessed_text')
        )

In [44]:
corpus = data['preprocessed_text'].to_list()

In [71]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [76]:
from nltk import word_tokenize

In [85]:
words = []
for sent in corpus: 
    sent_token = [sent.split()]
    for sent in sent_token: 
        print(sent)
        words.append(sent)

['enron', 'methanol', 'meter', 'follow', 'note', 'gave', 'monday', 'preliminari', 'flow', 'data', 'provid', 'daren', 'pleas', 'overrid', 'pop', 'daili', 'volum', 'present', 'zero', 'reflect', 'daili', 'activ', 'obtain', 'ga', 'control', 'chang', 'need', 'asap', 'econom', 'purpos']
['hpl', 'nom', 'januari', 'see', 'attach', 'file', 'hplnol', 'xl', 'hplnol', 'xl']
['neon', 'retreat', 'ho', 'ho', 'ho', 'around', 'wonder', 'time', 'year', 'neon', 'leader', 'retreat', 'time', 'know', 'time', 'year', 'extrem', 'hectic', 'tough', 'think', 'anyth', 'past', 'holiday', 'life', 'go', 'past', 'week', 'decemb', 'januari', 'like', 'think', 'minut', 'calend', 'hand', 'begin', 'fall', 'semest', 'retreat', 'schedul', 'weekend', 'januari', 'youth', 'minist', 'confer', 'brad', 'dustin', 'connect', 'week', 'go', 'chang', 'date', 'follow', 'weekend', 'januari', 'come', 'part', 'need', 'think', 'think', 'agre', 'import', 'us', 'get', 'togeth', 'time', 'recharg', 'batteri', 'get', 'far', 'spring', 'semest', 

In [86]:
words

[['enron',
  'methanol',
  'meter',
  'follow',
  'note',
  'gave',
  'monday',
  'preliminari',
  'flow',
  'data',
  'provid',
  'daren',
  'pleas',
  'overrid',
  'pop',
  'daili',
  'volum',
  'present',
  'zero',
  'reflect',
  'daili',
  'activ',
  'obtain',
  'ga',
  'control',
  'chang',
  'need',
  'asap',
  'econom',
  'purpos'],
 ['hpl',
  'nom',
  'januari',
  'see',
  'attach',
  'file',
  'hplnol',
  'xl',
  'hplnol',
  'xl'],
 ['neon',
  'retreat',
  'ho',
  'ho',
  'ho',
  'around',
  'wonder',
  'time',
  'year',
  'neon',
  'leader',
  'retreat',
  'time',
  'know',
  'time',
  'year',
  'extrem',
  'hectic',
  'tough',
  'think',
  'anyth',
  'past',
  'holiday',
  'life',
  'go',
  'past',
  'week',
  'decemb',
  'januari',
  'like',
  'think',
  'minut',
  'calend',
  'hand',
  'begin',
  'fall',
  'semest',
  'retreat',
  'schedul',
  'weekend',
  'januari',
  'youth',
  'minist',
  'confer',
  'brad',
  'dustin',
  'connect',
  'week',
  'go',
  'chang',
  'date'

In [87]:
# Create model 
from gensim.models import Word2Vec

In [88]:
model = Word2Vec(words, window=5, min_count=1 )

In [89]:
model.wv.index_to_key

['ect',
 'hou',
 'enron',
 'com',
 'deal',
 'pleas',
 'ga',
 'subject',
 'meter',
 'cc',
 'pm',
 'hpl',
 'thank',
 'need',
 'e',
 'daren',
 'forward',
 'price',
 'corp',
 'volum',
 'know',
 'day',
 'get',
 'new',
 'compani',
 'mmbtu',
 'product',
 'may',
 'chang',
 'j',
 'inform',
 'http',
 'see',
 'attach',
 'nom',
 'let',
 'contract',
 'time',
 'farmer',
 'l',
 'month',
 'would',
 'messag',
 'xl',
 'nomin',
 'mail',
 'us',
 'call',
 'sale',
 'one',
 'follow',
 'flow',
 'use',
 'juli',
 'question',
 'th',
 'robert',
 'email',
 'sitara',
 'ticket',
 'p',
 'want',
 'file',
 'look',
 'texa',
 'servic',
 'energi',
 'work',
 'manag',
 'report',
 'list',
 'number',
 'sent',
 'go',
 'also',
 'www',
 'pec',
 'ena',
 'purchas',
 'bob',
 'actual',
 'like',
 'x',
 'make',
 'contact',
 'order',
 'receiv',
 'market',
 'origin',
 'net',
 'b',
 'busi',
 'take',
 'system',
 'c',
 'back',
 'effect',
 'provid',
 'schedul',
 'account',
 'avail',
 'daili',
 'help',
 'secur',
 'statement',
 'per',
 'inclu

In [94]:
model.wv.most_similar('analyst')

[('associ', 0.9777567386627197),
 ('conclud', 0.9726781845092773),
 ('prc', 0.9725981950759888),
 ('effort', 0.9709834456443787),
 ('facilit', 0.9680378437042236),
 ('small', 0.9678929448127747),
 ('across', 0.9678568840026855),
 ('throughout', 0.9678196310997009),
 ('procedur', 0.9658692479133606),
 ('personnel', 0.9653780460357666)]