In [42]:
from collections import Counter
import pandas as pd
import sklearn

# Read Dataset

In [43]:
# Read dataset from .csv-file and set column names
col_label = 'label'
col_message = 'sms'
df = pd.read_csv('./SMSSpamCollection.csv', sep='\t', names=[col_label, col_message])
print(df.head())

df.info()

# Checking for duplicates
print('\n# Duplicates:', len(df[df.duplicated()]))
# Remove duplicated samples
df.drop_duplicates(inplace=True, ignore_index=True)
df.info()

  label                                                sms
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   sms     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB

# Duplicates: 403
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5169 entries, 0 to 5168
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5169 non-null   object
 1   sms     5169 non-null   object
dtypes: object(2)
memory usage: 80.9+ KB


In [44]:
# Strip Messages
df[col_message] = df[col_message].str.strip()
print(df.head())


  label                                                sms
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [45]:
# Investigate distribution of labels
print('Distribution of Labels\n', df[col_label].value_counts())
print('\nRatio Ham:Spam', df[col_label].value_counts()['ham'] / df[col_label].value_counts()['spam'])
# ==> Label Imbalance

# Transform label column to boolean values:
# Spam -> True; Ham -> False
#df['label'] = df['label'].map({'spam': True, 'ham': False})

Distribution of Labels
 label
ham     4516
spam     653
Name: count, dtype: int64

Ratio Ham:Spam 6.915773353751915


In [None]:
# ToDo Preprocessing
# Lowercasing, remove stop words, tokenization, stemming, ...

# Apply lowercase to sms messages
df[col_message] = df[col_message].apply(str.lower)


# Analyze most frequent words per class
# TODO: remove stop words
df_spam = df[df[col_label] == 'spam']
spam_text = ' '.join(df_spam[col_message])
spam_word_counter = Counter(spam_text.split())
print('Top 10 SPAM words', spam_word_counter.most_common(10))

df_ham = df[df[col_label] == 'ham']
ham_text = ' '.join(df_ham[col_message])
ham_word_counter = Counter(ham_text.split())
print('Top 10 HAM words', ham_word_counter.most_common(10))

Top 10 SPAM words [('to', 594), ('a', 330), ('call', 298), ('your', 241), ('you', 226), ('for', 183), ('the', 181), ('free', 157), ('or', 157), ('2', 151)]
Top 10 HAM words [('i', 2070), ('you', 1610), ('to', 1471), ('the', 1054), ('a', 965), ('and', 816), ('u', 801), ('in', 730), ('my', 670), ('is', 635)]


In [None]:
# Save preprocessed data
df.to_csv('./SMSSpamCollection_preprocessed.csv', index=False)


# Perform Train-Val-Test split to quick check its validity
seed = 219
eval_size = 0.2

sms_train_val, sms_test, label_train_val, label_test = sklearn.model_selection.train_test_split(
    df['sms'], df['label'], test_size=eval_size, random_state=seed)
# Split train_val set into train and val
sms_train, sms_val, label_train, label_val = sklearn.model_selection.train_test_split(
    sms_train_val, label_train_val, test_size=eval_size, random_state=seed)

for set_name, label_set in [('train', label_train), ('val', label_val), ('test', label_test)]:
    print(f'Ratio Ham:Spam in {set_name.upper()}', label_set.value_counts()['ham'] / label_set.value_counts()['spam'])
    # print('Size', len(label_set))
# ==> Ratios are similar (enough), split is ok

Ratio Ham:Spam in TRAIN 7.009685230024213
Ratio Ham:Spam in VAL 6.31858407079646
Ratio Ham:Spam in TEST 7.141732283464567
