# Investigation of the Kaggle Dataset

In [8]:
import pandas as pd
import sklearn

In [9]:
df = pd.read_csv('./spam.csv', encoding = 'cp1252')
print(df.head())

df.info()

# Checking for duplicates
print('\n# Duplicates:', len(df[df.duplicated()]))
# Remove duplicated samples
df.drop_duplicates(inplace=True, ignore_index=True)
df.info()

# First column contains the labels, second column contains the messages
# Columns 2-4 should not be present, so let's investigate them and fix that!
rows_to_investigate = df.dropna(subset=df.columns[2:], how='all')
indices_to_investigate = rows_to_investigate.index
print('\n', rows_to_investigate.to_string())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
m

In [10]:
# Fix dataset: Let's join all text in collums 1-4 to form column 1

df[df.columns[1]] = df[df.columns[1:]].fillna('').aggregate(','.join, axis=1).str.strip()
# df.loc[:, 2] = df.iloc[:, 2:].astype(str).fillna('').aggregate(' '.join, axis=1).str.strip()


# Drop unnecessary columns
df.drop(columns=df.columns[2:], inplace=True)
df.info()

# Quick check:
print(df.loc[indices_to_investigate, :].to_string())

# Rename columns
df.rename({df.columns[0]: 'label', df.columns[1]: 'sms'}, axis=1, inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5169 entries, 0 to 5168
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5169 non-null   object
 1   v2      5169 non-null   object
dtypes: object(2)
memory usage: 80.9+ KB
        v1                                                                                                                                                                                                                                                                                                                                          v2
95    spam                                                                                                                                                                           Your free ringtone is waiting to be collected. Simply text the password \MIX\" to 85069 to verify. Get Usher and Britney. FML, PO Box 5249, MK17 92H. 450Ppw 16",
277    ham             

In [11]:
# Investigate distribution of labels
print('Distribution of Labels\n', df['label'].value_counts())
print('Ratio Ham:Spam', df['label'].value_counts()['ham'] / df['label'].value_counts()['spam'])

# Transform label column to boolean values:
# Spam -> True; Ham -> False
#df['label'] = df['label'].map({'spam': True, 'ham': False})

Distribution of Labels
 label
ham     4516
spam     653
Name: count, dtype: int64
Ratio Ham:Spam 6.915773353751915


In [None]:
# ToDo Preprocessing
# Lowercasing, remove stop words, tokenization, stemming, ...

# Apply lowercase to sms messages
df['sms'] = df['sms'].apply(str.lower)

In [None]:
# Save preprocessed data
df.to_csv('./spam_preprocessed.csv', index=False)


# Perform Train-Test split
seed = 219
eval_size = 0.2
sms_train, sms_test, label_train, label_test = sklearn.model_selection.train_test_split(df['sms'], df['label'], test_size=eval_size, random_state=seed)


print('Ratio Ham:Spam in TRAINING', label_train.value_counts()['ham'] / label_train.value_counts()['spam'])
print('Ratio Ham:Spam in TEST', label_test.value_counts()['ham'] / label_test.value_counts()['spam'])

Ratio Ham:Spam in TRAINING 6.921455938697318
Ratio Ham:Spam in TEST 6.893129770992366
