In [50]:
import os
import pandas as pd
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
#reading the data set
spam = pd.read_csv("raw_data.csv", header=0, names=['spamorham', 'text'])
print(spam.shape)
spam.head(10)

(5571, 2)


Unnamed: 0,spamorham,text
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,spam,FreeMsg Hey there darling it's been 3 week's n...
5,ham,Even my brother is not like to speak with me. ...
6,ham,As per your request 'Melle Melle (Oru Minnamin...
7,spam,WINNER!! As a valued network customer you have...
8,spam,Had your mobile 11 months or more? U R entitle...
9,ham,I'm gonna be home soon and i don't want to tal...


# Data Preprocessing

In [3]:
#creating a target col, with 0 for ham, 1 for spam
df = spam.drop(columns="spamorham")
df["spam"] = spam["spamorham"].apply(lambda x:1 if 'spam' in str(x) else 0)
df.columns=['text','target']
df.head(10)

Unnamed: 0,text,target
0,Ok lar... Joking wif u oni...,0
1,Free entry in 2 a wkly comp to win FA Cup fina...,1
2,U dun say so early hor... U c already then say...,0
3,"Nah I don't think he goes to usf, he lives aro...",0
4,FreeMsg Hey there darling it's been 3 week's n...,1
5,Even my brother is not like to speak with me. ...,0
6,As per your request 'Melle Melle (Oru Minnamin...,0
7,WINNER!! As a valued network customer you have...,1
8,Had your mobile 11 months or more? U R entitle...,1
9,I'm gonna be home soon and i don't want to tal...,0


# Exploratory Data Analysis 

In [4]:
#creating two seperate dfs: 1 for spam and 1 for non spam messages only
df_s = df.loc[ df['target']==1]
df_ns = df.loc[ df['target']==0]
    
df_s['len'] = [len(x) for x in df_s["text"]]
spamavg = df_s.len.mean()
print('df_s.head(5)')
print(df_s.head(5))

print('\n\ndf_ns.head(5)')
df_ns['len'] = [len(x) for x in df_ns["text"]]
nonspamavg = df_ns.len.mean()
print(df_ns.head(5))

df_s.head(5)
                                                 text  target  len
1   Free entry in 2 a wkly comp to win FA Cup fina...       1  155
4   FreeMsg Hey there darling it's been 3 week's n...       1  148
7   WINNER!! As a valued network customer you have...       1  158
8   Had your mobile 11 months or more? U R entitle...       1  154
10  SIX chances to win CASH! From 100 to 20,000 po...       1  136


df_ns.head(5)
                                                text  target  len
0                      Ok lar... Joking wif u oni...       0   29
2  U dun say so early hor... U c already then say...       0   49
3  Nah I don't think he goes to usf, he lives aro...       0   61
5  Even my brother is not like to speak with me. ...       0   77
6  As per your request 'Melle Melle (Oru Minnamin...       0  160


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_s['len'] = [len(x) for x in df_s["text"]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ns['len'] = [len(x) for x in df_ns["text"]]


In [5]:
df['length'] = df['text'].apply(lambda x: len(''.join([a for a in x if a.isdigit()])))

print(df.head(10))

                                                text  target  length
0                      Ok lar... Joking wif u oni...       0       0
1  Free entry in 2 a wkly comp to win FA Cup fina...       1      25
2  U dun say so early hor... U c already then say...       0       0
3  Nah I don't think he goes to usf, he lives aro...       0       0
4  FreeMsg Hey there darling it's been 3 week's n...       1       4
5  Even my brother is not like to speak with me. ...       0       0
6  As per your request 'Melle Melle (Oru Minnamin...       0       1
7  WINNER!! As a valued network customer you have...       1      19
8  Had your mobile 11 months or more? U R entitle...       1      13
9  I'm gonna be home soon and i don't want to tal...       0       0


# splitting the data into train/validation/test (random state 42)

In [36]:
X_train, X_validation_test, y_train, y_validation_test = train_test_split(df['text'], df['target'], test_size=0.30, random_state=42)
X_test, X_validation, y_test, y_validation = train_test_split(X_validation_test, y_validation_test, test_size=0.5, random_state=42)

train_data = pd.concat([y_train, X_train], axis=1)
test_data = pd.concat([y_test,X_test],axis=1)
validation_data =pd.concat([y_validation,X_validation],axis=1)

# storing the splits at train.csv/validation.csv/test.csv

In [37]:
train_data.to_csv("data/train.csv")
test_data.to_csv("data/test.csv")
validation_data.to_csv("data/validation.csv")

In [38]:
# checking with seed 42
print("Train - num of 0s:", sum(y_train == 0), "num of 1s:", sum(y_train == 1))
print("Validation - num of 0s:", sum(y_validation == 0), "num of 1s:", sum(y_validation== 1))
print("Test - num of 0s:", sum(y_test== 0), "num of 1s:", sum(y_test== 1))

Train set - number of 0s: 3371 number of 1s: 528
Validation set - number of 0s: 722 number of 1s: 114
Test set - number of 0s: 731 number of 1s: 105


# data version control

In [41]:
#!pip install dvc[all]
#!git init
#!dvc init

In [42]:
#!dvc add Data
#!git add Data.dvc
#!git commit -m "started Tracking Data"

In [43]:
#!dvc remote add -d storage gdrive://1AGQIW8I4WJATybnk8JscnCnhkMN7bIyu
#!dvc push

In [44]:
#!git remote add origin https://github.com/KrPriyank/AML-2023.git

# splitting the data into train/validation/test (random state 123)

In [25]:
X_train, X_validation_test, y_train, y_validation_test = train_test_split(df['text'], df['target'], test_size=0.30, random_state=123)
X_test, X_validation, y_test, y_validation = train_test_split(X_validation_test, y_validation_test, test_size=0.5, random_state=123)

train_data = pd.concat([y_train, X_train], axis=1)
test_data = pd.concat([y_test,X_test],axis=1)
validation_data =pd.concat([y_validation,X_validation],axis=1)

In [26]:
# checking with seed 123
print("Train - num of 0s:", sum(y_train == 0), "num of 1s:", sum(y_train == 1))
print("Validation - num of 0s:", sum(y_validation == 0), "num of 1s:", sum(y_validation== 1))
print("Test - num of 0s:", sum(y_test== 0), "num of 1s:", sum(y_test== 1))

Train set - number of 0s: 3381 number of 1s: 518
Validation set - number of 0s: 719 number of 1s: 117
Test set - number of 0s: 724 number of 1s: 112


In [27]:
train_data = pd.concat([y_train, X_train], axis=1)
test_data = pd.concat([y_test,X_test],axis=1)
validation_data =pd.concat([y_validation,X_validation],axis=1)

train_data.to_csv("data/train.csv")
test_data.to_csv("data/test.csv")
validation_data.to_csv("data/validation.csv")

In [45]:
#!git push --set-upstream origin master

In [46]:
#!dvc add data
#!git add data.dvc
#!git commit -m "Seed Changes"
#!dvc push
#!git push

In [47]:
#!git log

Getting the first version i.e, seed 42

In [48]:
#! git checkout 4ccc46b87095b7a1e3c3f1d2556e2c9f33b6d008 Data.dvc      
#! dvc checkout  

In [39]:
train = pd.read_csv('data/train.csv')
val = pd.read_csv('data/validation.csv')
test = pd.read_csv('data/test.csv')

In [40]:
# checking before update
print("Train - num of 0s:", sum(train['target'] == 0), "num of 1s:", sum(train['target'] == 1))
print("Validation - num of 0s:", sum(val['target'] == 0), "num of 1s:", sum(val['target'] == 1))
print("Test - num of 0s:", sum(test['target'] == 0), "num of 1s:", sum(test['target'] == 1))

Train set - number of 0s: 3371 number of 1s: 528
Validation set - number of 0s: 722 number of 1s: 114
Test set - number of 0s: 731 number of 1s: 105


Getting the second version i.e, seed 123

In [49]:
#! git checkout 6be76f998582c2d9002945db6ebc22db68062b95 Data.dvc      
#! dvc checkout

In [33]:
train = pd.read_csv('data/train.csv')
val = pd.read_csv('data/validation.csv')
test = pd.read_csv('data/test.csv')

In [35]:
# checking before update
print("Train - num of 0s:", sum(train['target'] == 0), "num of 1s:", sum(train['target'] == 1))
print("Validation - num of 0s:", sum(val['target'] == 0), "num of 1s:", sum(val['target'] == 1))
print("Test - num of 0s:", sum(test['target'] == 0), "num of 1s:", sum(test['target'] == 1))

Train set - number of 0s: 3381 number of 1s: 518
Validation set - number of 0s: 719 number of 1s: 117
Test set - number of 0s: 724 number of 1s: 112
