## Importing Modules

In [51]:
import re
import string
import pandas as pd
import csv
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Loading The Data

In [52]:
# Read the tab-separated SMS corpus. Column names are supplied explicitly, and
# QUOTE_NONE tells the parser to treat quote characters as ordinary text.
raw_data = pd.read_csv(
    './data/SMSSpamCollection',
    sep='\t',
    names=["Label", "Message"],
    quoting=csv.QUOTE_NONE,
)
raw_data.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Preprocessing

In [53]:
# Converting Ham To 0 And Spam To 1 In The Label Column
raw_data.loc[raw_data.Label == 'spam', 'Label'] = 1
raw_data.loc[raw_data.Label == 'ham', 'Label'] = 0
# The masked assignments above swap the strings for ints but do not change the
# column's dtype, which may therefore remain `object`. Cast explicitly so the
# target is a true integer column; the cast also fails loudly if any value
# other than 'ham'/'spam' was present. Re-running this cell is still a no-op.
raw_data['Label'] = raw_data['Label'].astype('int64')
# Class balance after encoding (displayed as the cell output).
raw_data['Label'].value_counts()

0    4827
1     747
Name: Label, dtype: int64

In [54]:
# Wrap the label column and the message column as single-column DataFrames.
lab, txt = (pd.DataFrame(raw_data[column]) for column in ("Label", "Message"))

In [55]:
# Preview the first five rows now that the labels are numeric.
raw_data.head(n=5)

Unnamed: 0,Label,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Data Splitting

In [56]:
# Train/Test/Validation Split (seed 108)
# Step 1: hold out 15% of all rows as the test partition.
X_train, X_test, y_train, y_test = train_test_split(
    txt,
    raw_data["Label"],
    test_size=0.15,
    random_state=108,
)
# Step 2: hold out 15% of the remaining rows as the validation partition.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    random_state=108,
)
# Step 3: glue labels and messages back together, label column first.
train_set, test_set, validation_set = (
    pd.concat([targets, messages], axis=1)
    for targets, messages in ((y_train, X_train), (y_test, X_test), (y_val, X_val))
)

In [57]:
# Discard the shuffled original row labels so each partition is numbered from 0.
train_set, test_set, validation_set = (
    partition.reset_index(drop=True)
    for partition in (train_set, test_set, validation_set)
)

In [58]:
## Checking Data Profile With Seed 108
# Report the ham (0) / spam (1) counts of each partition, one line per split.
for split_name, split_labels in (("train", y_train), ("validation", y_val), ("test", y_test)):
    zeros = sum(split_labels == 0)
    ones = sum(split_labels == 1)
    print(f"The number of 0s in the {split_name} set is: {zeros}. The number of 1s in the {split_name} set is: {ones}.")

The number of 0s in the train set is: 3501. The number of 1s in the train set is: 525.
The number of 0s in the validation set is: 625. The number of 1s in the validation set is: 86.
The number of 0s in the test set is: 701. The number of 1s in the test set is: 136.


In [59]:
# Storing And Saving The Datasets As 3 Separate CSV Files
# index=False keeps the (already reset) positional index out of the files, so
# this first tracked version has the same two-column layout (Label, Message)
# as the files written later with the other seed. Without it, reading the CSVs
# back yields an extra unnamed index column.
train_set.to_csv("Data/train.csv", index=False)
test_set.to_csv("Data/test.csv", index=False)
validation_set.to_csv("Data/validation.csv", index=False)

## Working with Git and DVC to track data

In [60]:
# import sys
# !{sys.executable} -m pip install dvc

In [61]:
# ! git init
# ! dvc init

In [62]:
# ! dvc add Data
# ! git add Data.dvc
# ! git commit -m "Tracking Data"

In [63]:
# ! dvc remote add -d storage gdrive://1EmEwAn76sQuzONuBhR2rfib3a4olsrPX
# ! dvc push

### Data Splitting Again

In [64]:
# Repeat the two-stage 15% / 15% hold-out, this time with seed 27.
X_train, X_test, y_train, y_test = train_test_split(
    txt, raw_data["Label"],
    test_size=0.15, random_state=27,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.15, random_state=27,
)
# Label column first, message column second, for every partition.
train_set, test_set, validation_set = (
    pd.concat([targets, messages], axis=1)
    for targets, messages in ((y_train, X_train), (y_test, X_test), (y_val, X_val))
)

In [65]:
# Renumber each partition from 0, dropping the shuffled original row labels.
train_set, test_set, validation_set = (
    partition.reset_index(drop=True)
    for partition in (train_set, test_set, validation_set)
)

In [66]:
## Checking Data Profile With Seed 27
# Report the ham (0) / spam (1) counts of each partition, one line per split.
for split_name, split_labels in (("train", y_train), ("validation", y_val), ("test", y_test)):
    zeros = sum(split_labels == 0)
    ones = sum(split_labels == 1)
    print(f"The number of 0s in the {split_name} set is: {zeros}. The number of 1s in the {split_name} set is: {ones}.")

The number of 0s in the train set is: 3497. The number of 1s in the train set is: 529.
The number of 0s in the validation set is: 612. The number of 1s in the validation set is: 99.
The number of 0s in the test set is: 718. The number of 1s in the test set is: 119.


In [67]:
# Storing And Saving The Datasets As 3 Separate CSV Files
# Each partition is written without its index column.
for partition, csv_path in (
    (train_set, "Data/train.csv"),
    (test_set, "Data/test.csv"),
    (validation_set, "Data/validation.csv"),
):
    partition.to_csv(csv_path, index=False)

In [68]:
# ! dvc add Data
# ! git add Data.dvc
# ! git commit -m "Seed Changes"
# ! dvc push

In [69]:
# ! git log --oneline

### Getting First Version

In [70]:
# ! git checkout HEAD^1 Data.dvc
# ! dvc checkout

In [71]:
# Re-load the three partitions from the files currently checked out in Data/.
train, val, test = map(
    pd.read_csv,
    ("Data/train.csv", "Data/validation.csv", "Data/test.csv"),
)

In [72]:
## Checking Data Profile Before Update
# Count labels in the frames that were just re-loaded from Data/*.csv.
# The in-memory y_train / y_val / y_test still hold the most recent split, so
# using them here would report that split instead of the checked-out version.
print(f"The number of 0s in the train set is: {sum(train['Label'] == 0)}. The number of 1s in the train set is: {sum(train['Label'] == 1)}.")
print(f"The number of 0s in the validation set is: {sum(val['Label'] == 0)}. The number of 1s in the validation set is: {sum(val['Label'] == 1)}.")
print(f"The number of 0s in the test set is: {sum(test['Label'] == 0)}. The number of 1s in the test set is: {sum(test['Label'] == 1)}.")

The number of 0s in the train set is: 3497. The number of 1s in the train set is: 529.
The number of 0s in the validation set is: 612. The number of 1s in the validation set is: 99.
The number of 0s in the test set is: 718. The number of 1s in the test set is: 119.


In [73]:
# ! git checkout HEAD Data.dvc      
# ! dvc checkout                                                      

### Getting Second Version

In [74]:
# Re-load the three partitions from the files currently checked out in Data/.
train, val, test = map(
    pd.read_csv,
    ("Data/train.csv", "Data/validation.csv", "Data/test.csv"),
)

In [75]:
## Checking Data Profile After Update
# Count labels in the frames that were just re-loaded from Data/*.csv, so the
# numbers describe the files on disk rather than the in-memory
# y_train / y_val / y_test variables left over from the last split.
print(f"The number of 0s in the train set is: {sum(train['Label'] == 0)}. The number of 1s in the train set is: {sum(train['Label'] == 1)}.")
print(f"The number of 0s in the validation set is: {sum(val['Label'] == 0)}. The number of 1s in the validation set is: {sum(val['Label'] == 1)}.")
print(f"The number of 0s in the test set is: {sum(test['Label'] == 0)}. The number of 1s in the test set is: {sum(test['Label'] == 1)}.")

The number of 0s in the train set is: 3497. The number of 1s in the train set is: 529.
The number of 0s in the validation set is: 612. The number of 1s in the validation set is: 99.
The number of 0s in the test set is: 718. The number of 1s in the test set is: 119.
