### Importing necessary libraries

In [7]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import random
random.seed(42)
import seaborn as sns
import matplotlib.pyplot as plt
import re

%matplotlib inline
import matplotlib.pyplot as plt

nltk.download('stopwords')

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
data = pd.read_csv("raw_data.csv")

### Preprocessing

In [9]:
def preprocess_text(text):
    text = text.lower()
    regex = f"^subject:\s(.*)"
    match = re.search(regex, text)
    if match:
      text = match.group(1)
    text = re.sub(r"[^a-z .]", "", text)
    words = text.split()
    words = [word for word in words if word.isalpha() and word not in stopwords.words('english')]
    return ' '.join(words)


print("Begin text preprocessing:", end="\n\n")
data["processed_text"] = ""
for i in range(data.shape[0]):
    if i % 500 == 0 and i != 0:
        a = round(i/data.shape[0]*100)
        print("+"*(a//10*4) + "-"*(40-(a//10*4)) + " : "+ str(a) + "% completed")
    data["processed_text"][i] = preprocess_text(data["text"][i])
    if i == data.shape[0]-1:
        print("+"*40 + " : " + "100% completed", end="\n\n")
print("Preprocessing complete")

Begin text preprocessing:

---------------------------------------- : 9% completed
++++------------------------------------ : 17% completed
++++++++-------------------------------- : 26% completed
++++++++++++---------------------------- : 35% completed
++++++++++++++++------------------------ : 44% completed
++++++++++++++++++++-------------------- : 52% completed
++++++++++++++++++++++++---------------- : 61% completed
++++++++++++++++++++++++++++------------ : 70% completed
++++++++++++++++++++++++++++------------ : 79% completed
++++++++++++++++++++++++++++++++-------- : 87% completed
++++++++++++++++++++++++++++++++++++---- : 96% completed
++++++++++++++++++++++++++++++++++++++++ : 100% completed

Preprocessing complete


### Initializing DVC

In [11]:
!dvc init --subdir

Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>


### Setting up remote storage

In [12]:
!dvc remote add --default storage gdrive://1dkzdzTtGUES5kMWg9lk5N6ECtmy2a5u3
!git status

Setting 'storage' as a default remote.
On branch main
Your branch is up to date with 'origin/main'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	new file:   .dvc/.gitignore
	new file:   .dvc/config
	new file:   .dvcignore

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   .dvc/config

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	../.gitignore
	.ipynb_checkpoints/
	prepare.ipynb



### First adding raw_data to dvc

In [16]:
!dvc add raw_data.csv


To track the changes with git, run:

	git add .gitignore raw_data.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


\u280b Checking graph



In [18]:
!git add .gitignore raw_data.csv.dvc

In [19]:
!git add .dvc/config

In [20]:
!git commit -m "Adding the raw_data.csv"

[main 1c2e538] Adding the raw_data.csv
 4 files changed, 11 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Assignment 2/raw_data.csv.dvc


In [21]:
!dvc push

1 file pushed


### Splitting the data (Version 1)

In [22]:
# Breaking the dataset into 70%, 15%, 15% for train, validation and test respectively
train, val_test = train_test_split(data[["processed_text", "spam"]], test_size = 0.30, random_state=42)
val, test = train_test_split(val_test, test_size = 0.50, random_state=42)

In [23]:
train.to_csv("train.csv", index=False)
val.to_csv("validation.csv", index=False)
test.to_csv("test.csv", index=False)

### Adding these split files to dvc tracking

In [24]:
!dvc add train.csv test.csv validation.csv


To track the changes with git, run:

	git add train.csv.dvc test.csv.dvc .gitignore validation.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


\u280b Checking graph



In [25]:
!git add train.csv.dvc test.csv.dvc .gitignore validation.csv.dvc

In [26]:
!git commit -m "Version 1 split"

[main ecca3f3] Version 1 split
 4 files changed, 18 insertions(+)
 create mode 100644 Assignment 2/test.csv.dvc
 create mode 100644 Assignment 2/train.csv.dvc
 create mode 100644 Assignment 2/validation.csv.dvc


### Splitting the data (Version 2)

In [27]:
# Breaking the dataset into 70%, 15%, 15% for train, validation and test respectively
train, val_test = train_test_split(data[["processed_text", "spam"]], test_size = 0.30, random_state=21)
val, test = train_test_split(val_test, test_size = 0.50, random_state=21)

In [28]:
train.to_csv("train.csv", index=False)
val.to_csv("validation.csv", index=False)
test.to_csv("test.csv", index=False)

### Adding these dvc files to dvc tracking

In [29]:
!dvc add train.csv test.csv validation.csv


To track the changes with git, run:

	git add validation.csv.dvc test.csv.dvc train.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


\u280b Checking graph



In [30]:
!git add validation.csv.dvc test.csv.dvc train.csv.dvc

In [31]:
!git commit -m "Version 2 split"

[main d56e477] Version 2 split
 3 files changed, 6 insertions(+), 6 deletions(-)


### Accessing different version splits

In [32]:
!git log

commit d56e477102e984884aa91d89f1245a0079e7e796
Author: Alok Dhar Dubey <ialok00001@gmail.com>
Date:   Tue Feb 20 22:47:49 2024 +0530

    Version 2 split

commit ecca3f39daefa2ad9193e183be75971bde532cb2
Author: Alok Dhar Dubey <ialok00001@gmail.com>
Date:   Tue Feb 20 22:42:02 2024 +0530

    Version 1 split

commit 1c2e5387b2985988488b64e87666519995010972
Author: Alok Dhar Dubey <ialok00001@gmail.com>
Date:   Tue Feb 20 22:08:44 2024 +0530

    Adding the raw_data.csv

commit 7ebfcd47955a68652bb86bbfaed3585a2de30e69
Author: Alok Dhar Dubey <ialok00001@gmail.com>
Date:   Tue Feb 20 22:04:02 2024 +0530

    stop tracking raw_data.csv

commit ec775949902edf9a3d8d3c03b668d7140d734282
Author: Alok Dhar Dubey <ialok00001@gmail.com>
Date:   Tue Feb 20 20:16:43 2024 +0530

    copied emails.csv into raw_data.csv

commit 7edb0e05b764f43db66f69d4fa9aa528a32bcf09
Author: Alok Dhar Dubey <ialok00001@gmail.com>
Date:   Tue Feb 20 19:53:37 2024 +0530

    Updated README.md according to assignments

In [33]:
!git checkout ecca3f39daefa2ad9193e183be75971bde532cb2

Note: switching to 'ecca3f39daefa2ad9193e183be75971bde532cb2'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at ecca3f3 Version 1 split


In [34]:
!dvc checkout

M       validation.csv
M       train.csv
M       test.csv


Distribution of Version 1 split

In [35]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
val = pd.read_csv("validation.csv")

In [36]:
test_y = test["spam"]
train_y = train["spam"]
val_y = val["spam"]

In [38]:
print("Training data:", end = "\n\n")
print("Number of 0 =", np.sum(train_y == 0))
print("Number of 1 =", np.sum(train_y == 1), end = "\n\n\n\n")
print("Validation data:", end = "\n\n")
print("Number of 0 =", np.sum(val_y == 0))
print("Number of 1 =", np.sum(val_y == 1), end = "\n\n\n\n")
print("Testing data:", end = "\n\n")
print("Number of 0 =", np.sum(test_y == 0))
print("Number of 1 =", np.sum(test_y == 1))

Training data:

Number of 0 = 3082
Number of 1 = 927



Validation data:

Number of 0 = 645
Number of 1 = 214



Testing data:

Number of 0 = 633
Number of 1 = 227


Distribution of Version 2 split

In [41]:
!git checkout d56e477102e984884aa91d89f1245a0079e7e796

Note: switching to 'd56e477102e984884aa91d89f1245a0079e7e796'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at d56e477 Version 2 split


In [43]:
!dvc checkout

M       validation.csv
M       train.csv
M       test.csv


In [44]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
val = pd.read_csv("validation.csv")

In [45]:
test_y = test["spam"]
train_y = train["spam"]
val_y = val["spam"]

In [46]:
print("Training data:", end = "\n\n")
print("Number of 0 =", np.sum(train_y == 0))
print("Number of 1 =", np.sum(train_y == 1), end = "\n\n\n\n")
print("Validation data:", end = "\n\n")
print("Number of 0 =", np.sum(val_y == 0))
print("Number of 1 =", np.sum(val_y == 1), end = "\n\n\n\n")
print("Testing data:", end = "\n\n")
print("Number of 0 =", np.sum(test_y == 0))
print("Number of 1 =", np.sum(test_y == 1))

Training data:

Number of 0 = 3045
Number of 1 = 964



Validation data:

Number of 0 = 661
Number of 1 = 198



Testing data:

Number of 0 = 654
Number of 1 = 206


Coming back to main branch

In [48]:
!git checkout main

Your branch is ahead of 'origin/main' by 4 commits.
  (use "git push" to publish your local commits)


Switched to branch 'main'


#### Push all data versions to gdrive

In [49]:
!dvc push

3 files pushed
