In [1]:
import pandas as pd
import csv
import numpy as np
import re
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [8]:
def load_data(file_path):
    """Load a tab-separated label/message file into a DataFrame.

    The file is read without a header row; the two tab-separated fields on
    each line are named "label" and "message".

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the
            tab-separated file to read.

    Returns:
        pandas.DataFrame with the columns "label" and "message".
    """
    # Read from the path the caller passed in. The argument used to be ignored
    # in favour of a hard-coded file name, so any other path silently loaded
    # the wrong file (or failed if that file was absent).
    messages = pd.read_csv(file_path, sep='\t', names=["label", "message"])
    return messages


def preprocess_data(df):
    """Clean the message text: lowercase it and keep only purely alphabetic words.

    Each message is lowercased, every run of ASCII letters that is delimited
    by word boundaries is extracted, and the runs are joined back together
    with single spaces. Digits and punctuation are dropped; a letter run that
    touches a digit or underscore (e.g. the "nd" in "2nd") is not a whole word
    and is dropped as well. The result is still one string per message, not a
    list of tokens.

    Args:
        df: DataFrame with a "message" column of strings.

    Returns:
        A new DataFrame with the cleaned "message" column. The input frame is
        left unmodified, so the raw text stays available to the caller and
        re-running the cell is harmless.
    """
    word_pattern = re.compile(r'\b[a-zA-Z]+\b')

    def _clean(text):
        # Lowercase first, then keep only the alphabetic words.
        return ' '.join(word_pattern.findall(text.lower()))

    # Work on a copy: assigning into the caller's frame would overwrite the
    # raw messages as a hidden side effect.
    cleaned = df.copy()
    cleaned['message'] = cleaned['message'].apply(_clean)
    return cleaned


def split_data(df, seed, test_size=0.2, validation_size=0.1):
    """Split the data into train, validation, and test sets.

    The split is done in two steps, both seeded with ``seed``:

    1. ``test_size`` of ``df`` is held out as the test set.
    2. ``validation_size`` of the *remaining* rows is held out as the
       validation set; whatever is left is the training set.

    Note that ``validation_size`` is relative to the rows left after step 1,
    not to the full dataset. With the defaults this gives roughly
    72% train / 8% validation / 20% test of the original rows.

    Args:
        df: DataFrame to split.
        seed: Value passed as ``random_state`` to both splits.
        test_size: Share of ``df`` used for the test set (forwarded to
            ``train_test_split``). Defaults to 0.2.
        validation_size: Share of the non-test rows used for the validation
            set (forwarded to ``train_test_split``). Defaults to 0.1.

    Returns:
        Tuple ``(train, validation, test)``.
    """
    remainder, test = train_test_split(df, test_size=test_size, random_state=seed)
    train, validation = train_test_split(
        remainder, test_size=validation_size, random_state=seed
    )
    return train, validation, test
    

def save_data_splits(train, validation, test):
    """Write the three splits to CSV files, without the row index.

    The files are written with relative names, so they land in the current
    working directory and overwrite any existing files of the same name.

    Args:
        train: Training split, written to train.csv.
        validation: Validation split, written to validation.csv.
        test: Test split, written to test.csv.
    """
    outputs = (
        ('train.csv', train),
        ('validation.csv', validation),
        ('test.csv', test),
    )
    for filename, frame in outputs:
        frame.to_csv(filename, index=False)

In [4]:
# Read the SMS collection and preview the first five rows
data = load_data(file_path='SMSSpamCollection')
data.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Clean the message text, then persist the cleaned frame to raw_data.csv
data = preprocess_data(data)
# index=False keeps the row index out of the file, consistent with how
# save_data_splits writes the split files; with the index included, reading
# the CSV back produces an extra "Unnamed: 0" column.
data.to_csv('raw_data.csv', index=False)

In [None]:
# Initialize Git repository in the notebook's current working directory
!git init

# Initialize DVC in the same directory; this creates the .dvc/ directory and
# the .dvcignore file that are committed to Git further down
!dvc init

Initialized empty Git repository in C:/Users/naren/OneDrive/Documents/CMI/Semester 4/Applied Machine Learning/Assignment_2/.git/
Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>


In [7]:
# Add raw data to DVC. This produces raw_data.csv.dvc and a .gitignore entry;
# those two files (not the CSV) are what gets staged in Git later.
!dvc add raw_data.csv


To track the changes with git, run:

	git add .gitignore raw_data.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph



### Data Preparation:

In [9]:
# First version of the splits: seed 42, written to train/validation/test CSVs
train, validation, test = split_data(df=data, seed=42)
save_data_splits(train=train, validation=validation, test=test)

### Tracking Data with DVC:

In [10]:
# Add train, validation, and test data splits to DVC
# (creates one <name>.csv.dvc file per split for Git to track)
!dvc add train.csv validation.csv test.csv


To track the changes with git, run:

	git add .gitignore train.csv.dvc validation.csv.dvc test.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph



### Commit to Git:

In [11]:
# Stage DVC files and .gitignore (the CSV files themselves are not staged)
!git add *.dvc .gitignore

# Commit changes: this first commit is the version of the data that
# `git checkout HEAD~1` returns to later in the notebook
!git commit -m "Initial train/validation/test split"

[main (root-commit) 7df6b88] Initial train/validation/test split
 8 files changed, 30 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore
 create mode 100644 .gitignore
 create mode 100644 raw_data.csv.dvc
 create mode 100644 test.csv.dvc
 create mode 100644 train.csv.dvc
 create mode 100644 validation.csv.dvc


### Update Data Split with New Random Seed:

In [13]:
# Second version of the splits: same data, seed 17; overwrites the three CSVs
train, validation, test = split_data(df=data, seed=17)
save_data_splits(train=train, validation=validation, test=test)

### Track updated data with DVC

In [14]:
# Track updated data with DVC: re-adding the overwritten CSVs updates the
# existing train/validation/test .dvc files
!dvc add train.csv validation.csv test.csv


To track the changes with git, run:

	git add train.csv.dvc test.csv.dvc validation.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph



### Commit updated split

In [15]:
# Commit updated split: stage the changed .dvc files and record them as the
# second version of the data
!git add *.dvc
!git commit -m "Updated data split with new random seed"

[main 5435e7b] Updated data split with new random seed
 3 files changed, 6 insertions(+), 6 deletions(-)


### Check out the first version (initial split, seed 42)

In [16]:
# Step Git back one commit to the initial split; this restores the older .dvc
# files and leaves the repository in a detached-HEAD state
!git checkout HEAD~1  
# Bring train/validation/test CSVs in line with the .dvc files now checked out
!dvc checkout

Note: switching to 'HEAD~1'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 7df6b88 Initial train/validation/test split


M       test.csv
M       train.csv
M       validation.csv


### Target Variable Distributions Before Update:

In [18]:
# Class balance of each split file as currently checked out (first version)
for split in ('train', 'validation', 'test'):
    df = pd.read_csv(f'{split}.csv')
    label_counts = df['label'].value_counts()
    print(f'Distribution in {split}.csv:')
    print(label_counts, '\n')

Distribution in train.csv:
ham     3468
spam     543
Name: label, dtype: int64 

Distribution in validation.csv:
ham     391
spam     55
Name: label, dtype: int64 

Distribution in test.csv:
ham     966
spam    149
Name: label, dtype: int64 



### Check out the updated version (second split, seed 17)

In [19]:
# Return to the tip of main (the commit with the updated split), which also
# ends the detached-HEAD state
!git checkout main  
# Bring train/validation/test CSVs in line with the .dvc files now checked out
!dvc checkout

Previous HEAD position was 7df6b88 Initial train/validation/test split
Switched to branch 'main'


M       validation.csv
M       train.csv
M       test.csv


### Target Variable Distributions After Update:

In [20]:
# Class balance of each split file as currently checked out (updated version)
for split in ('train', 'validation', 'test'):
    df = pd.read_csv(f'{split}.csv')
    label_counts = df['label'].value_counts()
    print(f'Distribution in {split}.csv:')
    print(label_counts, '\n')

Distribution in train.csv:
ham     3467
spam     544
Name: label, dtype: int64 

Distribution in validation.csv:
ham     388
spam     58
Name: label, dtype: int64 

Distribution in test.csv:
ham     970
spam    145
Name: label, dtype: int64 

