In [1]:
import pandas as pd
import subprocess

def run_command(command):
    """Helper to run shell commands with error handling."""
    try:
        result = subprocess.run(command, shell=True, check=True, capture_output=True, text=True)
        print(result.stdout)
    except subprocess.CalledProcessError as e:
        print(f"Error executing command: {e.stderr}")


In [2]:
def initialize_repo():
    """Initializes Git and DVC if not already done."""
    run_command('git init')
    run_command('dvc init')
    run_command('git branch -M main')

# Check Git initialization
try:
    run_command('git status')
except subprocess.CalledProcessError:
    initialize_repo()


On branch master

No commits yet

Changes to be committed:
  (use "git rm --cached <file>..." to unstage)
	new file:   .dvc/.gitignore
	new file:   .dvc/config
	new file:   .dvcignore

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	ML_Assignment_Jason.ipynb
	movies.csv




In [3]:
# Display settings
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Read the original data
df = pd.read_csv('movies.csv')

# Stage 1: Initial data
run_command('dvc add movies.csv')
run_command('git add movies.csv.dvc')
run_command('git commit -m "Stage 1: Initial data"')

print("Initial Data:")
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))



To track the changes with git, run:

	git add .gitignore movies.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


[master (root-commit) 5274c5c] Stage 1: Initial data
 4 files changed, 11 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore
 create mode 100644 movies.csv.dvc

Initial Data:
| MOVIES                              | YEAR        | GENRE                        | RATING   | ONE-LINE                                                                                                                                                                                                            | STARS                   | VOTES   | RunTime   | Gross   |
|:------------------------------------|:------------|:-----------------------------|:---------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------

In [4]:
# Stage 2: Drop rows with missing values
df.dropna(subset=['RATING', 'VOTES'], inplace=True)
df.to_csv('movies.csv', index=False)
run_command('dvc add movies.csv')  
run_command('git add movies.csv.dvc')
run_command('git commit -m "Stage 2: Drop missing values"')



To track the changes with git, run:

	git add movies.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


[master 9f1a0ab] Stage 2: Drop missing values
 1 file changed, 2 insertions(+), 2 deletions(-)



In [5]:
# Stage 3: Clean and convert `YEAR`
df['YEAR'] = df['YEAR'].astype(str).str.replace(r'[()]', '', regex=True).str.replace(r'[^0-9]', '', regex=True)
df['YEAR'] = pd.to_numeric(df['YEAR'], errors='coerce')
df.to_csv('movies.csv', index=False)
run_command('dvc add movies.csv')  
run_command('git add movies.csv.dvc')
run_command('git commit -m "Stage 3: Clean and convert YEAR"')



To track the changes with git, run:

	git add movies.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


[master a107754] Stage 3: Clean and convert YEAR
 1 file changed, 2 insertions(+), 2 deletions(-)



In [6]:
# Stage 4: Clean `GENRE`
df['GENRE'] = df['GENRE'].astype(str).str.strip()
df.to_csv('movies.csv', index=False)
run_command('dvc add movies.csv')
run_command('git add movies.csv.dvc')
run_command('git commit -m "Stage 4: Clean GENRE"')



To track the changes with git, run:

	git add movies.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


[master 1b68b13] Stage 4: Clean GENRE
 1 file changed, 2 insertions(+), 2 deletions(-)



In [7]:
# Stage 5: Clean and convert `VOTES`
df['VOTES'] = df['VOTES'].astype(str).str.replace(',', '', regex=True)
df['VOTES'] = pd.to_numeric(df['VOTES'], errors='coerce')
df.to_csv('movies.csv', index=False)
run_command('dvc add movies.csv')
run_command('git add movies.csv.dvc')
run_command('git commit -m "Stage 5: Clean and convert VOTES"')



To track the changes with git, run:

	git add movies.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


[master 3391509] Stage 5: Clean and convert VOTES
 1 file changed, 2 insertions(+), 2 deletions(-)



In [8]:
# Stage 6: Clean and convert `Gross` (in millions)
df['Gross'] = df['Gross'].astype(str).str.replace(r'[$,M]', '', regex=True)
df['Gross'] = pd.to_numeric(df['Gross'], errors='coerce')
df.to_csv('movies.csv', index=False)
run_command('dvc add movies.csv')
run_command('git add movies.csv.dvc')
run_command('git commit -m "Stage 6: Clean and convert Gross"')



To track the changes with git, run:

	git add movies.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


[master 73ece56] Stage 6: Clean and convert Gross
 1 file changed, 2 insertions(+), 2 deletions(-)



In [9]:
# Stage 7: Fill missing Gross values
df['Gross'] = df['Gross'].fillna(0)
df.to_csv('movies.csv', index=False) 
run_command('dvc add movies.csv')
run_command('git add movies.csv.dvc')
run_command('git commit -m "Stage 7: Fill missing Gross values"')



To track the changes with git, run:

	git add movies.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


[master 1703f1e] Stage 7: Fill missing Gross values
 1 file changed, 2 insertions(+), 2 deletions(-)



In [10]:
# Stage 8: Drop columns
df.drop(columns=['ONE-LINE', 'STARS'], inplace=True)
df.to_csv('movies.csv', index=False) 
run_command('dvc add movies.csv')
run_command('git add movies.csv.dvc')
run_command('git commit -m "Stage 8: Drop ONE-LINE and STARS columns"')



To track the changes with git, run:

	git add movies.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


[master aae69fb] Stage 8: Drop ONE-LINE and STARS columns
 1 file changed, 2 insertions(+), 2 deletions(-)



In [11]:
# Final Stage: Drop duplicates
num_duplicates = df.duplicated().sum()
if num_duplicates > 0:
    df.drop_duplicates(inplace=True)
    df.to_csv('movies.csv', index=False)
    run_command('dvc add movies.csv')
    run_command('git add movies.csv.dvc')
    run_command('git commit -m "Final stage: Drop duplicates"')



To track the changes with git, run:

	git add movies.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


[master cbd4cb1] Final stage: Drop duplicates
 1 file changed, 2 insertions(+), 2 deletions(-)



In [12]:
# Display Git log
run_command('git log --all')


commit cbd4cb12a8ed61952a54892460a3085120ef080f
Author: Jason Greich <jasongreich@gmail.com>
Date:   Tue Jul 30 21:29:48 2024 +0300

    Final stage: Drop duplicates

commit aae69fb465434aa8a7a419cc6fd8f682e3788d30
Author: Jason Greich <jasongreich@gmail.com>
Date:   Tue Jul 30 21:29:46 2024 +0300

    Stage 8: Drop ONE-LINE and STARS columns

commit 1703f1e352994a38f701cf149e2c4d515cfdf620
Author: Jason Greich <jasongreich@gmail.com>
Date:   Tue Jul 30 21:29:44 2024 +0300

    Stage 7: Fill missing Gross values

commit 73ece567ccfad4d03c316f9029fbb48e6931f49a
Author: Jason Greich <jasongreich@gmail.com>
Date:   Tue Jul 30 21:29:42 2024 +0300

    Stage 6: Clean and convert Gross

commit 3391509cb18ef16a5702db2bf9e2c7541d2cbd21
Author: Jason Greich <jasongreich@gmail.com>
Date:   Tue Jul 30 21:29:40 2024 +0300

    Stage 5: Clean and convert VOTES

commit 1b68b13a8738ff2ad3b87ab4e4aa8fa2f3b8d417
Author: Jason Greich <jasongreich@gmail.com>
Date:   Tue Jul 30 21:29:39 2024 +0300

    St

In [14]:
# Checkout initial commit and data
initial_commit_hash = '5274c5c1c461b380817a77a3eaf0bd468001b2ba'  
run_command(f'git checkout {initial_commit_hash}')
run_command('dvc checkout')

# Read the initial dataset and print the head
df_initial = pd.read_csv('movies.csv')
print("Initial Data After All Changes:")
print(df_initial.head().to_markdown(index=False, numalign="left", stralign="left"))



M       movies.csv

Initial Data After All Changes:
| MOVIES                              | YEAR        | GENRE                        | RATING   | ONE-LINE                                                                                                                                                                                                            | STARS                   | VOTES   | RunTime   | Gross   |
|:------------------------------------|:------------|:-----------------------------|:---------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------|:--------|:----------|:--------|
| Blood Red Sky                       | (2021)      | Action, Horror, Thriller     | 6.1      | A woman with a mysterious illness is forced into action when a group of terrorists attempt to hijack a transatlanti