# EDA

## Import Libraries

In [1]:
import os
import pandas as pd
from ydata_profiling import ProfileReport
import wandb
import warnings

## Initialize Wandb

In [2]:
# Resolve the notebook name from WANDB_NOTEBOOK_NAME, falling back to this
# notebook's file name when the variable is not set.
notebook_name = os.environ.get("WANDB_NOTEBOOK_NAME", "EDA.ipynb")
print(f"Running notebook: {notebook_name}")

Running notebook: EDA.ipynb


In [None]:
# Start a W&B run for this EDA session; `run` is reused by later cells to
# log artifacts and files.
run = wandb.init(
    project="nyc_airbnb",
    group="eda",
    save_code=True,
)

In [None]:
# Fetch the latest version of the sample.csv artifact and read it into a DataFrame
sample_artifact = wandb.use_artifact("sample.csv:latest")
local_path = sample_artifact.file()
df = pd.read_csv(local_path)

# First look at the data: dimensions, then the leading rows
print(df.shape)
df.head()

## Generate Profile Report

In [None]:
# Generate the profile report and write it to persistent storage.
# Warnings are silenced only while the report is built: catch_warnings()
# restores the previous filter configuration when the block exits, so the
# suppression cannot leak into later cells.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True)
    profile.to_file("ydata_report.html")

In [None]:
# Install the "default" warning action so warnings are displayed again
# (first occurrence per location) for the rest of the notebook.
warnings.filterwarnings("default")

## Data Cleaning
- name - only 7 values missing. Let's just drop those rows.
- host_name - only 8 missing values. My guess is that when we drop the rows with a missing name, most of these will be dropped as well. Let's just drop both; this is a trivial number of rows.
- price - drop outliers: keep only rows whose price is between 10 and 350.
- last_review - change to date format.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) to spot outliers such as extreme prices
df.describe()

In [None]:
# Column dtypes and non-null counts, used to find columns with missing values
df.info()

In [None]:
# Keep only listings whose price lies within [min_price, max_price];
# Series.between includes both endpoints by default.
min_price, max_price = 10, 350
print(df.shape)
price_in_range = df['price'].between(min_price, max_price)
df = df.loc[price_in_range].copy()
print(df.shape)

In [None]:
# Parse the last_review column into datetimes, then check the resulting dtypes
parsed_last_review = pd.to_datetime(df['last_review'])
df['last_review'] = parsed_last_review
df.info()

In [None]:
# Drop rows missing either the listing name or the host name,
# showing the shape before and after.
required_columns = ['name', 'host_name']
print(df.shape)
df.dropna(subset=required_columns, inplace=True)
df.shape

## Finish Run

In [None]:
# Publish the cleaned sample to W&B as a dataset artifact

# Write the cleaned DataFrame to disk without the index column
cleaned_csv_path = "sample_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)

# Wrap the CSV in a dataset artifact and log it on the current run
cleaned_artifact = wandb.Artifact('sample_cleaned', type='dataset')
cleaned_artifact.add_file(cleaned_csv_path)
run.log_artifact(cleaned_artifact)

In [None]:
# Log this notebook file to W&B as a "notebook" artifact.
# NOTE(review): the file is read from disk as-is, so presumably the notebook
# should be saved before this cell runs; confirm.
notebook_path = "EDA.ipynb"
notebook_artifact = wandb.Artifact(name="EDA.ipynb", type="notebook")
notebook_artifact.add_file(notebook_path)
run.log_artifact(notebook_artifact)

In [None]:
# Attach the generated profile report to the run, then close the run.
# The save call comes first so the report is registered before the run is finished.
run.save('ydata_report.html')
wandb.finish()