Pandas Data Access Basics

In [None]:
import math

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Load the diamonds CSV (path is relative to the notebook's working directory).
diamonds = pd.read_csv("../data/diamonds.csv")
# Preview the first five rows to check that the file parsed as expected.
diamonds.head()

In [None]:
# Structural overview: column names, dtypes, non-null counts, and memory usage.
diamonds.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
diamonds.describe()

In [None]:
diamonds.loc[0, "cut"]      # single cell: row label 0, column "cut"

In [None]:
diamonds.loc[2:5, "cut"]    # row labels 2 through 5 (.loc slices include both endpoints), "cut" column

In [None]:
diamonds.loc[:, ["cut", "color"]]  # all rows, two columns (a list of labels returns a DataFrame)

Data Cleanup Basics

In [None]:
# Count the missing values in each column.
diamonds.isnull().sum()

In [None]:
# Fill missing depth values with the mean of the observed depths.
# NOTE(review): the mean is taken over all rows, before the train/test split
# further down, so held-out rows contribute to the imputed value.
depth_mean = diamonds["depth"].mean()
diamonds = diamonds.assign(depth=diamonds["depth"].fillna(depth_mean))

In [None]:
# Drop every row that still contains a missing value, then discard the old
# index so the remaining rows are numbered 0..n-1 again.
diamonds = diamonds.dropna().reset_index(drop=True)

Machine Learning Basics

In [None]:
# Predictors: every column except the price, with non-numeric columns
# expanded into one indicator column per category.
predictors = diamonds.drop(columns=["price"])
features = pd.get_dummies(predictors)

# Response: the price column the models will learn to predict.
target = diamonds["price"]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs. (train_test_split is imported in the top import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

Linear Regression

In [None]:
# Fit a linear regression on the training split.
# (LinearRegression is imported in the top import cell.)
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Score the linear model on the held-out rows.
# (mean_squared_error and math are imported in the top import cell.)
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
# Take the square root so the error is in the same units as the price.
rmse = math.sqrt(mse)

print(f"Linear Regression RMSE: ${rmse:.2f}")

Random Forest

In [None]:
# Train the model: 100 trees, seeded so the fit is repeatable.
# (RandomForestRegressor, mean_squared_error and math are imported in the top
# import cell rather than re-imported here.)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict and evaluate on the same held-out rows used for the linear model.
rf_preds = rf.predict(X_test)
# Note: this rebinds `rmse`, replacing the linear model's score stored under that name.
rmse = math.sqrt(mean_squared_error(y_test, rf_preds))

print(f"Random Forest RMSE: ${rmse:.2f}")