In [None]:
# Dataset Source:
# I found this dataset on Kaggle (Titanic Dataset).

# Variables:
# X1 (independent variable): Pclass (ticket class: 1 = 1st, 2 = 2nd, 3 = 3rd)
# X2 (independent variable): Age (passenger's age)
# Y (dependent variable): Fare (ticket price paid)

# Why I chose these:
# - "Pclass" is numeric (ordinal: 1, 2, 3) and strongly relates to ticket cost.
# - "Age" is numeric and may also influence ticket price (e.g., adult vs. child).
# - "Fare" is a continuous numeric variable, making it a suitable target for regression.

# Why this is a good prediction sample:
# - The Titanic dataset is well-known and provides enough rows (891 passengers, somewhat fewer once rows with missing values are dropped) to train and test a regression model.
# - "Fare" is continuous and works well as Y, while "Pclass" and "Age" are solid numeric predictors.
# - This setup allows us to practice multiple regression with real-world data while avoiding classification targets like "Survived."


In [None]:
# Step 1: Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Step 2: Load the Titanic CSV from its GitHub-hosted copy
url = "https://raw.githubusercontent.com/KylePhan1/ITEC4700Assignment5/main/Titanic-Dataset.csv"

# Step 3: Keep only the modelling columns and discard incomplete rows.
# dropna() removes a row when ANY of the kept columns is missing,
# so the regression never sees a NaN in either the predictors or the target.
model_columns = ['Pclass', 'Age', 'Fare']
df = pd.read_csv(url)[model_columns].dropna()

# Step 4: Separate predictors (X) from the target (y)
feature_columns = ['Pclass', 'Age']
X = df[feature_columns]   # independent variables
y = df['Fare']            # dependent variable

# Step 5: Hold out 20% of the rows for testing; the fixed random_state
# makes the shuffle (and therefore the split) repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Step 6: Build the linear regression and fit it on the training rows
# (fit() returns the estimator itself, so construction and fitting chain).
model = LinearRegression().fit(X_train, y_train)

# Step 7: Predict fares for the held-out test rows
y_pred = model.predict(X_test)

# Step 8: Evaluate model
# Compute the hold-out metrics up front, then print every figure
# from one label/value table so the report order is explicit.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

report = [
    ("Coefficients:", model.coef_),        # one slope per predictor column
    ("Intercept:", model.intercept_),
    ("R^2 Score:", r2),
    ("Mean Squared Error:", mse),
]
for label, value in report:
    print(label, value)

# Step 9: Visualization - scatter of predicted against actual fares
# for the test rows, drawn through an explicit figure/axes pair.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, alpha=0.7)   # partial transparency so overlapping points stay visible
ax.set_xlabel("Actual Fare")
ax.set_ylabel("Predicted Fare")
ax.set_title("Actual vs Predicted Titanic Fares")
plt.show()

