In [None]:
# Third-party imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# 1. Exploratory data analysis
#
# Loads the training CSV and prints, in order: a preview of the first rows,
# per-column and per-row missing-value summaries, the class balance of the
# 'Outcome' column, the correlation matrix of the numeric columns, and a short
# overall summary.
print("\n" + "="*50)
print("EXPLORATORY DATA ANALYSIS")
print("="*50)

# Load in dataset
train_data = pd.read_csv('train_2025_2026.csv')
print(train_data.head())

# Build the null mask once; every missing-value statistic below derives from it.
null_mask = train_data.isnull()
n_rows, n_cols = train_data.shape

# Missing values summary (per column)
missing_by_column = null_mask.sum()
columns_with_missing = missing_by_column[missing_by_column > 0]
print(f"\nColumns with missing values: {len(columns_with_missing)}")
print(f"Columns without missing values: {n_cols - len(columns_with_missing)}")

# Row-level missing info
rows_with_any_missing = null_mask.any(axis=1).sum()
rows_with_all_missing = null_mask.all(axis=1).sum()
# Integer threshold: "count > n_cols // 2" is equivalent to "count > n_cols / 2"
# for integer counts, i.e. strictly more than half of the columns are missing.
rows_with_half_missing = (null_mask.sum(axis=1) > n_cols // 2).sum()

print(f"Rows with any missing values: {rows_with_any_missing} ({rows_with_any_missing/n_rows*100:.1f}%)")
print(f"Rows with all values missing: {rows_with_all_missing} ({rows_with_all_missing/n_rows*100:.1f}%)")
print(f"Rows with >50% values missing: {rows_with_half_missing} ({rows_with_half_missing/n_rows*100:.1f}%)")

print("\nMissing value patterns (first 10 columns with most missing):")
missing_analysis = pd.DataFrame({
    'Missing_Count': missing_by_column,
    'Missing_Percentage': (missing_by_column / n_rows) * 100
}).sort_values('Missing_Count', ascending=False)
print(missing_analysis.head(10))

# Class proportions
print("\n" + "="*50)
print("Proportion of classes in data")
class_counts = train_data['Outcome'].value_counts()
class_proportions = train_data['Outcome'].value_counts(normalize=True) * 100
print("Absolute counts:")
print(class_counts)
print("\nProportions (%):")
print(class_proportions)

# Correlation matrix
# Restrict to numeric/bool columns explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# instead, so calling it on the raw frame breaks as soon as any column
# (e.g. a string label) is not numeric.
corr_matrix = train_data.select_dtypes(include=['number', 'bool']).corr()
print(corr_matrix)

print("\n" + "="*50)
print("EXPLORATORY DATA ANALYSIS SUMMARY")
print("="*50)
print(f"Dataset: {n_rows} rows, {n_cols} columns")
print(f"Total missing values: {missing_by_column.sum()}")


In [None]:
# Guided Exercise 2: Undersampling to balance classes
#
# Re-reads the raw training CSV, downsamples every 'Outcome' class (without
# replacement) to the size of the smallest class, and writes the result to
# 'training_balanced_data.csv'.
from sklearn.utils import resample
imbalanced_data = pd.read_csv('train_2025_2026.csv')

class_counts = imbalanced_data['Outcome'].value_counts()
print(class_counts)
num_min = class_counts.min()
print('Number of samples of the class with fewer samples:', num_min)

# value_counts() ignores missing labels but unique() reports NaN as a value;
# a NaN "class" selects zero rows (NaN == NaN is False) and resample() would
# then fail asking for num_min rows from an empty frame. Dropping NaN first
# keeps the classes (and their first-appearance order) otherwise unchanged.
class_labels = imbalanced_data['Outcome'].dropna().unique()

# Same seed for every class so the cell is reproducible run to run.
balanced_list = [
    resample(
        imbalanced_data[imbalanced_data['Outcome'] == class_],
        replace=False,
        n_samples=num_min,
        random_state=42,
    )
    for class_ in class_labels
]

# Rows stay grouped class-by-class here; nothing is shuffled in this cell.
balanced_data = pd.concat(balanced_list)
print('New class distribution:\n', balanced_data['Outcome'].value_counts())
balanced_data.to_csv('training_balanced_data.csv', index=False)


In [None]:
# Logistic Regression model pipeline
#
# Trains an impute -> scale -> logistic-regression pipeline on the
# undersampled data written to 'training_balanced_data.csv' and prints the
# accuracy on a held-out 20% split.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
import pandas as pd

data = pd.read_csv('training_balanced_data.csv')
# 'Id' is dropped together with the target so it is not used as a feature.
X = data.drop(columns=['Outcome', 'Id'])
y = data['Outcome']

# Stratify on the label so both splits keep the class balance of the input
# file; a plain random split can leave the two parts unevenly balanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Imputer and scaler live inside the pipeline, so they are fitted on X_train
# only and merely applied to X_test when scoring.
model = make_pipeline(SimpleImputer(), StandardScaler(), LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

# NOTE(review): the held-out split comes from the undersampled file, so this
# accuracy reflects the balanced class distribution, not the original one.
score = model.score(X_test, y_test)
print(f"Model Accuracy: {score:.4f}")
