Title: Model Selection

Task 1: Linear Regression on House Prices<br>
Use Linear Regression and evaluate its performance on the validation set.

In [1]:
# Customer Churn Dataset - Data Splitting (Train, Validation, Test)

import pandas as pd
from sklearn.model_selection import train_test_split

# Example: Simulated telecom churn dataset (replace with actual dataset as needed)
data = {
    'monthly_minutes': [300, 250, 400, 150, 500, 100, 350, 200, 450, 120],
    'customer_support_calls': [1, 3, 0, 5, 2, 6, 1, 4, 0, 7],
    'contract_length_months': [12, 24, 12, 6, 24, 6, 12, 6, 24, 6],
    'churn': [0, 0, 0, 1, 0, 1, 0, 1, 0, 1]
}
df = pd.DataFrame(data)

# Features and target
X = df.drop('churn', axis=1)
y = df['churn']

# First split: train (70%) and temp (30%) with stratify
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Second split: validation (15%) and test (15%) from temp (no stratify due to small size)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print("Train set size:", X_train.shape[0])
print("Validation set size:", X_val.shape[0])
print("Test set size:", X_test.shape[0])


Train set size: 7
Validation set size: 1
Test set size: 2


Task 2: Decision Tree Classifier on Iris Dataset<br>
Train a Decision Tree model and evaluate its performance on validation data.

In [1]:
# Task 2: Decision Tree Classifier on Iris Dataset

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split into train (70%), validation (15%), and test (15%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Train Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Evaluate on validation data
y_val_pred = dt_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

print("Validation Accuracy:", val_accuracy)
print("Classification Report:\n", classification_report(y_val, y_val_pred, target_names=iris.target_names))

Validation Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         9
  versicolor       1.00      1.00      1.00         7
   virginica       1.00      1.00      1.00         6

    accuracy                           1.00        22
   macro avg       1.00      1.00      1.00        22
weighted avg       1.00      1.00      1.00        22



Task 3:  Random Forest on Customer Churn<br>
Apply Random Forest and assess its accuracy on the validation set.

In [2]:
# Task 3: Random Forest on Customer Churn

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Use the same train/val/test splits from the earlier churn dataset code
# X_train, X_val, y_train, y_val are already defined

# Train Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on validation set
y_val_pred = rf_model.predict(X_val)

# Evaluate accuracy
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)
print("Classification Report:\n", classification_report(y_val, y_val_pred))

Validation Accuracy: 0.9090909090909091
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       0.78      1.00      0.88         7
           2       1.00      0.67      0.80         6

    accuracy                           0.91        22
   macro avg       0.93      0.89      0.89        22
weighted avg       0.93      0.91      0.91        22

