In [1]:
import pandas as pd

# Load the user-behaviour dataset; `is_ultra` is the column used as the
# classification target further down.
df = pd.read_csv('/datasets/users_behavior.csv')

print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare: wrapping it in print() emits a stray "None" line.
df.info()

print("\nSummary Statistics:")
print(df.describe())

print("\nMissing Values:")
print(df.isna().sum())

# normalize=True reports class shares instead of raw counts, which is what
# matters when judging class imbalance.
print("\nTarget Distribution (is_ultra):")
print(df['is_ultra'].value_counts(normalize=True))

   calls  minutes  messages   mb_used  is_ultra
0   40.0   311.90      83.0  19915.42         0
1   85.0   516.75      56.0  22696.96         0
2   77.0   467.66      86.0  21060.45         0
3  106.0   745.53      81.0   8437.39         1
4   66.0   418.74       1.0  14502.75         0

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   float64
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   float64
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 125.7 KB
None

Summary Statistics:
             calls      minutes     messages       mb_used     is_ultra
count  3214.000000  3214.000000  3214.000000   3214.000000  3214.000000
mean     63.038892   438.208787    38.281269  17207.673836     0.306472
std      33.236368   234

Split the data into train, validation, and test sets (60/20/20), stratifying on the target so that the ratio of Smart to Ultra users is preserved in every set.

In [2]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictors.
y = df['is_ultra']
X = df.drop('is_ultra', axis=1)

# Stage 1: hold out 20% of all rows as the test set, stratified on the label.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=12345,
)

# Stage 2: split the remaining 80% into train and validation.
# A quarter of 80% is 20% of the full data, giving 60/20/20 overall.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    stratify=y_train_val,
    random_state=12345,
)

# Report how many rows ended up in each split.
for size_label, features in (
    ("Train set size:", X_train),
    ("Validation set size:", X_valid),
    ("Test set size:", X_test),
):
    print(size_label, features.shape[0])

# Confirm that stratification kept the class shares aligned across splits.
for dist_label, target in (
    ("\nTrain target distribution:\n", y_train),
    ("\nValidation target distribution:\n", y_valid),
    ("\nTest target distribution:\n", y_test),
):
    print(dist_label, target.value_counts(normalize=True))

Train set size: 1928
Validation set size: 643
Test set size: 643

Train target distribution:
 0    0.693465
1    0.306535
Name: is_ultra, dtype: float64

Validation target distribution:
 0    0.693624
1    0.306376
Name: is_ultra, dtype: float64

Test target distribution:
 0    0.693624
1    0.306376
Name: is_ultra, dtype: float64


Train several models with default hyperparameters to see which one performs best on the validation set.

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit each candidate on the training split with default hyperparameters and
# score it on the validation split. estimator.fit() returns the estimator
# itself, so construction and fitting are chained.

# Decision tree
dt_model = DecisionTreeClassifier(random_state=12345).fit(X_train, y_train)
dt_valid_pred = dt_model.predict(X_valid)
dt_valid_acc = accuracy_score(y_valid, dt_valid_pred)

# Random forest
rf_model = RandomForestClassifier(random_state=12345).fit(X_train, y_train)
rf_valid_pred = rf_model.predict(X_valid)
rf_valid_acc = accuracy_score(y_valid, rf_valid_pred)

# Logistic regression
lr_model = LogisticRegression(solver='liblinear', random_state=12345).fit(X_train, y_train)
lr_valid_pred = lr_model.predict(X_valid)
lr_valid_acc = accuracy_score(y_valid, lr_valid_pred)

# Validation accuracies, one per line
print(
    f"Decision Tree validation accuracy: {dt_valid_acc:.4f}",
    f"Random Forest validation accuracy: {rf_valid_acc:.4f}",
    f"Logistic Regression validation accuracy: {lr_valid_acc:.4f}",
    sep="\n",
)

Decision Tree validation accuracy: 0.7481
Random Forest validation accuracy: 0.8134
Logistic Regression validation accuracy: 0.7045


Tune the hyperparameters of the decision tree and the random forest, then compare their best validation accuracies to see which model is best.

In [4]:
# Decision tree: sweep max_depth over 1..15 and keep the first depth that
# reaches the highest validation accuracy (only strict improvements replace
# the incumbent).
best_dt_depth, best_dt_acc = None, 0
for depth in range(1, 16):
    model = DecisionTreeClassifier(random_state=12345, max_depth=depth).fit(X_train, y_train)
    pred = model.predict(X_valid)
    acc = accuracy_score(y_valid, pred)
    if not acc > best_dt_acc:
        continue
    best_dt_depth, best_dt_acc = depth, acc
print(f"Best Decision Tree: depth={best_dt_depth}, acc={best_dt_acc:.4f}")

# Random forest: grid over max_depth 2..10 and four forest sizes, visited
# depth-major so that ties resolve to the shallowest / smallest setting.
rf_grid = [(depth, n) for depth in range(2, 11) for n in [10, 50, 100, 200]]
best_rf_params, best_rf_acc = (None, None), 0
for depth, n in rf_grid:
    model = RandomForestClassifier(
        n_estimators=n, max_depth=depth, random_state=12345
    ).fit(X_train, y_train)
    pred = model.predict(X_valid)
    acc = accuracy_score(y_valid, pred)
    if not acc > best_rf_acc:
        continue
    best_rf_params, best_rf_acc = (depth, n), acc
print(f"Best Random Forest: depth={best_rf_params[0]}, n_estimators={best_rf_params[1]}, acc={best_rf_acc:.4f}")

Best Decision Tree: depth=5, acc=0.8165
Best Random Forest: depth=8, n_estimators=10, acc=0.8320


The random forest is the best model, as it has the higher validation accuracy (0.8320 vs. 0.8165 for the tuned decision tree).

Test the final model.

In [5]:
# Model selection is finished, so refit on train + validation: the final
# model then learns from every labelled row that is not part of the test set.
X_train_full = pd.concat([X_train, X_valid])
y_train_full = pd.concat([y_train, y_valid])

# Take the winning hyperparameters from the tuning sweep rather than
# hard-coding them, so this cell cannot drift out of sync with the search
# result if the data, grid, or seed changes.
best_depth, best_n_estimators = best_rf_params
final_model = RandomForestClassifier(
    max_depth=best_depth,
    n_estimators=best_n_estimators,
    random_state=12345,
)
final_model.fit(X_train_full, y_train_full)

# Single, final evaluation on the held-out test set.
test_pred = final_model.predict(X_test)
test_acc = accuracy_score(y_test, test_pred)

print(f"Final Random Forest Test Accuracy: {test_acc:.4f}")

Final Random Forest Test Accuracy: 0.8149


The test accuracy (0.8149) is above the 0.75 threshold and only slightly below our validation accuracy (0.8320), so the model generalizes well and isn't overfitting much.

Sanity check

In [6]:
# 1. Baseline accuracy
# A majority-class baseline must choose its constant prediction from the
# training labels and only then be scored on the test labels; picking the
# majority class from y_test itself would peek at the held-out data.
majority_class = y_train_full.value_counts().idxmax()
baseline_acc = (y_test == majority_class).mean()
print(f"Baseline accuracy (majority class): {baseline_acc:.4f}")

# 2. Sample predictions
# .iloc makes the positional "first ten rows" intent explicit; .values drops
# the shuffled index so both columns align row by row.
sample_results = pd.DataFrame({
    'Actual': y_test.iloc[:10].values,
    'Predicted': test_pred[:10]
})
print("\nSample predictions vs actual:")
print(sample_results)

Baseline accuracy (majority class): 0.6936

Sample predictions vs actual:
   Actual  Predicted
0       0          0
1       0          0
2       1          1
3       0          0
4       0          0
5       0          0
6       0          0
7       0          0
8       0          0
9       0          0


The predictions match the actual values in the small sample shown. The model correctly classified examples of both classes, not just the majority class.

The goal of the project was to predict whether a customer would choose the Smart or the Ultra plan.
The best model was a random forest (max_depth=8, n_estimators=10).
Validation accuracy = 0.8320.
Test accuracy = 0.8149 (above the 0.75 threshold).
Baseline (majority-class) accuracy = 0.6936, so the model's 0.8149 is a major improvement.