In [1]:
import sklearn.preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler

import pandas as pd
from prepare import telco_pipeline, telco_prep, telco_modeling, telco_modelingv2


In [2]:
train_X, val_X, test_X, train_y, val_y, test_y = telco_modeling()

In [3]:
train_X.head()

Unnamed: 0,tenure,monthly_charges,internet_dsl,internet_fiber_optics,internet_no_internet,payment_method_bank_transfer,payment_method_credit_card,payment_method_electronic_check,payment_method_mailed_check
0,14,76.45,False,True,False,False,False,True,False
1,5,70.0,True,False,False,False,False,False,True
2,35,75.2,False,True,False,False,False,True,False
3,58,86.1,True,False,False,False,False,True,False
4,2,49.6,True,False,False,False,False,False,True


In [4]:
train_y.head()

5609    False
2209     True
6919     True
2284    False
845      True
Name: churn, dtype: bool

In [5]:
train_X.shape, val_X.shape, test_X.shape, train_y.shape, val_y.shape, test_y.shape

((4930, 9), (1056, 9), (1057, 9), (4930,), (1056,), (1057,))

In [6]:
train_y.shape

(4930,)

In [7]:
train_y.mode()

0    False
Name: churn, dtype: bool

In [8]:
#Get the baseline accuracy. Accuracy = Ratio of true predictions to the total number of predictions
base_acc = (train_y == 0).mean()#when applying mean to a t/f df, the trues values are 1.
base_acc

0.734685598377282

In [9]:
# Instantiate the DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)

# Train the model
dt.fit(train_X, train_y)

# Predict on the training data
train_predictions = dt.predict(train_X)

# Evaluate the model
accuracy = accuracy_score(train_y, train_predictions)
classification_rep = classification_report(train_y, train_predictions)

print("Decision Tree Model Results:")
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)


Decision Tree Model Results:
Accuracy: 0.9904665314401623

Classification Report:
               precision    recall  f1-score   support

       False       0.99      1.00      0.99      3622
        True       0.99      0.97      0.98      1308

    accuracy                           0.99      4930
   macro avg       0.99      0.98      0.99      4930
weighted avg       0.99      0.99      0.99      4930



In [10]:
dt.score(train_X, train_y)

0.9904665314401623

In [11]:

# Instantiate the DecisionTreeClassifier as 'dt'
dt = DecisionTreeClassifier(random_state=42)

# Train the model
dt.fit(train_X, train_y)

# Calculate and print the accuracy on the training data
train_score = dt.score(train_X, train_y)
print("Decision Tree Model Accuracy on Training Data:", train_score)

val_score = dt.score(val_X, val_y)
print("Decision Tree Model Accuracy on Validation Data:", val_score)

Decision Tree Model Accuracy on Training Data: 0.9904665314401623
Decision Tree Model Accuracy on Validation Data: 0.7348484848484849


In [12]:
#Loop to 
seed = 42
train_acc = []
val_acc = []
depth = []

for i in range(1, 10):
    
    clf = DecisionTreeClassifier(max_depth = i, random_state = seed)
    
    clf.fit(train_X, train_y)
    
    depth.append(i)
    
    train_acc.append(clf.score(train_X, train_y))
    
    val_acc.append(clf.score(val_X, val_y))
    
trees = pd.DataFrame({'max_depth': depth,
                      'train_acc': train_acc,
                      'val_acc': val_acc})

trees

Unnamed: 0,max_depth,train_acc,val_acc
0,1,0.734686,0.734848
1,2,0.791278,0.797348
2,3,0.791278,0.797348
3,4,0.793509,0.800189
4,5,0.803245,0.801136
5,6,0.811156,0.797348
6,7,0.818458,0.786932
7,8,0.831643,0.780303
8,9,0.84503,0.776515


In [13]:
def get_forest(train_X, val_X, train_y, val_y):
    '''get random forest accuracy on train and validate data'''

    # create model object and fit it to training data
    rf = RandomForestClassifier(max_depth=7, min_samples_leaf= 10, random_state=42)
    rf.fit(train_X,train_y)

    # print result
    print(f"Accuracy of Random Forest on train is {rf.score(train_X, train_y)}")
    print(f"Accuracy of Random Forest on validate is {rf.score(val_X, val_y)}")
    
get_forest(train_X, val_X, train_y, val_y)

Accuracy of Random Forest on train is 0.8129817444219067
Accuracy of Random Forest on validate is 0.8058712121212122


In [14]:
#Test a combination of depths and min leaf settings. 
seed = 42
train_acc = []
val_acc = []
depth = []
leaf = []

# Extract the target column as a Series
train_y_array = train_y
val_y_array = val_y

for max_depth in range(10, 0, -1):  # Decreasing depth from 10 to 1
    for min_samples_leaf in range(1, 10):  # Increasing min_samples_leaf from 1 to 5
        rf = RandomForestClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf, random_state=seed)
        rf.fit(train_X, train_y)
        
        depth.append(max_depth)
        leaf.append(min_samples_leaf)
        
        train_acc.append(rf.score(train_X, train_y))
        val_acc.append(rf.score(val_X, val_y))

In [15]:
#Print the results
trees = pd.DataFrame({'depth': depth,
                      'train_acc': train_acc,
                      'val_acc': val_acc,
                      'leaf': leaf})

trees.sort_values(by=['val_acc', 'train_acc', 'depth', 'leaf'], ascending=[False, False, True, True]).head()

Unnamed: 0,depth,train_acc,val_acc,leaf
35,7,0.813185,0.803977,9
46,5,0.803245,0.80303,2
34,7,0.813185,0.802083,8
33,7,0.812982,0.802083,7
45,5,0.803448,0.802083,1


In [16]:
seed = 42

logreg = LogisticRegression(random_state = seed, max_iter = 100, multi_class= 'multinomial')

logreg.fit(train_X, train_y)

# print result
print(f"Accuracy of Random Forest on train is {logreg.score(train_X, train_y)}")
print(f"Accuracy of Random Forest on validate is {logreg.score(val_X, val_y)}")

Accuracy of Random Forest on train is 0.8016227180527383
Accuracy of Random Forest on validate is 0.7945075757575758


In [17]:

seed = 42

best_score = 0
best_C = 0
best_train_score = 0

for C in [0.001, 0.01, 0.1, 1, 10, 100]:
    logreg = LogisticRegression(random_state=seed, C=C, max_iter=1000)
    logreg.fit(train_X, train_y)
    
    train_score = logreg.score(train_X, train_y)
    val_score = logreg.score(val_X, val_y)
    
    print(f"C = {C:.3f}, Train Score = {train_score:.4f}, Validation Score = {val_score:.4f}")
    
    if val_score > best_score:
        best_score = val_score
        best_C = C
        best_train_score = train_score

print(f"\nBest C = {best_C:.3f}, Best Train Score = {best_train_score:.4f}, Best Validation Score = {best_score:.4f}")

C = 0.001, Train Score = 0.7884, Validation Score = 0.7992
C = 0.010, Train Score = 0.7943, Validation Score = 0.8002
C = 0.100, Train Score = 0.7980, Validation Score = 0.7992
C = 1.000, Train Score = 0.8022, Validation Score = 0.7945
C = 10.000, Train Score = 0.8014, Validation Score = 0.7964
C = 100.000, Train Score = 0.8016, Validation Score = 0.7964

Best C = 0.010, Best Train Score = 0.7943, Best Validation Score = 0.8002


In [18]:
seed = 42

best_score = 0
best_C = 0
best_train_score = 0
best_penalty = ''
best_solver = ''
best_class_weight = ''
best_multi_class = ''

penalties = ['l1', 'l2']
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
class_weights = [None, 'balanced']
multi_classes = ['ovr', 'multinomial']

for C in [0.001, 0.01, 0.1, 1, 10, 100]:
    for penalty in penalties:
        for solver in solvers:
            for class_weight in class_weights:
                for multi_class in multi_classes:
                    try:
                        logreg = LogisticRegression(
                            random_state=seed, C=C, penalty=penalty,
                            solver=solver, class_weight=class_weight,
                            multi_class=multi_class, max_iter=500
                        )
                        logreg.fit(train_X, train_y)
                        
                        train_score = logreg.score(train_X, train_y)
                        val_score = logreg.score(val_X, val_y)
                        
                        print(f"C = {C:.3f}, Penalty = {penalty}, Solver = {solver}, Class Weight = {class_weight}, Multi-Class = {multi_class}")
                        print(f"Train Score = {train_score:.4f}, Validation Score = {val_score:.4f}")
                        
                        if val_score > best_score:
                            best_score = val_score
                            best_C = C
                            best_train_score = train_score
                            best_penalty = penalty
                            best_solver = solver
                            best_class_weight = class_weight
                            best_multi_class = multi_class
                    except ValueError as e:
                        print("Error:", e)
                        continue

print("\nBest Hyperparameters:")
print(f"Best C = {best_C:.3f}")
print(f"Best Penalty = {best_penalty}")
print(f"Best Solver = {best_solver}")
print(f"Best Class Weight = {best_class_weight}")
print(f"Best Multi-Class = {best_multi_class}")
print(f"Best Train Score = {best_train_score:.4f}")
print(f"Best Validation Score = {best_score:.4f}")


Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
C = 0.001, Penalty = l1, Solver = liblinear, Class Weight = None, Multi-Class = ovr
Train Score = 0.7621, Validation Score = 0.7746
Error: Solver liblinear does not support a multinomial backend.
C = 0.001, Penalty = l1, Solver = liblinear, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.6874, Validation Score = 0.6932
Error: Solver liblinear does not support a m



C = 0.001, Penalty = l2, Solver = saga, Class Weight = None, Multi-Class = ovr
Train Score = 0.7884, Validation Score = 0.7992
C = 0.001, Penalty = l2, Solver = saga, Class Weight = None, Multi-Class = multinomial
Train Score = 0.7913, Validation Score = 0.7945
C = 0.001, Penalty = l2, Solver = saga, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7314, Validation Score = 0.7311
C = 0.001, Penalty = l2, Solver = saga, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7345, Validation Score = 0.7358
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penal



C = 0.010, Penalty = l1, Solver = saga, Class Weight = None, Multi-Class = ovr
Train Score = 0.7870, Validation Score = 0.7973
C = 0.010, Penalty = l1, Solver = saga, Class Weight = None, Multi-Class = multinomial
Train Score = 0.7872, Validation Score = 0.7973
C = 0.010, Penalty = l1, Solver = saga, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7323, Validation Score = 0.7339
C = 0.010, Penalty = l1, Solver = saga, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7320, Validation Score = 0.7339
C = 0.010, Penalty = l2, Solver = newton-cg, Class Weight = None, Multi-Class = ovr
Train Score = 0.7943, Validation Score = 0.8002
C = 0.010, Penalty = l2, Solver = newton-cg, Class Weight = None, Multi-Class = multinomial
Train Score = 0.7966, Validation Score = 0.8002
C = 0.010, Penalty = l2, Solver = newton-cg, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7406, Validation Score = 0.7443
C = 0.010, Penalty = l2, Solver = newton-cg, Class Weight = 



C = 0.010, Penalty = l2, Solver = sag, Class Weight = None, Multi-Class = ovr
Train Score = 0.7941, Validation Score = 0.7992
C = 0.010, Penalty = l2, Solver = sag, Class Weight = None, Multi-Class = multinomial
Train Score = 0.7966, Validation Score = 0.8002
C = 0.010, Penalty = l2, Solver = sag, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7406, Validation Score = 0.7443
C = 0.010, Penalty = l2, Solver = sag, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7424, Validation Score = 0.7509




C = 0.010, Penalty = l2, Solver = saga, Class Weight = None, Multi-Class = ovr
Train Score = 0.7939, Validation Score = 0.7992




C = 0.010, Penalty = l2, Solver = saga, Class Weight = None, Multi-Class = multinomial
Train Score = 0.7970, Validation Score = 0.8011
C = 0.010, Penalty = l2, Solver = saga, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7406, Validation Score = 0.7443
C = 0.010, Penalty = l2, Solver = saga, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7424, Validation Score = 0.7509
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none



C = 0.100, Penalty = l1, Solver = saga, Class Weight = None, Multi-Class = ovr
Train Score = 0.7974, Validation Score = 0.8011




C = 0.100, Penalty = l1, Solver = saga, Class Weight = None, Multi-Class = multinomial
Train Score = 0.7961, Validation Score = 0.8011




C = 0.100, Penalty = l1, Solver = saga, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7460, Validation Score = 0.7528




C = 0.100, Penalty = l1, Solver = saga, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7456, Validation Score = 0.7566
C = 0.100, Penalty = l2, Solver = newton-cg, Class Weight = None, Multi-Class = ovr
Train Score = 0.7972, Validation Score = 0.8002
C = 0.100, Penalty = l2, Solver = newton-cg, Class Weight = None, Multi-Class = multinomial
Train Score = 0.7986, Validation Score = 0.7983
C = 0.100, Penalty = l2, Solver = newton-cg, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7458, Validation Score = 0.7557
C = 0.100, Penalty = l2, Solver = newton-cg, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7493, Validation Score = 0.7557
C = 0.100, Penalty = l2, Solver = lbfgs, Class Weight = None, Multi-Class = ovr
Train Score = 0.7980, Validation Score = 0.7992
C = 0.100, Penalty = l2, Solver = lbfgs, Class Weight = None, Multi-Class = multinomial
Train Score = 0.7986, Validation Score = 0.7983
C = 0.100, Penalty = l2, Solver = lbfgs, Clas



C = 0.100, Penalty = l2, Solver = sag, Class Weight = None, Multi-Class = ovr
Train Score = 0.7968, Validation Score = 0.7983
C = 0.100, Penalty = l2, Solver = sag, Class Weight = None, Multi-Class = multinomial
Train Score = 0.7986, Validation Score = 0.7983




C = 0.100, Penalty = l2, Solver = sag, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7446, Validation Score = 0.7557




C = 0.100, Penalty = l2, Solver = sag, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7489, Validation Score = 0.7566




C = 0.100, Penalty = l2, Solver = saga, Class Weight = None, Multi-Class = ovr
Train Score = 0.7980, Validation Score = 0.7992




C = 0.100, Penalty = l2, Solver = saga, Class Weight = None, Multi-Class = multinomial
Train Score = 0.7982, Validation Score = 0.7983




C = 0.100, Penalty = l2, Solver = saga, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7460, Validation Score = 0.7557




C = 0.100, Penalty = l2, Solver = saga, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7479, Validation Score = 0.7557
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
C = 1.000, Penalty = l1, Solver = liblinear, Class Weight = None, Multi-Class = ovr
Train Score = 0.8014, Validation Score = 0.7955
Error: Solver liblinear does not support a multinomial backend.
C = 1.000, Penalty = l1, Solver = libline



C = 1.000, Penalty = l1, Solver = saga, Class Weight = None, Multi-Class = ovr
Train Score = 0.7980, Validation Score = 0.8002




C = 1.000, Penalty = l1, Solver = saga, Class Weight = None, Multi-Class = multinomial
Train Score = 0.8008, Validation Score = 0.7973




C = 1.000, Penalty = l1, Solver = saga, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7471, Validation Score = 0.7595




C = 1.000, Penalty = l1, Solver = saga, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7491, Validation Score = 0.7604
C = 1.000, Penalty = l2, Solver = newton-cg, Class Weight = None, Multi-Class = ovr
Train Score = 0.8022, Validation Score = 0.7945
C = 1.000, Penalty = l2, Solver = newton-cg, Class Weight = None, Multi-Class = multinomial
Train Score = 0.8016, Validation Score = 0.7945
C = 1.000, Penalty = l2, Solver = newton-cg, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7511, Validation Score = 0.7566
C = 1.000, Penalty = l2, Solver = newton-cg, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7515, Validation Score = 0.7557
C = 1.000, Penalty = l2, Solver = lbfgs, Class Weight = None, Multi-Class = ovr
Train Score = 0.8022, Validation Score = 0.7945
C = 1.000, Penalty = l2, Solver = lbfgs, Class Weight = None, Multi-Class = multinomial
Train Score = 0.8016, Validation Score = 0.7945
C = 1.000, Penalty = l2, Solver = lbfgs, Clas



C = 1.000, Penalty = l2, Solver = sag, Class Weight = None, Multi-Class = ovr
Train Score = 0.8006, Validation Score = 0.7983
C = 1.000, Penalty = l2, Solver = sag, Class Weight = None, Multi-Class = multinomial
Train Score = 0.8022, Validation Score = 0.7936




C = 1.000, Penalty = l2, Solver = sag, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7495, Validation Score = 0.7604
C = 1.000, Penalty = l2, Solver = sag, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7513, Validation Score = 0.7566




C = 1.000, Penalty = l2, Solver = saga, Class Weight = None, Multi-Class = ovr
Train Score = 0.7982, Validation Score = 0.8011




C = 1.000, Penalty = l2, Solver = saga, Class Weight = None, Multi-Class = multinomial
Train Score = 0.8008, Validation Score = 0.7973




C = 1.000, Penalty = l2, Solver = saga, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7469, Validation Score = 0.7595




C = 1.000, Penalty = l2, Solver = saga, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7493, Validation Score = 0.7604
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
C = 10.000, Penalty = l1, Solver = liblinear, Class Weight = None, Multi-Class = ovr
Train Score = 0.8016, Validation Score = 0.7964
Error: Solver liblinear does not support a multinomial backend.
C = 10.000, Penalty = l1, Solver = libli



C = 10.000, Penalty = l1, Solver = saga, Class Weight = None, Multi-Class = ovr
Train Score = 0.7982, Validation Score = 0.8011




C = 10.000, Penalty = l1, Solver = saga, Class Weight = None, Multi-Class = multinomial
Train Score = 0.8012, Validation Score = 0.7945




C = 10.000, Penalty = l1, Solver = saga, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7460, Validation Score = 0.7595




C = 10.000, Penalty = l1, Solver = saga, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7499, Validation Score = 0.7604
C = 10.000, Penalty = l2, Solver = newton-cg, Class Weight = None, Multi-Class = ovr
Train Score = 0.8014, Validation Score = 0.7964
C = 10.000, Penalty = l2, Solver = newton-cg, Class Weight = None, Multi-Class = multinomial
Train Score = 0.8016, Validation Score = 0.7964
C = 10.000, Penalty = l2, Solver = newton-cg, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7511, Validation Score = 0.7547
C = 10.000, Penalty = l2, Solver = newton-cg, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7509, Validation Score = 0.7547
C = 10.000, Penalty = l2, Solver = lbfgs, Class Weight = None, Multi-Class = ovr
Train Score = 0.8014, Validation Score = 0.7964
C = 10.000, Penalty = l2, Solver = lbfgs, Class Weight = None, Multi-Class = multinomial
Train Score = 0.8016, Validation Score = 0.7964
C = 10.000, Penalty = l2, Solver = lbf



C = 10.000, Penalty = l2, Solver = sag, Class Weight = None, Multi-Class = ovr
Train Score = 0.8012, Validation Score = 0.7945
C = 10.000, Penalty = l2, Solver = sag, Class Weight = None, Multi-Class = multinomial
Train Score = 0.8016, Validation Score = 0.7964




C = 10.000, Penalty = l2, Solver = sag, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7499, Validation Score = 0.7604
C = 10.000, Penalty = l2, Solver = sag, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7515, Validation Score = 0.7557




C = 10.000, Penalty = l2, Solver = saga, Class Weight = None, Multi-Class = ovr
Train Score = 0.7982, Validation Score = 0.8011




C = 10.000, Penalty = l2, Solver = saga, Class Weight = None, Multi-Class = multinomial
Train Score = 0.8010, Validation Score = 0.7945




C = 10.000, Penalty = l2, Solver = saga, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7462, Validation Score = 0.7595




C = 10.000, Penalty = l2, Solver = saga, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7499, Validation Score = 0.7604
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
Error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
C = 100.000, Penalty = l1, Solver = liblinear, Class Weight = None, Multi-Class = ovr
Train Score = 0.8016, Validation Score = 0.7964
Error: Solver liblinear does not support a multinomial backend.
C = 100.000, Penalty = l1, Solver = li



C = 100.000, Penalty = l1, Solver = saga, Class Weight = None, Multi-Class = ovr
Train Score = 0.7982, Validation Score = 0.8011




C = 100.000, Penalty = l1, Solver = saga, Class Weight = None, Multi-Class = multinomial
Train Score = 0.8010, Validation Score = 0.7936




C = 100.000, Penalty = l1, Solver = saga, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7465, Validation Score = 0.7595




C = 100.000, Penalty = l1, Solver = saga, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7499, Validation Score = 0.7604
C = 100.000, Penalty = l2, Solver = newton-cg, Class Weight = None, Multi-Class = ovr
Train Score = 0.8016, Validation Score = 0.7964
C = 100.000, Penalty = l2, Solver = newton-cg, Class Weight = None, Multi-Class = multinomial
Train Score = 0.8016, Validation Score = 0.7964
C = 100.000, Penalty = l2, Solver = newton-cg, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7507, Validation Score = 0.7547
C = 100.000, Penalty = l2, Solver = newton-cg, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7507, Validation Score = 0.7547
C = 100.000, Penalty = l2, Solver = lbfgs, Class Weight = None, Multi-Class = ovr
Train Score = 0.8016, Validation Score = 0.7964
C = 100.000, Penalty = l2, Solver = lbfgs, Class Weight = None, Multi-Class = multinomial
Train Score = 0.8016, Validation Score = 0.7964
C = 100.000, Penalty = l2, Solv



C = 100.000, Penalty = l2, Solver = sag, Class Weight = None, Multi-Class = ovr
Train Score = 0.8010, Validation Score = 0.7936
C = 100.000, Penalty = l2, Solver = sag, Class Weight = None, Multi-Class = multinomial
Train Score = 0.8016, Validation Score = 0.7964




C = 100.000, Penalty = l2, Solver = sag, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7499, Validation Score = 0.7604
C = 100.000, Penalty = l2, Solver = sag, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7515, Validation Score = 0.7557




C = 100.000, Penalty = l2, Solver = saga, Class Weight = None, Multi-Class = ovr
Train Score = 0.7982, Validation Score = 0.8011




C = 100.000, Penalty = l2, Solver = saga, Class Weight = None, Multi-Class = multinomial
Train Score = 0.8010, Validation Score = 0.7936




C = 100.000, Penalty = l2, Solver = saga, Class Weight = balanced, Multi-Class = ovr
Train Score = 0.7465, Validation Score = 0.7595
C = 100.000, Penalty = l2, Solver = saga, Class Weight = balanced, Multi-Class = multinomial
Train Score = 0.7499, Validation Score = 0.7604

Best Hyperparameters:
Best C = 0.010
Best Penalty = l2
Best Solver = liblinear
Best Class Weight = None
Best Multi-Class = ovr
Best Train Score = 0.7966
Best Validation Score = 0.8011




In [19]:
#Test a range of neighbors  train_X.shape, val_X.shape, test_X.shape, train_y.shape
train_acc = []
val_acc = []
neighbor = []

for nb_range in range(1, 100):
    knn = KNeighborsClassifier(n_neighbors= nb_range)
    knn.fit(train_X, train_y)
    
    neighbor.append(nb_range)
    
    train_acc.append(knn.score(train_X, train_y))
    val_acc.append(knn.score(val_X, val_y))


In [20]:
knn_df = pd.DataFrame({'neighbor': neighbor,
                      'train_acc': train_acc,
                      'val_acc': val_acc})

knn_df.sort_values(by=['val_acc', 'train_acc', 'neighbor'], ascending=[False, False, True]).head(5)

Unnamed: 0,neighbor,train_acc,val_acc
20,21,0.804462,0.804924
58,59,0.797566,0.803977
36,37,0.797769,0.80303
50,51,0.796552,0.80303
43,44,0.796349,0.80303


In [21]:
def get_knn(train_X, val_X, train_y, val_y):

    # create model object and fit it to the training data
    knn = KNeighborsClassifier(n_neighbors=17, weights='distance')
    knn.fit(train_X, train_y)

    # print results
    print(f"Accuracy of Logistic Regression on train is {knn.score(train_X, train_y)}")
    print(f"Accuracy of Logistic Regression on validate is {knn.score(val_X, val_y)}")


In [22]:
get_knn(train_X, val_X, train_y, val_y)

Accuracy of Logistic Regression on train is 0.9904665314401623
Accuracy of Logistic Regression on validate is 0.7727272727272727


In [23]:

# Instantiate the KNeighborsClassifier model
knn_model = KNeighborsClassifier()

# Define the parameter grid
param_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

# Instantiate GridSearchCV
grid_search = GridSearchCV(knn_model, param_grid, cv=5)

# Fit the grid search to the data
grid_search.fit(train_X, train_y)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)


NameError: name 'GridSearchCV' is not defined

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

def get_knn(train_X, val_X, train_y, val_y):
    # Instantiate the KNeighborsClassifier model
    knn_model = KNeighborsClassifier()

    # Define the parameter grid
    param_grid = {
        'n_neighbors': list(range(1, 50)),  # Values from 1 to 20
        'weights': ['uniform', 'distance'],
        'p': [1, 2],
    }

    # Instantiate GridSearchCV
    grid_search = GridSearchCV(knn_model, param_grid, cv=5)

    # Fit the grid search to the training data
    grid_search.fit(train_X, train_y)

    # Get the best parameters and best estimator
    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    # Predict on validation data using the best estimator
    val_predictions = best_estimator.predict(val_X)

    # Calculate accuracy on validation data
    val_accuracy = accuracy_score(val_y, val_predictions)

    print("Best Parameters:", best_params)
    print("Validation Accuracy:", val_accuracy)

# Assuming you have train_X, val_X, train_y, val_y defined
get_knn(train_X, val_X, train_y, val_y)


Best Parameters: {'n_neighbors': 42, 'p': 1, 'weights': 'uniform'}
Validation Accuracy: 0.7964015151515151


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

def get_knn(train_X, val_X, train_y, val_y):
    best_val_accuracy = 0.0
    results = []

    for n_neighbors in range(1, 21):
        for weights in ['uniform', 'distance']:
            for p in [1, 2]:
                knn_model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, p=p)
                knn_model.fit(train_X, train_y)

                train_predictions = knn_model.predict(train_X)
                val_predictions = knn_model.predict(val_X)

                train_accuracy = accuracy_score(train_y, train_predictions)
                val_accuracy = accuracy_score(val_y, val_predictions)

                results.append((n_neighbors, weights, p, train_accuracy, val_accuracy))

                if val_accuracy > best_val_accuracy:
                    best_val_accuracy = val_accuracy

    results.sort(key=lambda x: x[4], reverse=True)

    for result in results:
        n_neighbors, weights, p, train_accuracy, val_accuracy = result
        print(f"n_neighbors: {n_neighbors}, weights: {weights}, p: {p}")
        print(f"Train Accuracy: {train_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}\n")

# Assuming train_X, val_X, train_y, and val_y are available
get_knn(train_X, val_X, train_y, val_y)


n_neighbors: 17, weights: uniform, p: 1
Train Accuracy: 0.8122, Validation Accuracy: 0.8030

n_neighbors: 20, weights: uniform, p: 1
Train Accuracy: 0.8087, Validation Accuracy: 0.8021

n_neighbors: 16, weights: uniform, p: 1
Train Accuracy: 0.8077, Validation Accuracy: 0.7983

n_neighbors: 19, weights: uniform, p: 1
Train Accuracy: 0.8116, Validation Accuracy: 0.7973

n_neighbors: 20, weights: uniform, p: 2
Train Accuracy: 0.8030, Validation Accuracy: 0.7973

n_neighbors: 15, weights: uniform, p: 1
Train Accuracy: 0.8134, Validation Accuracy: 0.7964

n_neighbors: 19, weights: uniform, p: 2
Train Accuracy: 0.8037, Validation Accuracy: 0.7964

n_neighbors: 18, weights: uniform, p: 1
Train Accuracy: 0.8095, Validation Accuracy: 0.7955

n_neighbors: 10, weights: uniform, p: 1
Train Accuracy: 0.8172, Validation Accuracy: 0.7926

n_neighbors: 17, weights: uniform, p: 2
Train Accuracy: 0.8055, Validation Accuracy: 0.7917

n_neighbors: 8, weights: uniform, p: 1
Train Accuracy: 0.8247, Validat