In [1]:
import pandas as pd 

# To perform numerical operations
import numpy as np

# To visualize data
import seaborn as sns

# To partition the data
from sklearn.model_selection import train_test_split

# importing the library of KNN
from sklearn.neighbors import KNeighborsClassifier  

# Importing performance metrics - accuracy score & confusion matrix
from sklearn.metrics import accuracy_score,confusion_matrix

###############################################################################
# =============================================================================
# Importing data
# =============================================================================
data = pd.read_csv('income(1) (2).csv',na_values=[" ?"]) 

# =============================================================================
# Data pre-processing
# =============================================================================

data.isnull().sum()

missing = data[data.isnull().any(axis=1)]
# axis=1 => to consider at least one column value is missing in a row

""" Points to note:
1. Missing values in Jobtype    = 1809
2. Missing values in Occupation = 1816 
3. There are 1809 rows where two specific 
   columns i.e. occupation & JobType have missing values
4. (1816-1809) = 7 => You still have occupation unfilled for 
   these 7 rows. Because, jobtype is Never worked
"""

data2 = data.dropna(axis=0)


# Reindexing the salary status names to 0,1
data2['SalStat']=data2['SalStat'].map({' less than or equal to 50,000':0,' greater than 50,000':1})
print(data2['SalStat'])

new_data=pd.get_dummies(data2, drop_first=True)

# Storing the column names 
columns_list=list(new_data.columns)
print(columns_list)

# Separating the input names from data
features=list(set(columns_list)-set(['SalStat']))
print(features)

# Storing the output values in y
y=new_data['SalStat'].values
print(y)

# Storing the values from input features
x = new_data[features].values
print(x)

# Splitting the data into train and test
train_x,test_x,train_y,test_y = train_test_split(x,y,test_size=0.3, random_state=0)

# =============================================================================
# KNN
# =============================================================================

# Storing the K nearest neighbors classifier
KNN_classifier = KNeighborsClassifier(n_neighbors = 5)  

# Fitting the values for X and Y
KNN_classifier.fit(train_x, train_y) 

# Predicting the test values with model
prediction = KNN_classifier.predict(test_x)

# Performance metric check
confusionMmatrix = confusion_matrix(test_y, prediction)
print(confusionMmatrix)

# Calculating the accuracy
accuracy_score=accuracy_score(test_y, prediction)
print(accuracy_score)

print('Misclassified samples: %d' % (test_y != prediction).sum())

"""
Effect of K value on classifier
"""
Misclassified_sample = []
# Calculating error for K values between 1 and 20
for i in range(1, 20):  
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(train_x, train_y)
    pred_i = knn.predict(test_x)
    Misclassified_sample.append((test_y != pred_i).sum())

print(Misclassified_sample)
# =============================================================================
# END OF SCRIPT
# =============================================================================


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['SalStat']=data2['SalStat'].map({' less than or equal to 50,000':0,' greater than 50,000':1})


0        0
1        0
2        1
3        0
4        0
        ..
31973    0
31974    0
31975    0
31976    0
31977    0
Name: SalStat, Length: 30162, dtype: int64
['age', 'capitalgain', 'capitalloss', 'hoursperweek', 'SalStat', 'JobType_ Local-gov', 'JobType_ Private', 'JobType_ Self-emp-inc', 'JobType_ Self-emp-not-inc', 'JobType_ State-gov', 'JobType_ Without-pay', 'EdType_ 11th', 'EdType_ 12th', 'EdType_ 1st-4th', 'EdType_ 5th-6th', 'EdType_ 7th-8th', 'EdType_ 9th', 'EdType_ Assoc-acdm', 'EdType_ Assoc-voc', 'EdType_ Bachelors', 'EdType_ Doctorate', 'EdType_ HS-grad', 'EdType_ Masters', 'EdType_ Preschool', 'EdType_ Prof-school', 'EdType_ Some-college', 'maritalstatus_ Married-AF-spouse', 'maritalstatus_ Married-civ-spouse', 'maritalstatus_ Married-spouse-absent', 'maritalstatus_ Never-married', 'maritalstatus_ Separated', 'maritalstatus_ Widowed', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Hand

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Make sure x and y are correctly defined with your data

# Initialize a KNN classifier with a specified number of neighbors (k)
k = 5  # You can adjust the number of neighbors as needed
model = KNeighborsClassifier(n_neighbors=k)

# Initialize an empty list to store accuracy scores
accuracy_scores = []

# Perform KNN classification for 15 iterations
for i in range(15):
    # Split the data into training and testing sets (adjust sizes as needed)
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=i)
    
    # Train the model on the training data
    model.fit(train_x, train_y)
    
    # Make predictions on the test data
    predictions = model.predict(test_x)
    
    # Calculate accuracy and append to the list
    accuracy = accuracy_score(test_y, predictions)
    accuracy_scores.append(accuracy)

# Print the accuracy scores for each iteration
for i, score in enumerate(accuracy_scores):
    print(f'Iteration {i + 1} - Accuracy: {score:.2%}')


Iteration 1 - Accuracy: 83.50%
Iteration 2 - Accuracy: 83.43%
Iteration 3 - Accuracy: 83.48%
Iteration 4 - Accuracy: 83.90%
Iteration 5 - Accuracy: 83.71%
Iteration 6 - Accuracy: 83.85%
Iteration 7 - Accuracy: 83.72%
Iteration 8 - Accuracy: 84.33%
Iteration 9 - Accuracy: 83.05%
Iteration 10 - Accuracy: 83.61%
Iteration 11 - Accuracy: 83.77%
Iteration 12 - Accuracy: 83.66%
Iteration 13 - Accuracy: 83.99%
Iteration 14 - Accuracy: 83.74%
Iteration 15 - Accuracy: 83.43%


In [3]:
from sklearn.neighbors import KNeighborsClassifier  # Import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Make sure x and y are correctly defined with data

# Initialize empty lists to store accuracy scores and losses
accuracy_scores = []
losses = []  # Define a list to store your custom loss values

# Perform K-Nearest Neighbors classification for 20 iterations
for i in range(20):
    # Split the data into training and testing sets (adjust sizes as needed)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=i)

    # Initialize a K-Nearest Neighbors classifier with a specified number of neighbors (k)
    k = 5  # You can adjust the number of neighbors as needed
    knn_model = KNeighborsClassifier(n_neighbors=k)

    # Train the model on the training data
    knn_model.fit(X_train, y_train)

    # Make predictions on the test data
    predictions = knn_model.predict(X_test)

    # Calculate accuracy and append to the list
    accuracy = accuracy_score(y_test, predictions)
    accuracy_scores.append(accuracy)

    # You need to define your custom loss function and calculate loss
    # Here's an example using mean squared error as a loss function
    # Replace this with your specific loss function
    loss = ((y_test - predictions) ** 2).mean()
    losses.append(loss)

# Print the accuracy scores and losses for each iteration
for i, (score, loss) in enumerate(zip(accuracy_scores, losses)):
    print(f'Iteration {i + 1} - Accuracy: {score:.2%} - Loss: {loss:.4f}')


Iteration 1 - Accuracy: 83.50% - Loss: 0.1650
Iteration 2 - Accuracy: 83.43% - Loss: 0.1657
Iteration 3 - Accuracy: 83.48% - Loss: 0.1652
Iteration 4 - Accuracy: 83.90% - Loss: 0.1610
Iteration 5 - Accuracy: 83.71% - Loss: 0.1629
Iteration 6 - Accuracy: 83.85% - Loss: 0.1615
Iteration 7 - Accuracy: 83.72% - Loss: 0.1628
Iteration 8 - Accuracy: 84.33% - Loss: 0.1567
Iteration 9 - Accuracy: 83.05% - Loss: 0.1695
Iteration 10 - Accuracy: 83.61% - Loss: 0.1639
Iteration 11 - Accuracy: 83.77% - Loss: 0.1623
Iteration 12 - Accuracy: 83.66% - Loss: 0.1634
Iteration 13 - Accuracy: 83.99% - Loss: 0.1601
Iteration 14 - Accuracy: 83.74% - Loss: 0.1626
Iteration 15 - Accuracy: 83.43% - Loss: 0.1657
Iteration 16 - Accuracy: 83.78% - Loss: 0.1622
Iteration 17 - Accuracy: 83.81% - Loss: 0.1619
Iteration 18 - Accuracy: 83.82% - Loss: 0.1618
Iteration 19 - Accuracy: 83.64% - Loss: 0.1636
Iteration 20 - Accuracy: 83.35% - Loss: 0.1665
