# K Nearest Neighbors

We will be using customer churn data from the telecom industry for this week's exercises. The data file is called Orange_Telecom_Churn_Data.csv. We will load this data together, do some preprocessing, and use K-nearest neighbors to predict customer churn based on account characteristics.



In [None]:
import pandas as pd

# Read the raw churn dataset; the relative path means the CSV must sit in the
# notebook's working directory.
csv_path = 'Orange_Telecom_Churn_Data.csv'
data = pd.read_csv(csv_path)

In [None]:
# Show the first record transposed, so each column name appears on its own row.
data.iloc[:1].transpose()

In [None]:
# Discard the columns that are not used as features; the frame is modified
# in place, so re-running this cell raises a KeyError.
data.drop(columns=['state', 'area_code', 'phone_number'], inplace=True)

In [None]:
# Show the column labels currently present in the frame.
data.columns

Notice that some of the columns are categorical and some are floats. The categorical columns will need to be numerically encoded using one of the methods from the lecture.

In [None]:
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()

# Columns to replace with a numeric encoding. The binarizer is re-fit on each
# column in turn, so after the loop `lb` only remembers the classes of the
# last column processed.
# NOTE(review): presumably each of these columns holds exactly two distinct
# values -- confirm against the data.
binary_cols = ['intl_plan', 'voice_mail_plan', 'churned']
for col in binary_cols:
    data[col] = lb.fit_transform(data[col])

In [None]:
# Show the first five records transposed, to check the encoded columns.
data.head(5).transpose()

In [None]:
from sklearn.preprocessing import MinMaxScaler

msc = MinMaxScaler()

# fit_transform returns a bare NumPy array, so the column labels have to be
# re-attached when rebuilding the DataFrame.
# NOTE(review): the scaler is fit on every row and on the label column before
# any train/test split is made, so rows that are later held out influence the
# scaling ranges.
scaled_values = msc.fit_transform(data)
data = pd.DataFrame(scaled_values, columns=data.columns)

Separate the feature columns (everything except churned) from the label (churned). This will create two tables.

In [None]:
# Every column except the label is treated as a feature.
label_col = 'churned'
x_cols = [name for name in data.columns if name != label_col]

# Feature table and label series.
X_data = data[x_cols]
y_data = data[label_col]

# An equivalent approach is to copy the frame and pop the label column off
# the copy.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-neighbour classifier on the full dataset (fit returns the estimator
# itself, so construction and fitting can be chained).
knn = KNeighborsClassifier(n_neighbors=3).fit(X_data, y_data)

# Predictions are made for the same rows the model was fit on, so any score
# computed from them is a training score.
y_pred = knn.predict(X_data)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# accuracy_score is documented as (y_true, y_pred). Accuracy is symmetric, so
# the value is unchanged, but ground truth is passed first to match the API.
accuracy_score(y_data, y_pred)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Same 3-neighbour model, but with neighbours' votes weighted by distance.
knn1 = KNeighborsClassifier(n_neighbors=3, weights='distance').fit(X_data, y_data)

# Predicting the rows the model was fit on: each row is its own zero-distance
# neighbour, so the resulting score is expected to be close to 1.0.
y_pred1 = knn1.predict(X_data)

In [None]:
# accuracy_score is documented as (y_true, y_pred). Accuracy is symmetric, so
# the value is unchanged, but ground truth is passed first to match the API.
accuracy_score(y_data, y_pred1)

In [None]:

# Training accuracy for every neighbourhood size k = 1..20, keyed by k.
# The model is fit and scored on the same rows each time.
d = {}
for k in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_data, y_data)
    y_pred = knn.predict(X_data)
    d[k] = accuracy_score(y_data, y_pred)

In [None]:
# Display the k -> accuracy mapping.
d

In [None]:
    
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(d.keys(),d.values())

In [None]:

from sklearn.model_selection import train_test_split
# Hold out part of the data for evaluation (train_test_split's default
# proportions are used; no random_state is set, so the split differs from
# run to run).
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data)

# Report the shapes of all four pieces. The held-out pieces are named
# X_test / y_test; the former X_val / y_val names were never defined and
# raised NameError.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit the 3-neighbour classifier on the training split only.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predict the held-out rows, which the model has not been fit on.
y_pred = knn.predict(X_test)

In [None]:
# Held-out accuracy. accuracy_score is documented as (y_true, y_pred); the
# value is symmetric, but ground truth is passed first to match the API.
accuracy_score(y_test, y_pred)

In [None]:

from sklearn.metrics import confusion_matrix

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted classes. Passing the arguments the other way round
# returns the transposed matrix, swapping false positives and false negatives.
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Distance-weighted 3-neighbour classifier, fit on the training split only.
# This rebinds `knn` and `y_pred`, replacing the uniform-weight results.
knn = KNeighborsClassifier(n_neighbors=3, weights='distance').fit(X_train, y_train)

# Predict the held-out rows.
y_pred = knn.predict(X_test)

In [None]:
# Held-out accuracy. accuracy_score is documented as (y_true, y_pred); the
# value is symmetric, but ground truth is passed first to match the API.
accuracy_score(y_test, y_pred)

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted classes. Passing the arguments the other way round
# returns the transposed matrix, swapping false positives and false negatives.
confusion_matrix(y_test, y_pred)