In [31]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from pydataset import data
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
# Print pydataset's bundled documentation page for the 'voteincome' data set.
data('voteincome',show_doc=True)

voteincome

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Sample Turnout and Demographic Data from the 2000 Current Population Survey

### Description

This data set contains turnout and demographic data from a sample of
respondents to the 2000 Current Population Survey (CPS). The states
represented are South Carolina and Arkansas. The data represent only a sample
and results from this example should not be used in publication.

### Usage

    data(voteincome)

### Format

A data frame containing 7 variables ("state", "year", "vote", "income",
"education", "age", "female") and 1500 observations.

`state`

a factor variable with levels equal to "AR" (Arkansas) and "SC" (South
Carolina)

`year`

an integer vector

`vote`

an integer vector taking on values "1" (Voted) and "0" (Did Not Vote)

`income`

an integer vector ranging from "4" (Less than \$5000) to "17" (Greater than
\$75000) denoting family income. See the CPS codebook for more info

In [14]:
# Load the 'voteincome' data set through pydataset's data() helper.
df = data('voteincome')

In [15]:
# Peek at the first two rows to check the column layout.
df.head(2)

Unnamed: 0,state,year,vote,income,education,age,female
1,AR,2000,1,9,2,73,0
2,AR,2000,1,11,2,24,0


In [16]:
# Drop the columns that are not used as predictors further down.
# errors='ignore' keeps this cell idempotent: because it rebinds `df`, running it
# a second time would otherwise raise KeyError once the columns are already gone.
df = df.drop(columns=['state','year'], errors='ignore')

In [17]:
# Confirm which columns remain after the drop.
df.head()

Unnamed: 0,vote,income,education,age,female
1,1,9,2,73,0
2,1,11,2,24,0
3,0,12,2,24,1
4,1,16,4,40,0
5,1,10,4,85,1


In [7]:
# (rows, columns) of the trimmed frame.
df.shape

(1500, 5)

In [34]:
# Summary statistics per column. `vote` is coded 0/1, so its mean is the share of
# respondents coded as having voted -- i.e. the accuracy of always predicting 1
# (about 0.855 in the recorded output), a useful baseline for the scores below.
df.describe()

Unnamed: 0,vote,income,education,age,female
count,1500.0,1500.0,1500.0,1500.0,1500.0
mean,0.855333,12.464,2.651333,49.261333,0.559333
std,0.351882,3.915643,1.021009,17.471134,0.496633
min,0.0,4.0,1.0,18.0,0.0
25%,1.0,9.0,2.0,36.0,0.0
50%,1.0,13.0,3.0,49.0,1.0
75%,1.0,16.0,4.0,62.0,1.0
max,1.0,17.0,4.0,85.0,1.0


In [18]:
# Predictors are every column except `vote`; the target is the `vote` column itself.
X, y = df.drop(columns=['vote']), df['vote']

In [67]:
# Hold out 20% of the rows for testing, stratified on the target so both splits
# keep the same voted / did-not-vote ratio.
# A fixed random_state makes the split -- and therefore every score computed from
# it -- reproducible on Restart & Run All. The outputs recorded in this notebook
# were produced by an unseeded split and will not match exactly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, stratify=y, random_state=42
)

In [68]:
# k-nearest-neighbours classifier with k=2 and uniform (unweighted) votes.
# With an even k the two neighbours can disagree, which is why the predict_proba
# output further down contains 0.5 / 0.5 rows.
knn = KNeighborsClassifier(n_neighbors=2,weights='uniform')

In [69]:
# Fit on the training split only; the test split is kept for scoring.
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                     weights='uniform')

In [70]:
# In-sample predictions: these are for the TRAINING rows, not the held-out test rows.
y_pred = knn.predict(X_train)
y_pred

array([1, 0, 1, ..., 1, 1, 1])

In [71]:
# Per-class probability estimates for the same training rows (one row per sample).
y_pred_prob = knn.predict_proba(X_train)
y_pred_prob

array([[0. , 1. ],
       [0.5, 0.5],
       [0. , 1. ],
       ...,
       [0. , 1. ],
       [0. , 1. ],
       [0. , 1. ]])

In [72]:
# Accuracy on the training split (optimistic: the model was fitted on these rows).
knn.score(X_train,y_train)

0.9216666666666666

In [77]:
# Accuracy on the held-out test split.
knn.score(X_test,y_test)

0.8066666666666666

In [79]:
# Compare train/test accuracy for several neighbourhood sizes.
# Each candidate is bound to its own name (`knn_n`) so that the fitted k=2 model
# in `knn` is NOT overwritten: the cells further down call knn.predict / knn.score
# and would otherwise silently evaluate the last model of this loop (n=4) when the
# notebook is run top to bottom.
for n in [1,2,3,4]:
    knn_n = KNeighborsClassifier(n_neighbors=n)
    knn_n.fit(X_train,y_train)
    print("n: ",n)
    print("train score: ", knn_n.score(X_train,y_train))
    print("test score: ",knn_n.score(X_test,y_test))

n:  1
train score:  0.9758333333333333
test score:  0.8966666666666666
n:  2
train score:  0.9216666666666666
test score:  0.8066666666666666
n:  3
train score:  0.9141666666666667
test score:  0.8066666666666666
n:  4
train score:  0.8833333333333333
test score:  0.79


In [73]:
# Confusion matrix for the training-set predictions held in `y_pred`
# (rows: true class, columns: predicted class).
confusion_matrix(y_train,y_pred)

array([[173,   1],
       [ 93, 933]])

In [74]:
# Per-class precision / recall / F1 for the same training-set predictions.
print(classification_report(y_train,y_pred))

              precision    recall  f1-score   support

           0       0.65      0.99      0.79       174
           1       1.00      0.91      0.95      1026

    accuracy                           0.92      1200
   macro avg       0.82      0.95      0.87      1200
weighted avg       0.95      0.92      0.93      1200



In [75]:
# Predictions for the held-out test rows.
# NOTE(review): this rebinds `y_pred`, which until now held the training-set
# predictions; re-running the confusion-matrix / report cells after this one would
# pair test predictions with y_train. Consider a separate name -- confirm no later
# cell relies on `y_pred` being the test predictions first.
y_pred = knn.predict(X_test)

In [76]:
# Accuracy on the held-out test split (same call as the earlier test-score cell).
knn.score(X_test,y_test)

0.8066666666666666

In [80]:
# Show the scikit-learn version in use, for provenance of the recorded outputs.
# NOTE(review): this binds a bare `__version__` name in the notebook namespace.
from sklearn import __version__
__version__

'0.21.2'

In [83]:
import pyspark.sql.functions as F
import pyspark

In [85]:
# Obtain a SparkSession: reuse the active one if present, otherwise start a new one.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [86]:
# Smoke test that the session works: build a single `id` column holding 0..9 and print it.
spark.range(10).show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+

