In [76]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor 
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc

In [64]:
outlook = ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain','Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast',np.NaN,'Rain', 'Sunny', 'Sunny']
temparature = [26, 28, 29, 23,0, 12,np.nan, 25,18,20,20,21,26,24, 23,21]
humidity = ["High", "High", "High","High","Normal","Normal",np.NaN, "High", "Normal", "Normal", "Normal", "High", "Normal", "High", "Normal","Normal"]
wind = [np.NaN, "Strong", "Weak", "Weak", "Weak", "Strong", "Strong", "Weak", "Weak", "Weak", "Strong", "Strong", "Weak", "Strong", "Weak", "Weak"]
playTennis = ["No", "No", "Yes", "Yes", "Yes", "No", "Yes", "No", "Yes", "Yes",np.NAN, "Yes", "Yes", "No", "No", "Yes"]
day = ["D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9", "D10", "D11", "D12", "D13", "D14", "D15", "D16"]

df =  pd.DataFrame(list(zip(day,outlook,temparature,humidity,wind,playTennis)), columns=['Day', 'Outlook', 'Temparatur','Humidity','Wind','PlayTennis'])
df

Unnamed: 0,Day,Outlook,Temparatur,Humidity,Wind,PlayTennis
0,D1,Sunny,26.0,High,,No
1,D2,Sunny,28.0,High,Strong,No
2,D3,Overcast,29.0,High,Weak,Yes
3,D4,Rain,23.0,High,Weak,Yes
4,D5,Rain,0.0,Normal,Weak,Yes
5,D6,Rain,12.0,Normal,Strong,No
6,D7,Overcast,,,Strong,Yes
7,D8,Sunny,25.0,High,Weak,No
8,D9,Sunny,18.0,Normal,Weak,Yes
9,D10,Rain,20.0,Normal,Weak,Yes


In [65]:
df.dtypes

Day            object
Outlook        object
Temparatur    float64
Humidity       object
Wind           object
PlayTennis     object
dtype: object

In [66]:
# Finding number of missing values in each columns
df.isnull().sum()

Day           0
Outlook       1
Temparatur    1
Humidity      1
Wind          1
PlayTennis    1
dtype: int64

In [67]:
# Separating the data with missing values # can be optimized
test_total = df[df.isna().any(axis=1)]
test_total

Unnamed: 0,Day,Outlook,Temparatur,Humidity,Wind,PlayTennis
0,D1,Sunny,26.0,High,,No
6,D7,Overcast,,,Strong,Yes
10,D11,Sunny,20.0,Normal,Strong,
12,D13,,26.0,Normal,Weak,Yes


In [68]:
df = df.set_index('Day')

In [69]:
df = df.dropna()
df

Unnamed: 0_level_0,Outlook,Temparatur,Humidity,Wind,PlayTennis
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
D2,Sunny,28.0,High,Strong,No
D3,Overcast,29.0,High,Weak,Yes
D4,Rain,23.0,High,Weak,Yes
D5,Rain,0.0,Normal,Weak,Yes
D6,Rain,12.0,Normal,Strong,No
D8,Sunny,25.0,High,Weak,No
D9,Sunny,18.0,Normal,Weak,Yes
D10,Rain,20.0,Normal,Weak,Yes
D12,Overcast,21.0,High,Strong,Yes
D14,Rain,24.0,High,Strong,No


In [70]:
numeric_features=['Temparatur']
categorical_features = ['Outlook', 'Humidity', 'Wind', 'PlayTennis']

In [81]:
df_numeric = df.copy()
for i in categorical_features:
    #df[i] = df[i].astype('category')
    #df[i] = df[i].cat.codes
    df_numeric[i] = pd.factorize(df_numeric[i])[0] # NaN to -1
df_numeric

Unnamed: 0_level_0,Outlook,Temparatur,Humidity,Wind,PlayTennis
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
D2,0,28.0,0,0,0
D3,1,29.0,0,1,1
D4,2,23.0,0,1,1
D5,2,0.0,1,1,1
D6,2,12.0,1,0,0
D8,0,25.0,0,1,0
D9,0,18.0,1,1,1
D10,2,20.0,1,1,1
D12,1,21.0,0,0,1
D14,2,24.0,0,0,0


In [72]:
# for i in categorical_features:
#     df.loc[df[i] == -1, i] = np.NAN
# df          

In [82]:
#Normalize the features to range[0,1]
scaler = MinMaxScaler(feature_range=(0, 1))
df_knn = pd.DataFrame(scaler.fit_transform(df_numeric), columns=df_numeric.columns)
df_knn

Unnamed: 0,Outlook,Temparatur,Humidity,Wind,PlayTennis
0,0.0,0.965517,0.0,0.0,0.0
1,0.5,1.0,0.0,1.0,1.0
2,1.0,0.793103,0.0,1.0,1.0
3,1.0,0.0,1.0,1.0,1.0
4,1.0,0.413793,1.0,0.0,0.0
5,0.0,0.862069,0.0,1.0,0.0
6,0.0,0.62069,1.0,1.0,1.0
7,1.0,0.689655,1.0,1.0,1.0
8,0.5,0.724138,0.0,0.0,1.0
9,1.0,0.827586,0.0,0.0,0.0


In [83]:
# Creating a KNN model
y= df['Outlook']
X= df_knn.drop(['Outlook'], axis=1)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)


In [84]:
knn_outlook = KNeighborsClassifier(n_neighbors=3)
knn_outlook.fit(X, y)
pred = knn_outlook.predict(X)
pred 

array(['Sunny', 'Overcast', 'Overcast', 'Rain', 'Rain', 'Rain', 'Sunny',
       'Sunny', 'Rain', 'Sunny', 'Sunny', 'Sunny'], dtype=object)

In [85]:
print("Accuracy of the model ist ", acc(y, pred))

Accuracy of the model ist  0.5833333333333334


(    Temparatur  Humidity  Wind  PlayTennis
 9     0.827586       0.0   0.0         0.0
 7     0.689655       1.0   1.0         1.0
 8     0.724138       0.0   0.0         1.0
 11    0.724138       1.0   1.0         1.0
 4     0.413793       1.0   0.0         0.0
 10    0.793103       1.0   1.0         0.0
 2     0.793103       0.0   1.0         1.0
 0     0.965517       0.0   0.0         0.0,
 9     1.0
 7     1.0
 8     0.5
 11    0.0
 4     1.0
 10    0.0
 2     1.0
 0     0.0
 Name: Outlook, dtype: float64)

0     0.0
1     0.5
2     1.0
3     1.0
4     1.0
5     0.0
6     0.0
7     1.0
8     0.5
9     1.0
10    0.0
11    0.0
Name: Outlook, dtype: float64

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
7,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
8,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
9,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
