# k-Nearest Neighbors

1. Choose K (the number of nearest neighbors to consult).

2. Store the labeled training data (k-NN has no explicit model-fitting step).

3. For each new data point to classify:
   a. Compute the distance (e.g. Euclidean) from the point to every training point.
   b. Select the K training points that are closest to it.
   
4. Return the majority class among those K neighbors as the predicted label.


In [604]:
# import modules 
import pandas as pd
import numpy as np
%matplotlib inline

In [605]:
# Load the cleaned flight dataset and preview the first five rows.
data_path = '../cleaning-preprocessing/cleaned_flight_data_with_target.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,CRS_ELAPSED_TIME,DISTANCE,CRS_DEP_M,CRS_ARR_M,Temperature,Dew Point,Humidity,...,Condition_Light Snow / Windy,Condition_Mostly Cloudy,Condition_Mostly Cloudy / Windy,Condition_Partly Cloudy,Condition_Partly Cloudy / Windy,Condition_Rain,Condition_Rain / Windy,Condition_Snow,Condition_Wintry Mix,Condition_Wintry Mix / Windy
0,11,1,5,124,636,324,448,48,34,58,...,False,False,False,False,False,False,False,False,False,False
1,11,1,5,371,2475,340,531,48,34,58,...,False,False,False,False,False,False,False,False,False,False
2,11,1,5,181,1069,301,482,48,34,58,...,False,False,False,False,False,False,False,False,False,False
3,11,1,5,168,944,345,513,48,34,58,...,False,False,False,False,False,False,False,False,False,False
4,11,1,5,139,760,360,499,46,32,58,...,False,False,False,False,False,False,False,False,False,False


In [606]:
# Convert only the boolean (one-hot) columns to 0/1.
# A blanket df.astype(int) would also cast every other column: it truncates
# any float-valued feature and raises if a column contains NaN.
# NOTE(review): assumes the remaining columns are already numeric - confirm.
bool_cols = df.select_dtypes(include='bool').columns
df = df.astype({col: int for col in bool_cols})
df.head(5)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,CRS_ELAPSED_TIME,DISTANCE,CRS_DEP_M,CRS_ARR_M,Temperature,Dew Point,Humidity,...,Condition_Light Snow / Windy,Condition_Mostly Cloudy,Condition_Mostly Cloudy / Windy,Condition_Partly Cloudy,Condition_Partly Cloudy / Windy,Condition_Rain,Condition_Rain / Windy,Condition_Snow,Condition_Wintry Mix,Condition_Wintry Mix / Windy
0,11,1,5,124,636,324,448,48,34,58,...,0,0,0,0,0,0,0,0,0,0
1,11,1,5,371,2475,340,531,48,34,58,...,0,0,0,0,0,0,0,0,0,0
2,11,1,5,181,1069,301,482,48,34,58,...,0,0,0,0,0,0,0,0,0,0
3,11,1,5,168,944,345,513,48,34,58,...,0,0,0,0,0,0,0,0,0,0
4,11,1,5,139,760,360,499,46,32,58,...,0,0,0,0,0,0,0,0,0,0


In [607]:
# Separate the target column from the predictors.
y = df.loc[:, 'DEP_DELAY']          # target variable
X = df.drop('DEP_DELAY', axis=1)    # feature matrix (every other column)

### Feature Selection - Chi-Square Method

In [608]:
# The chi-square test needs non-negative features, so clamp negatives to 0.
# DataFrame.clip is vectorised and replaces the element-wise (and deprecated)
# DataFrame.applymap call; NaN values pass through unchanged, exactly as
# max(x, 0) left them, so the imputation below still sees them.
X = X.clip(lower=0)

# Impute any missing values with the column mean
# (median or mode are alternative strategies).
X = X.fillna(X.mean())

  X = X.applymap(lambda x: max(x, 0))  # Replace negative values with 0 (if needed)


In [609]:
from sklearn.feature_selection import chi2, SelectKBest

# Score every feature against the target with the chi-square test and keep
# the 15 highest-scoring ones.
# NOTE(review): the selector is fitted on the full dataset before the
# train/test split, so test rows influence feature selection - confirm
# whether this is acceptable.
chi2_selector = SelectKBest(chi2, k=15)
chi2_selector.fit(X, y)
X_selected = chi2_selector.transform(X)

# Map the positions of the retained columns back to their names.
kept_positions = chi2_selector.get_support(indices=True)
selected_features = X.columns[kept_positions]
print("Selected Features:", list(selected_features))

Selected Features: ['MONTH', 'DAY_OF_MONTH', 'CRS_ELAPSED_TIME', 'DISTANCE', 'CRS_DEP_M', 'CRS_ARR_M', 'Humidity', 'Wind Speed', 'Wind Gust', 'sch_dep', 'OP_UNIQUE_CARRIER_B6', 'DEST_LAX', 'Condition_Heavy Rain', 'Condition_Light Snow / Windy', 'Condition_Rain']


In [610]:
# Collect the chi-square score of every feature, labelled by column name.
chi2_scores = pd.Series(data=chi2_selector.scores_, index=X.columns)

In [611]:
# Rank the features by chi-square score, highest first.
# Sort once and print the stored result instead of sorting a second time.
sorted_chi2_scores = chi2_scores.sort_values(ascending=False)

print(sorted_chi2_scores)

CRS_DEP_M           49170.189663
DISTANCE            17937.129663
CRS_ARR_M            6374.272257
CRS_ELAPSED_TIME     1920.355634
Wind Gust            1336.043846
                        ...     
DEST_PIT                0.031091
DEST_CVG                0.016443
DEST_RDU                0.005127
DEST_SAV                0.004111
DEST_SJC                0.001949
Length: 110, dtype: float64


In [612]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing, using only the selected features.
# A fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X.loc[:, selected_features],
    y,
    test_size=0.25,
    random_state=0,
)

In [613]:
# Display the training feature matrix (selected features only) for inspection.
X_train

Unnamed: 0,MONTH,DAY_OF_MONTH,CRS_ELAPSED_TIME,DISTANCE,CRS_DEP_M,CRS_ARR_M,Humidity,Wind Speed,Wind Gust,sch_dep,OP_UNIQUE_CARRIER_B6,DEST_LAX,Condition_Heavy Rain,Condition_Light Snow / Windy,Condition_Rain
5823,11,18,382,2422,1077,1279,10,9,18,35,1,0,0,0,0
897,11,3,83,301,1375,18,48,12,0,21,1,0,0,0,0
2739,11,9,211,1182,899,1050,57,10,0,26,1,0,0,0,0
6162,11,19,404,2586,1035,1259,68,12,0,30,0,0,0,0,0
2503,11,8,157,760,1125,1282,44,22,30,39,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,12,13,205,1029,420,565,79,5,0,19,0,0,0,0,0
19648,1,3,146,636,525,671,10,8,0,54,0,0,0,0,0
9845,12,1,232,1623,515,807,61,13,0,53,0,0,0,0,0
10799,12,5,95,290,580,675,67,15,0,32,0,0,0,0,0


In [614]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test set stays unseen.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [615]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbour classifier; Minkowski distance with p=2 is the
# Euclidean distance.
knn_settings = {'n_neighbors': 3, 'metric': 'minkowski', 'p': 2}
classifier = KNeighborsClassifier(**knn_settings)
classifier.fit(X_train, y_train)

In [616]:
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)

# Predict labels for the held-out test set.
y_pred = classifier.predict(X_test)

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Per-class precision / recall / F1 plus the overall accuracy.
print(classification_report(y_true=y_test, y_pred=y_pred))


[[5976  262]
 [ 782  185]]
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      6238
           1       0.41      0.19      0.26       967

    accuracy                           0.86      7205
   macro avg       0.65      0.57      0.59      7205
weighted avg       0.82      0.86      0.83      7205



### Hyperparameter Tuning

In [617]:
# from sklearn.model_selection import GridSearchCV

# # Set parameter to be searched in a range
# params = {'n_neighbors': range(2,25)}

# # Initiate the KNN model and GridSearchCV function
# knn = KNeighborsClassifier()
# grid_knn = GridSearchCV(estimator=knn, param_grid=params,
#                         scoring='accuracy', cv=5)

# # Fit the function to train set
# grid_knn.fit(X_train, y_train)

# # Find the best parameter and see how well it performs on test set
# print(grid_knn.best_params_)
# print(grid_knn.score(X_test, y_test))
# print(grid_knn.cv_results_['mean_test_score'])