In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('advertising.csv')
df.head(1)

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1


In [3]:
X = df.drop(columns='Sales',axis=1)
y = df['Sales']

In [4]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=3)

In [5]:
X_train

Unnamed: 0,TV,Radio,Newspaper
156,93.9,43.5,50.5
115,75.1,35.0,52.7
155,4.1,11.6,5.7
15,195.4,47.7,52.9
61,261.3,42.7,54.7
...,...,...,...
0,230.1,37.8,69.2
184,253.8,21.3,30.0
131,265.2,2.9,43.0
152,197.6,23.3,14.2


In [6]:
X_test

Unnamed: 0,TV,Radio,Newspaper
40,202.5,22.3,31.6
51,100.4,9.6,3.6
139,184.9,43.9,1.7
197,177.0,9.3,6.4
170,50.0,11.6,18.4
82,75.3,20.3,32.5
183,287.6,43.0,71.8
46,89.7,9.9,35.7
70,199.1,30.6,38.7
100,222.4,4.3,49.8


In [7]:
def KnnReg(X_train, X_test, y_train, y_test, k=3):
    # Combine training and test data
    train_data = pd.concat([X_train, y_train],axis=1)
    test_data = pd.concat([X_test, y_test],axis=1)

    # Calculate Euclidean distances
    diss = []
    for k in range(test_data.shape[0]):
        dis = []
        for i in range(train_data.shape[0]):
            sum_=0
            for j in range(train_data.shape[1]-1):
                sum_ += (train_data.iloc[i,j] - test_data.iloc[k,j])**2
            dist = sum_**0.5
            dis.append(dist)
        diss.append(dis)

    # Make predictions
    predictions = []
    for i in range(len(diss)):
        dis = diss[i]
        ind = np.argsort(dis)[:k]
        y_labels = [train_data.iloc[j, -1] for j in ind]
        y_hat = np.mean(y_labels)
        predictions.append(y_hat)

    # Add predictions to X_test
    X_test['y_pred']=predictions

    return X_test

In [8]:
from sklearn.metrics import mean_absolute_percentage_error

In [9]:
X_test

Unnamed: 0,TV,Radio,Newspaper
40,202.5,22.3,31.6
51,100.4,9.6,3.6
139,184.9,43.9,1.7
197,177.0,9.3,6.4
170,50.0,11.6,18.4
82,75.3,20.3,32.5
183,287.6,43.0,71.8
46,89.7,9.9,35.7
70,199.1,30.6,38.7
100,222.4,4.3,49.8


In [10]:
KnnReg(X_train, X_test, y_train, y_test,3)

Unnamed: 0,TV,Radio,Newspaper,y_pred
40,202.5,22.3,31.6,19.038462
51,100.4,9.6,3.6,12.187179
139,184.9,43.9,1.7,18.307692
197,177.0,9.3,6.4,16.753846
170,50.0,11.6,18.4,9.051282
82,75.3,20.3,32.5,11.776923
183,287.6,43.0,71.8,20.617949
46,89.7,9.9,35.7,12.225641
70,199.1,30.6,38.7,19.033333
100,222.4,4.3,49.8,19.112821


In [11]:
mean_absolute_percentage_error(y_test,X_test['y_pred'])

0.11356164321925352

In [14]:
def CV(df,cv,k=3):
    import pandas as pd
    import numpy as np

    # Assuming you have already defined KnnReg function

    # Sample data frame
    shuffle = df.sample(frac=1,random_state=42).reset_index(drop=True)
    cv = cv
    train_mape_list = []
    test_mape_list = []

    for i in range(cv):
        start = i * len(shuffle) // cv
        end = (i + 1) * len(shuffle) // cv
        train = pd.concat([shuffle.iloc[:start, :-1], shuffle.iloc[end:, :-1]])
        test = df.iloc[start:end, :-1]
        train_y = pd.concat([shuffle.iloc[:start, -1], shuffle.iloc[end:, -1]])
        test_y = df.iloc[start:end, -1]

        # Calculate MAPE
        test_new = KnnReg(train, test, train_y, test_y, 3)
        absolute_percentage_errors = np.abs((test_y - test_new['y_pred']) / test_y)
        mape = np.mean(absolute_percentage_errors)
        test_mape_list.append(mape)

    # Now test_mape_list contains the MAPE for each fold.
    return test_mape_list

In [15]:
CV(df,6)

[0.1190689778476494,
 0.10948419319936363,
 0.10474722687790407,
 0.24268105139175686,
 0.15411749050436035,
 0.12033841178769995]