In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split #splitting data in to training and testing
from sklearn.metrics import r2_score #to use mean squared error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor


In [22]:
data = pd.read_csv('advertising.csv')
data.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,12.0
3,151.5,41.3,58.5,16.5
4,180.8,10.8,58.4,17.9


In [23]:
data.shape
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   Radio      200 non-null    float64
 2   Newspaper  200 non-null    float64
 3   Sales      200 non-null    float64
dtypes: float64(4)
memory usage: 6.4 KB


In [24]:
#checking for missing values
data.isnull().sum()

TV           0
Radio        0
Newspaper    0
Sales        0
dtype: int64

In [25]:
#selecting x, y
x = data.iloc[:, :3] 
y = data.iloc[:, 3]

In [26]:
#checking for data unbalance by checking if this data set is having more 0's than 1's or vice versa
y.value_counts()

Sales
11.9    5
16.7    5
20.7    4
11.0    3
11.3    3
       ..
13.4    1
24.2    1
8.1     1
5.5     1
25.5    1
Name: count, Length: 121, dtype: int64

In [27]:
y_value = np.mean(y)
y_value

15.130500000000001

In [28]:
#if sales is greater than mean value then it is 1 else 0
y_bool_value= np.where(y > y_value, 1, 0)
y_bool_value

array([1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1])

In [55]:
#combining table x and y
Newdata = pd.concat([x, pd.DataFrame(y_bool_value, columns=['sales'])], axis=1)
Newdata.head()


Unnamed: 0,TV,Radio,Newspaper,sales
0,230.1,37.8,69.2,1
1,44.5,39.3,45.1,0
2,17.2,45.9,69.3,0
3,151.5,41.3,58.5,1
4,180.8,10.8,58.4,1


In [56]:
#selecting x, y
x = Newdata.iloc[:, :3] 
y = Newdata.iloc[:, 3]
y.value_counts()


sales
1    107
0     93
Name: count, dtype: int64

In [57]:
#since there is data unbalance we need to balance the data set
#balancing the data set
#Because the data set is unbalanced(acording to above code), we need to balance the data set add random samples to the data set
from imblearn.over_sampling import SMOTE, ADASYN
smt = SMOTE()
x1, y1 = smt.fit_resample(x,y)
#again check balanced new y data which is y1
y1.value_counts()


sales
1    107
0    107
Name: count, dtype: int64

# Train the data set


In [62]:
#split the data into training and testing data
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2, random_state=100)
knn1 = KNeighborsClassifier(n_neighbors=5)#using KNN classifier
knn1.fit(x_train, y_train) #fitting the model with training data set 
#predicting the model with testing data set

In [63]:
y_pred = knn1.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred) #checking the accuracy of the model

0.95

In [64]:
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
para = {'n_neighbors': [1,2,3,4,5,6,7,8,9,10]} #assigning 10 values to n_neighbors for knn evaluation
model = KNeighborsClassifier() #assigning KNeighborsClassifier to model
cvals = KFold(n_splits=10) #applying crossvalidation with 10 splits
gsearch = GridSearchCV(model, para, cv=cvals) #cv for cross validation and cvals for the values
result = gsearch.fit(x_train, y_train) #fitting the model

# Doing Prediction

In [65]:
correct_sum = []
for i in range(1,20):
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    correct = np.sum(y_pred == y_test)
    correct_sum.append(correct)

print(correct_sum)

[37, 35, 37, 37, 38, 38, 38, 38, 39, 38, 38, 38, 38, 38, 38, 38, 38, 37, 38]
