In [13]:
import pandas as pd
import numpy as np 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn import metrics   
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split
import warnings  
# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Load the mushroom data set from CSV and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists.
df = pd.read_csv(filepath_or_buffer='C:/dataflights/mushroom_data/mushroom_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,class,odor,cap-color
0,0,2,17,21
1,1,1,11,30
2,2,1,12,29
3,3,2,17,29
4,4,1,10,24


### Preparing X and Y using Pandas

In [14]:
# Each single predictor column is reshaped into a 2-D (n_rows, 1) array,
# one matrix per candidate feature.
X_odor = df['odor'].to_numpy().reshape(-1, 1)
X_cap_color = df['cap-color'].to_numpy().reshape(-1, 1)

# Target column, kept as a pandas Series.
y = df['class']

### Cross-validation for parameter tuning for odor

In [15]:
# Candidate neighbourhood sizes: k = 1 .. 30.
k_range = list(range(1, 31))
param_grid = {'n_neighbors': k_range}

# Base estimator; the grid search substitutes each candidate n_neighbors.
knn = KNeighborsClassifier(n_neighbors=5)

# Search over k using 10-fold cross-validated accuracy on the odor feature.
grid = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='accuracy', cv=10)
grid.fit(X_odor, y)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30]},
             scoring='accuracy')

In [16]:
# Best cross-validated accuracy, the parameters that achieved it, and the
# corresponding estimator, one per line.
print(grid.best_score_, grid.best_params_, grid.best_estimator_, sep='\n')

0.9852239470670568
{'n_neighbors': 1}
KNeighborsClassifier(n_neighbors=1)


### Cross-validation for parameter tuning for cap-color

In [17]:
# Same search as for odor, now on the cap-color feature: k = 1 .. 30.
k_range = list(range(1, 31))
param_grid = {'n_neighbors': k_range}

# Uses the `knn` estimator already present in the session; the grid search
# substitutes each candidate n_neighbors.
grid = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='accuracy', cv=10)
grid.fit(X_cap_color, y)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30]},
             scoring='accuracy')

In [18]:
# Best cross-validated accuracy, the parameters that achieved it, and the
# corresponding estimator, one per line.
print(grid.best_score_, grid.best_params_, grid.best_estimator_, sep='\n')

0.5280583680221039
{'n_neighbors': 27}
KNeighborsClassifier(n_neighbors=27)


### Cross-validation model selection for odor

In [19]:
# Mean 10-fold cross-validated accuracy of KNN on odor, with n_neighbors=1
# (the value reported as best by the grid search above).
knn = KNeighborsClassifier(n_neighbors=1)
print(np.mean(cross_val_score(estimator=knn, X=X_odor, y=y, scoring='accuracy', cv=10)))

0.9852239470670568


In [20]:
# Mean 10-fold cross-validated accuracy of logistic regression on odor.
logreg = LogisticRegression(solver='liblinear')
print(np.mean(cross_val_score(estimator=logreg, X=X_odor, y=y, scoring='accuracy', cv=10)))

0.9852239470670568


In [21]:
# Linear regression on odor: fit on a train split, predict the held-out split,
# and report the root mean squared error (RMSE).
# NOTE(review): `y` appears to hold class labels; RMSE from a regressor is not
# directly comparable to the accuracy scores above - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X_odor, y, random_state=1)

# fit() returns the fitted estimator, so construction and fitting can be chained.
linreg = LinearRegression().fit(X_train, y_train)
y_pred = linreg.predict(X_test)

print(np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)))

0.23272104538619684


### Cross-validation model selection for cap-color

In [22]:
# Mean 10-fold cross-validated accuracy of KNN on cap-color, with
# n_neighbors=27 (the value reported as best by the grid search above).
knn = KNeighborsClassifier(n_neighbors=27)
print(np.mean(cross_val_score(estimator=knn, X=X_cap_color, y=y, scoring='accuracy', cv=10)))

0.5280583680221039


In [23]:
# Mean 10-fold cross-validated accuracy of logistic regression on cap-color.
logreg = LogisticRegression(solver='liblinear')
print(np.mean(cross_val_score(estimator=logreg, X=X_cap_color, y=y, scoring='accuracy', cv=10)))

0.5679922624349396


In [24]:
# Linear regression on cap-color: fit on a train split, predict the held-out
# split, and report the root mean squared error (RMSE).
# NOTE(review): `y` appears to hold class labels; RMSE from a regressor is not
# directly comparable to the accuracy scores above - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X_cap_color, y, random_state=1)

# fit() returns the fitted estimator, so construction and fitting can be chained.
linreg = LinearRegression().fit(X_train, y_train)
y_pred = linreg.predict(X_test)

print(np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)))

0.4994488239833032


## Conclusion 
- To determine which of the two predictor columns (odor and cap-color) more accurately predicts whether or not a mushroom is poisonous, I ran several models on each column and compared their results. Odor is the better predictor: its KNN and logistic regression models both reached a 10-fold cross-validated accuracy of about 0.98, while the KNN and logistic regression models for cap-color scored only 0.53 and 0.57, respectively. I also fit a linear regression on each column to obtain the Root Mean Squared Error (RMSE); odor has the lower RMSE (about 0.23 versus 0.50 for cap-color), which again indicates that it is more accurate, since error is something we want to minimize. A recommendation for further analysis: if a mushroom must be classified by cap-color, use logistic regression rather than KNN, as it had the higher accuracy score of the two cap-color models.