## Project – Predictive Analysis using scikit-learn
### By Kimesha Josephs

In [13]:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np

In [7]:
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data',
                   sep=',', header=None, usecols=[0,5,22], names=["Mushroom_Type","Odor","Habitat"])
data.head()

Unnamed: 0,Mushroom_Type,Odor,Habitat
0,p,p,u
1,e,a,g
2,e,l,m
3,p,p,u
4,e,n,g


In [8]:
data.replace(to_replace={'Mushroom_Type':{'p': 0, 'e': 1}}, inplace = True)
data.replace(to_replace={'Odor':{'a':0, 'l':1, 'c':2, 'y':3, 'f':4, 'm':5, 'n':6, 'p':7, 's':8}}, inplace = True)
data.replace(to_replace={'Habitat':{'g':0, 'l':1, 'm':2, 'p':3, 'u':4, 'w':5, 'd':6}}, inplace = True)
data.head()

Unnamed: 0,Mushroom_Type,Odor,Habitat
0,0,7,4
1,1,0,0
2,1,1,2
3,0,7,4
4,1,6,0


In [9]:
H = pd.Series(data['Habitat'])
f = pd.get_dummies(H)

O = pd.Series(data['Odor'])
g = pd.get_dummies(O)

In [10]:
new_data = pd.concat([f, g, data['Mushroom_Type']], axis=1)
cols = list(new_data.iloc[:, :-1])

In [14]:
X = new_data.iloc[:, :-1].values
y = new_data.iloc[:, 1].values

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(6093, 16)
(6093,)
(2031, 16)
(2031,)


In [16]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [17]:
print(linreg.intercept_)
print(linreg.coef_)

0.851234464981
[-0.84337333  0.15662667 -0.84337333 -0.84337333 -0.84337333 -0.84337333
 -0.84337333 -0.00786114 -0.00786114 -0.00786114 -0.00786114 -0.00786114
 -0.00786114 -0.00786114 -0.00786114 -0.00786114]


In [18]:
y_pred = linreg.predict(X_test)

In [19]:
true = [1, 0]
pred = [1, 0]

print(metrics.mean_absolute_error(true, pred))
print(metrics.mean_squared_error(true, pred))
print(np.sqrt(metrics.mean_squared_error(true, pred)))

0.0
0.0
0.0


In [20]:
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

2.52468016812e-15


In [21]:
X = new_data.iloc[:, 11:-1].values
y = new_data.iloc[:, 1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

0.302833453446


In [22]:
X = new_data.iloc[:, 1:10].values
y = new_data.iloc[:, 1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

3.15186809319e-15


### Based on this analysis the use of odor and not the habitat is best to determine the mushroom type, whether it is poisonous or edible.