In [None]:
#importing libraries
import numpy as np
import pandas as pd

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides warnings raised by the libraries used below
# (e.g. deprecation or convergence notices) - confirm that is intended.
warnings.filterwarnings("ignore")
     

In [None]:
!pip install chart_studio

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import chart_studio.plotly as py
import cufflinks as cf
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, plot, init_notebook_mode, iplot
# Prepare the notebook for offline Plotly rendering, then put cufflinks
# into its offline mode as well.
init_notebook_mode(connected=True)# initiate notebook for offline plot
cf.go_offline()

from sklearn import preprocessing
from sklearn import pipeline
from sklearn import model_selection

In [None]:
# Read the tab-delimited fruits file into a DataFrame and preview it.
# NOTE(review): the path lives under /content/drive, but no Drive mount call
# is visible in this notebook - confirm it is mounted before running.
fruits_path = "/content/drive/MyDrive/Artifical intelligence/fruits.txt"
fruits_df = pd.read_csv(fruits_path, sep="\t")
fruits_df.head()

In [None]:
# (rows, columns) of the loaded dataset
fruits_df.shape

In [None]:
# Count the missing entries in each column
fruits_df.isnull().sum()

In [None]:
# Scatter plot: mass on x, height/width ratio on y, one marker per row.
# Markers are coloured by fruit_label and show fruit_name on hover.
aspect_ratio = fruits_df["height"] / fruits_df["width"]
marker_style = dict(size=12, color=fruits_df["fruit_label"])
trace = go.Scatter(
    x=fruits_df["mass"],
    y=aspect_ratio,
    mode="markers",
    hovertext=fruits_df["fruit_name"],
    marker=marker_style,
)

fig = go.Figure(data=[trace])
fig.update_layout(title="Fruits Classification based on Dimensions")
fig.update_xaxes(title="Mass")
fig.update_yaxes(title="Height/Width")

iplot(fig)

In [None]:
from matplotlib import style

# Box plots for the columns other than fruit_label, each on its own axis
# (2x2 grid, independent y scales), drawn with the fivethirtyeight theme.
style.use("fivethirtyeight")
features_only = fruits_df.drop(columns=["fruit_label"])
features_only.plot.box(subplots=True, layout=(2, 2), sharey=False, figsize=(8, 10))
plt.show()

In [None]:
# Number of rows per fruit name (shows how balanced the classes are)
fruits_df["fruit_name"].value_counts()

In [None]:
from pandas.plotting import scatter_matrix

# Grid of pairwise scatter plots for the four measurement columns
atts = ["mass", "width", "height", "color_score"]
scatter_matrix(fruits_df.loc[:, atts], figsize=(12, 12))

In [None]:
# Correlation of every numeric column with mass.
# Numeric columns are selected explicitly: fruit_name / fruit_subtype appear
# to be text columns, and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 instead of silently skipping it.
fruits_corr = fruits_df.select_dtypes(include="number").corr()
fruits_corr["mass"]

In [None]:
# Correlation of every numeric column with the class label.
# Numeric columns are selected explicitly so that corr() does not fail on
# the text columns under pandas >= 2.0.
fruits_corr = fruits_df.select_dtypes(include="number").corr()
fruits_corr["fruit_label"]

In [None]:
# Summary statistics of the dataset before any scaling
fruits_df.describe()

In [None]:
#scaling
# Scaler instance; it is fitted on the feature columns in the next cell.
minmax_scaler = preprocessing.MinMaxScaler()

In [None]:
# Fit the scaler on the feature columns (label and name columns removed)
# and transform them in one step.
# NOTE(review): the scaler is fitted on every row before the train/test
# split performed later, so test rows influence the scaling parameters.
non_feature_cols = ["fruit_label", "fruit_name", "fruit_subtype"]
feature_frame = fruits_df.drop(columns=non_feature_cols)
fruits_minmax_scaled = minmax_scaler.fit_transform(feature_frame)

In [None]:
# Wrap the scaled array back into a DataFrame.
# Column names are derived by removing the same three columns that were
# dropped before scaling, rather than slicing by position, so a change in
# column order cannot silently mislabel the features. The original index is
# kept so rows stay traceable to fruits_df.
feature_columns = fruits_df.columns.drop(["fruit_label", "fruit_name", "fruit_subtype"])
fruits_df_scaled = pd.DataFrame(
    fruits_minmax_scaled,
    columns=feature_columns,
    index=fruits_df.index,
)
fruits_df_scaled.head()

In [None]:
# Summary statistics of the scaled features
fruits_df_scaled.describe()

In [None]:
#Splitting of Data into Training And Testing Sets
# 80/20 split of the scaled features and the fruit_label target.
# random_state pins the shuffle so the split - and every accuracy, error
# curve and confusion matrix computed from it - is the same on each run
# (42 matches the seed used for the SGD and random-forest models).
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    fruits_df_scaled,
    fruits_df["fruit_label"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the training and test feature sets, one per line
print(X_train.shape, X_test.shape, sep="\n")

In [None]:
#Fitting and Training Logistic Regression and K-Nearest Neighbour models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Default logistic regression: fit on the training split, then report the
# score on the test split.
logis_reg = LogisticRegression().fit(X_train, y_train)
accuracy = logis_reg.score(X_test, y_test)
print(accuracy)

In [None]:
# Default K-nearest-neighbours classifier: fit on the training split, then
# report the score on the test split.
clf = KNeighborsClassifier().fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)

In [None]:
# Test-set error rate of KNN for each n_neighbors value from 1 to 19
# (range(1, 20) excludes 20).
error_rate = []
for k in range(1, 20):
    clf = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    pred_k = clf.predict(X_test)
    # Fraction of test samples whose prediction differs from the true label
    error_rate.append(np.mean(pred_k != y_test))

In [None]:
# Error rates collected by the KNN loop, in order of increasing n_neighbors
print(error_rate)

In [None]:
# Plot the KNN test error against K.
# The x values are derived from the number of collected error rates (K starts
# at 1 in the loop), so the plot cannot fall out of sync with the loop's range
# and fail with a length mismatch.
k_values = range(1, len(error_rate) + 1)
plt.plot(k_values, error_rate)
plt.title("Error Rate vs. K Value")
plt.xlabel("K")
plt.ylabel("Error Rate")
plt.show()


In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
# SGD-trained classifier with default settings; seed fixed for repeatable fits
sgd_clf = SGDClassifier(random_state=42)

In [None]:
# Train the SGD classifier on the training split
sgd_clf.fit(X_train, y_train)

In [None]:
# Score of the SGD classifier on the test split
sgd_accuracy = sgd_clf.score(X_test, y_test)
print(sgd_accuracy)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default settings; seed fixed for repeatable fits
rfc = RandomForestClassifier(random_state=42)

In [None]:
# Train the random forest on the training split
rfc.fit(X_train, y_train)

In [None]:
# Score of the random forest on the test split
rfc_accuracy = rfc.score(X_test, y_test)
print(rfc_accuracy)

In [None]:
# Number of test samples per class label
y_test.value_counts()

In [None]:
# Random-forest predictions for the test split; these feed the confusion
# matrix and the precision/recall scores computed in later cells.
y_test_prediction = rfc.predict(X_test)
print(y_test_prediction)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the random-forest predictions (rows: true labels,
# columns: predicted labels).
# The full, sorted set of labels from the dataset is passed explicitly so the
# matrix always has one row/column per class, in a known order, even when a
# class happens to be absent from the test split.
all_labels = sorted(fruits_df["fruit_label"].unique())
conf_mat_rfc = confusion_matrix(y_test, y_test_prediction, labels=all_labels)
print(conf_mat_rfc)

In [None]:
# Render the confusion matrix as a grayscale image
plt.matshow(conf_mat_rfc, cmap="gray")
plt.show()

In [None]:
from sklearn import metrics

# Precision of the random-forest predictions, using the "weighted" average
weighted_precision = metrics.precision_score(
    y_test, y_test_prediction, average="weighted"
)
print(weighted_precision)

In [None]:
# Recall of the random-forest predictions, using the "weighted" average
weighted_recall = metrics.recall_score(
    y_test, y_test_prediction, average="weighted"
)
print(weighted_recall)

In [None]:
# Display the predicted labels for the test split
y_test_prediction

In [None]:
# Display the scaled test features that the predictions were made from
X_test