In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

In [2]:
data = pd.read_csv('Iris.csv')
data.head(100)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
95,96,5.7,3.0,4.2,1.2,Iris-versicolor
96,97,5.7,2.9,4.2,1.3,Iris-versicolor
97,98,6.2,2.9,4.3,1.3,Iris-versicolor
98,99,5.1,2.5,3.0,1.1,Iris-versicolor


In [3]:
# Dropping the id column
data.drop(columns='Id', axis=1, inplace=True)


In [4]:
# Renaming columns for easy reading
data = data.rename(columns={"SepalLengthCm":"Sepal Lenght(CM)", "SepalWidthCm":"Sepal Width(CM)", "PetalLengthCm":"Petal Lenght(CM)", "PetalWidthCm":"Petal Width(CM)"})

In [5]:
# getting information about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Sepal Lenght(CM)  150 non-null    float64
 1   Sepal Width(CM)   150 non-null    float64
 2   Petal Lenght(CM)  150 non-null    float64
 3   Petal Width(CM)   150 non-null    float64
 4   Species           150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [6]:
# Checking for duplicate
data.loc[data.duplicated()]

Unnamed: 0,Sepal Lenght(CM),Sepal Width(CM),Petal Lenght(CM),Petal Width(CM),Species
34,4.9,3.1,1.5,0.1,Iris-setosa
37,4.9,3.1,1.5,0.1,Iris-setosa
142,5.8,2.7,5.1,1.9,Iris-virginica


In [7]:
# dropping duplicates
data = data.drop_duplicates()

#checking if duplicates still exist
data.loc[data.duplicated()]

Unnamed: 0,Sepal Lenght(CM),Sepal Width(CM),Petal Lenght(CM),Petal Width(CM),Species


In [8]:
data['Species'].value_counts()

Iris-versicolor    50
Iris-virginica     49
Iris-setosa        48
Name: Species, dtype: int64

In [9]:
# Changing the Species column to binaries
encoder = LabelEncoder()
data['Species'] = encoder.fit_transform(data['Species'])

In [10]:
data.head()

Unnamed: 0,Sepal Lenght(CM),Sepal Width(CM),Petal Lenght(CM),Petal Width(CM),Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [11]:
data['Species'].value_counts()

1    50
2    49
0    48
Name: Species, dtype: int64

In [12]:
# splitting the data into training and testing sets
X = data.drop('Species', axis =1)
y = data['Species']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Standardize the data
scaler = StandardScaler()
X_train_transf = scaler.fit_transform(X_train)
X_test_transf  = scaler.transform(X_test)

In [14]:
# saving the scaled data
joblib.dump(X_test_transf, 'scaler.pkl')

['scaler.pkl']

In [15]:
# Creating the model
clf = RandomForestClassifier()

# fitting the model
clf.fit(X_train_transf, y_train)

In [16]:
# Getting predictions
pred = clf.predict(X_test_transf)

# Get the accuracy score
print(accuracy_score(y_test, pred))

0.9333333333333333


In [17]:
# pickling the model 
joblib.dump(clf, 'scripts/model.pkl')

['scripts/model.pkl']

In [18]:
# Testing the model
scaler = StandardScaler()
data = scaler.fit_transform([[1,12,1,1]])
model = joblib.load('scripts/model.pkl')
model.predict(data)

array([1])

In [19]:
X[130:].to_csv('test.csv', header=None, index=False, encoding='UTF8')
