# Model Training
## Guide

In [1]:
# URL of the Kaggle notebook recorded as the guide for this notebook.
guide_used = "https://www.kaggle.com/code/mbalvi75/02-investigating-the-dataset-s-using-pandas"

## Make necessary imports

In [3]:
import opendatasets as od
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import scipy.stats
import sklearn.decomposition
import plotly.express as px
from matplotlib import cm

## Importing data
Download the dataset from Kaggle:

In [4]:
# Fetch the Iris dataset from Kaggle into ./iris-flower-dataset.
# Author's note: the download originally raised an error, so the files were
# placed there manually. When the files are already on disk the call is
# skipped (pass force=True to download again).
od.download("https://www.kaggle.com/datasets/arshid/iris-flower-dataset")

Skipping, found downloaded files in ".\iris-flower-dataset" (use force=True to force download)


Read the CSV file into a DataFrame:

In [5]:
# Path to the CSV, relative to the notebook's working directory.
file = 'iris-flower-dataset/IRIS.csv'
# Load the whole file into a DataFrame.
iris = pd.read_csv(file)
# Preview the first two rows to check the columns were parsed as expected.
iris.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa


## Exploratory Data Analysis (EDA)

### Learning the structure

In [6]:
# Row labels of the frame.
iris.index

RangeIndex(start=0, stop=150, step=1)

In [8]:
# Column labels of the frame.
iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [10]:
# Dimensions of the frame as (rows, columns).
iris.shape

(150, 5)

In [12]:
# First five rows as a raw NumPy array (slice first, then convert).
iris.head(5).to_numpy()

array([[5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
       [4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
       [4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
       [4.6, 3.1, 1.5, 0.2, 'Iris-setosa'],
       [5.0, 3.6, 1.4, 0.2, 'Iris-setosa']], dtype=object)

In [14]:
# Non-null entries per column; a count equal to the row count means no missing values.
iris.count()

sepal_length    150
sepal_width     150
petal_length    150
petal_width     150
species         150
dtype: int64

In [15]:
# Confirm the loaded object is a pandas DataFrame.
type(iris)

pandas.core.frame.DataFrame

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


### Rename the columns

In [17]:
# Keep a reference to the current (original) column labels, then display them.
act_cols = iris.columns
act_cols

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [18]:
# Shorten the column names using an explicit old -> new mapping.
# A mapping (rather than positional `iris.columns = [...]`) cannot silently
# mislabel a column if the CSV's column order ever differs, and re-running
# this cell is a harmless no-op because already-renamed columns are ignored.
rename_map = {
    'sepal_length': 'sl',
    'sepal_width': 'sw',
    'petal_length': 'pl',
    'petal_width': 'pw',
    'species': 'flowers',
}
new_cols = list(rename_map.values())
iris = iris.rename(columns=rename_map)

# Fail loudly if the file did not contain exactly the expected columns.
assert set(iris.columns) == set(new_cols), f"unexpected columns: {list(iris.columns)}"
iris.head(1)

Unnamed: 0,sl,sw,pl,pw,flowers
0,5.1,3.5,1.4,0.2,Iris-setosa


### Digging more into the DataFrame

In [22]:
# Selecting a single column yields a Series; show the type for each measurement column.
print(*(type(iris[name]) for name in ("sl", "sw", "pl", "pw")), sep="\n")

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [27]:
# Order rows by species name (descending), then take positions 1-4 of the result.
(
    iris
    .sort_values(by='flowers', ascending=False)
    .iloc[1:5]
)

Unnamed: 0,sl,sw,pl,pw,flowers
111,6.4,2.7,5.3,1.9,Iris-virginica
122,7.7,2.8,6.7,2.0,Iris-virginica
121,5.6,2.8,4.9,2.0,Iris-virginica
120,6.9,3.2,5.7,2.3,Iris-virginica


In [28]:
# Reverse the row order by index label and show the first three rows of the result.
iris.sort_index(ascending=False).head(3)

Unnamed: 0,sl,sw,pl,pw,flowers
149,5.9,3.0,5.1,1.8,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica


### Accessing individual elements or part of the dataset

In [29]:
# Rows at integer positions 140-144 (end of the slice is exclusive).
iris.iloc[140:145]

Unnamed: 0,sl,sw,pl,pw,flowers
140,6.7,3.1,5.6,2.4,Iris-virginica
141,6.9,3.1,5.1,2.3,Iris-virginica
142,5.8,2.7,5.1,1.9,Iris-virginica
143,6.8,3.2,5.9,2.3,Iris-virginica
144,6.7,3.3,5.7,2.5,Iris-virginica


In [32]:
# .loc  -> label-based selection of rows/columns.
# .iloc -> integer-position-based selection of rows/columns.
# Here: pick two columns by name, then rows at positions 140-144.
iris[["sl", "pw"]].iloc[140:145]

Unnamed: 0,sl,pw
140,6.7,2.4
141,6.9,2.3
142,5.8,1.9
143,6.8,2.3
144,6.7,2.5


In [31]:
# Rows at positions 10-14 as a raw NumPy array (slice first, then convert).
iris.iloc[10:15].to_numpy()

array([[5.4, 3.7, 1.5, 0.2, 'Iris-setosa'],
       [4.8, 3.4, 1.6, 0.2, 'Iris-setosa'],
       [4.8, 3.0, 1.4, 0.1, 'Iris-setosa'],
       [4.3, 3.0, 1.1, 0.1, 'Iris-setosa'],
       [5.8, 4.0, 1.2, 0.2, 'Iris-setosa']], dtype=object)