In [2]:
import pandas as pd


## Loading Data

In [3]:
# read data as pandas dataframe

df = pd.read_csv('./data/flavors.csv')

In [4]:
df

Unnamed: 0,celery,vegetable
0,corn,vegetable
1,cucumber,vegetable
2,horseradish,vegetable
3,vegetable,vegetable
4,potato,vegetable
...,...,...
850,wild,
851,wine-lee,
852,winey,
853,yeasty,


In [5]:
# confirm type of df

type(df)

pandas.core.frame.DataFrame

In [6]:
# first five rows of df

df.head()

Unnamed: 0,celery,vegetable
0,corn,vegetable
1,cucumber,vegetable
2,horseradish,vegetable
3,vegetable,vegetable
4,potato,vegetable


In [7]:
# first ten rows of df

df.head(10)

Unnamed: 0,celery,vegetable
0,corn,vegetable
1,cucumber,vegetable
2,horseradish,vegetable
3,vegetable,vegetable
4,potato,vegetable
5,tomato,vegetable
6,caraway,herbaceous
7,clove,herbaceous
8,fennel,herbaceous
9,herbaceous,herbaceous


In [8]:
# last five rows of df

df.tail()

Unnamed: 0,celery,vegetable
850,wild,
851,wine-lee,
852,winey,
853,yeasty,
854,ylang,


In [10]:
# show dataframe info (notice there is not a column header -- maybe we should add some)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 855 entries, 0 to 854
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   celery     855 non-null    object
 1   vegetable  112 non-null    object
dtypes: object(2)
memory usage: 13.5+ KB


## Selecting Rows and Columns

In [13]:
# check columns

df.columns

Index(['celery', 'vegetable'], dtype='object')

In [17]:
df.columns = ["food", "class"]
df.columns

Index(['food', 'class'], dtype='object')

In [18]:
df.head()

Unnamed: 0,food,class
0,corn,vegetable
1,cucumber,vegetable
2,horseradish,vegetable
3,vegetable,vegetable
4,potato,vegetable


In [20]:
# to return data in specific column

df["food"]

0             corn
1         cucumber
2      horseradish
3        vegetable
4           potato
          ...     
850           wild
851       wine-lee
852          winey
853         yeasty
854          ylang
Name: food, Length: 855, dtype: object

In [21]:
# return multiple specific columns

df[["food","class"]]

Unnamed: 0,food,class
0,corn,vegetable
1,cucumber,vegetable
2,horseradish,vegetable
3,vegetable,vegetable
4,potato,vegetable
...,...,...
850,wild,
851,wine-lee,
852,winey,
853,yeasty,


In [25]:
# check data type of column (notice they are a series)

type(df["class"])

pandas.core.series.Series

In [27]:
# check data type of multiple columns

type(df[["food","class"]])

pandas.core.frame.DataFrame

One dimension of pandas is a series while two dimensions are a df

In [34]:
# select specific row of df

df.iloc[22]

food     gardenia
class      floral
Name: 22, dtype: object

In [48]:
# return all data where food is vegetable

df[df["class"] == "vegetable"]

Unnamed: 0,food,class
0,corn,vegetable
1,cucumber,vegetable
2,horseradish,vegetable
3,vegetable,vegetable
4,potato,vegetable
5,tomato,vegetable
131,cabbage,vegetable


In [50]:
import numpy as np
A = np.arange(10)
A

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [51]:
A[A % 2 == 0]

array([0, 2, 4, 6, 8])

In [52]:
type(A % 2 == 0)

numpy.ndarray

In [53]:
# convert df to np array 

df.values

array([['corn', 'vegetable'],
       ['cucumber', 'vegetable'],
       ['horseradish', 'vegetable'],
       ...,
       ['winey', nan],
       ['yeasty', nan],
       ['ylang', nan]], dtype=object)

In [58]:
# to return array of numbers (helpful in ml) --  does not work with our dataset because it has no numbers

A = df[["food","class"]].values
A

array([['corn', 'vegetable'],
       ['cucumber', 'vegetable'],
       ['horseradish', 'vegetable'],
       ...,
       ['winey', nan],
       ['yeasty', nan],
       ['ylang', nan]], dtype=object)

In [59]:
type(A)

numpy.ndarray

In [61]:
# create new csv from existing data

smalldf = df[["food","class"]]
smalldf.to_csv("output.csv")

In [69]:
!head output.csv

,food,class
0,corn,vegetable
1,cucumber,vegetable
2,horseradish,vegetable
3,vegetable,vegetable
4,potato,vegetable
5,tomato,vegetable
6,caraway,herbaceous
7,clove,herbaceous
8,fennel,herbaceous


In [70]:
# remove useless index on side

smalldf.to_csv("output.csv", index=False)

In [71]:
!head output.csv

food,class
corn,vegetable
cucumber,vegetable
horseradish,vegetable
vegetable,vegetable
potato,vegetable
tomato,vegetable
caraway,herbaceous
clove,herbaceous
fennel,herbaceous
