# Lecture - pandas basics

- pandas.Series
- pandas.Dataframe
- read_csv
- indexing
- plotting

## Pandas Series 

- can create from dicitionary
- can create from list
- can create from np.array

In [48]:
import pandas as pd

programs_dict = dict(AI = 26, NET = 38, Java = 30, UX = 28)

program_series = pd.Series(programs_dict)

program_series


AI      26
NET     38
Java    30
UX      28
dtype: int64

In [49]:
# values through indexing
print(f"{program_series[0] = }")
print(f"{program_series[-1] = }")
print(f"{program_series['UX'] = }")

# get keys 
print(f"{program_series.keys() =}")
print(f"{program_series.keys()[0] =}")


program_series[0] = 26
program_series[-1] = 28
program_series['UX'] = 28
program_series.keys() =Index(['AI', 'NET', 'Java', 'UX'], dtype='object')
program_series.keys()[0] ='AI'


In [50]:
import random as rnd

rnd.seed(1337)

dice_series = pd.Series([rnd.randint(1,6) for _ in range(10)])

dice_series.head() # inspects the fice first rows

0    5
1    5
2    6
3    3
4    5
dtype: int64

In [51]:
print(dice_series.argmin())
print(dice_series.max())
print(dice_series.mean())
print(dice_series.median())


7
6
4.4
5.0


---
## DataFrame

- tabluar data with rows and columns
- analog to 2d numpy arrays flexible row indices and col names
- "specialized" dictionary with col name mapped to a series object

In [52]:
# instantiated a data frame froma series object 
df_programs = pd.DataFrame(program_series, columns=('number_of_students',))
df_programs

Unnamed: 0,number_of_students
AI,26
NET,38
Java,30
UX,28


In [64]:
# create 2 serises objects 
students = pd.Series({'AI':26,'net':38, 'ux':28, 'java': 30})
skill = pd.Series({'AI':"Python",'net':"C#", 'ux':"Figma",'java':"Java"})

df_programs = pd.DataFrame({"students": students, 'Skills': skill})
df_programs
# create a datafarme from 2 series objects 

Unnamed: 0,students,Skills
AI,26,Python
net,38,C#
ux,28,Figma
java,30,Java


In [56]:
df_programs["students"]

AI      26
net     38
ux      28
java    30
Name: students, dtype: int64

In [57]:
df_programs["students"].mean()

30.5

In [65]:
df_programs["Skills"][0],

('Python',)

## indexers

-loc - sliceing and indexing using explicit index
 
-iloc - sliceing and indexing using pyhton-style index

In [67]:
df_programs.loc["java"]

students      30
Skills      Java
Name: java, dtype: object

In [69]:
df_programs.iloc[1:2]

Unnamed: 0,students,Skills
net,38,C#


## Masking

In [73]:
df_programs['students'] >= 30

AI      False
net      True
ux      False
java     True
Name: students, dtype: bool

In [74]:
df_programs[df_programs['students']>= 30]

Unnamed: 0,students,Skills
net,38,C#
java,30,Java


In [75]:
df_programs

Unnamed: 0,students,Skills
AI,26,Python
net,38,C#
ux,28,Figma
java,30,Java


---
## Excel data

In [77]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as students

df = pd.read_excel("../Data/calories.xlsx")
df.head()

Unnamed: 0,FoodCategory,FoodItem,per100grams,Cals_per100grams,KJ_per100grams
0,CannedFruit,Applesauce,100g,62 cal,260 kJ
1,CannedFruit,Canned Apricots,100g,48 cal,202 kJ
2,CannedFruit,Canned Blackberries,100g,92 cal,386 kJ
3,CannedFruit,Canned Blueberries,100g,88 cal,370 kJ
4,CannedFruit,Canned Cherries,100g,54 cal,227 kJ


In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FoodCategory      2225 non-null   object
 1   FoodItem          2225 non-null   object
 2   per100grams       2225 non-null   object
 3   Cals_per100grams  2225 non-null   object
 4   KJ_per100grams    2225 non-null   object
dtypes: object(5)
memory usage: 87.0+ KB


In [80]:
#liquid food and solid food
df["FoodCategory"].unique()

array(['CannedFruit', 'Fruits', 'Tropical&ExoticFruits', 'PotatoProducts',
       'Vegetables', 'FastFood', 'Pizza', 'Cheese', 'CreamCheese',
       'Milk&DairyProducts', 'SlicedCheese', 'Yogurt', 'Beef&Veal',
       'ColdCuts&LunchMeat', 'Meat', 'Offal&Giblets', 'Pork',
       'Poultry&Fowl', 'Sausage', 'Venison&Game', 'Cakes&Pies',
       'Candy&Sweets', 'IceCream', '(Fruit)Juices',
       'AlcoholicDrinks&Beverages', 'Beer',
       'Non-AlcoholicDrinks&Beverages', 'Soda&SoftDrinks', 'Wine',
       'CerealProducts', 'Oatmeal,Muesli&Cereals', 'Pasta&Noodles',
       'Dishes&Meals', 'Soups', 'Legumes', 'Nuts&Seeds', 'Oils&Fats',
       'VegetableOils', 'BakingIngredients', 'Fish&Seafood',
       'Herbs&Spices', 'Pastries,Breads&Rolls', 'Sauces&Dressings',
       'Spreads'], dtype=object)

In [81]:
df["per100grams"].unique()

array(['100g', '100ml'], dtype=object)

In [82]:
df.head()

Unnamed: 0,FoodCategory,FoodItem,per100grams,Cals_per100grams,KJ_per100grams
0,CannedFruit,Applesauce,100g,62 cal,260 kJ
1,CannedFruit,Canned Apricots,100g,48 cal,202 kJ
2,CannedFruit,Canned Blackberries,100g,92 cal,386 kJ
3,CannedFruit,Canned Blueberries,100g,88 cal,370 kJ
4,CannedFruit,Canned Cherries,100g,54 cal,227 kJ


## Data cleaning and explorations

- tpye convert string objects with numerical values to int
- change colomn names 
- seperate into liqudis and solids

In [83]:
# renaming columns
df = df.rename(
    dict(Cals_per100grams="Calories", KJ_per100grams="kJ", per100grams="per100"),
    axis="columns",
)

df.head()

Unnamed: 0,FoodCategory,FoodItem,per100,Calories,kJ
0,CannedFruit,Applesauce,100g,62 cal,260 kJ
1,CannedFruit,Canned Apricots,100g,48 cal,202 kJ
2,CannedFruit,Canned Blackberries,100g,92 cal,386 kJ
3,CannedFruit,Canned Blueberries,100g,88 cal,370 kJ
4,CannedFruit,Canned Cherries,100g,54 cal,227 kJ


In [88]:
# converting calories to int
df["Calories"].str[:-3].astype(int)

0        62
1        48
2        92
3        88
4        54
       ... 
2220    617
2221    233
2222    717
2223    180
2224    286
Name: Calories, Length: 2225, dtype: int32

In [89]:
df["per100"].value_counts()

100g     1802
100ml     423
Name: per100, dtype: int64

In [91]:
df["per100"] == "100g".value_counts()

AttributeError: 'str' object has no attribute 'value_counts'

In [96]:
# SEPERATE LIQUIDS
# filter ot all liquids
df_solids = [df["per100"] == "100g"]
df_liquids = df[df["per100"] == "100 ml"]
#filter out all solids 



Unnamed: 0,FoodCategory,FoodItem,per100,Calories,kJ


In [97]:
df_solids["per100"].values_count()

TypeError: list indices must be integers or slices, not str