In [2]:
import random
import numpy as np
import pandas as pd


## Pandas Series
A pandas series is a one -dimensional-array-like object that can hold any data type (int ,floats, string et)

**Creating a Series**
you can create a series from e.g a list, an array and from dictionary



In [5]:
# create a series using a list:

number_list = [x for x in range(30,40)]
print(number_list)

# now i can make a series form this list:

my_first_series = pd.Series(number_list)
print(my_first_series)
isinstance(my_first_series, pd.Series)

[30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
0    30
1    31
2    32
3    33
4    34
5    35
6    36
7    37
8    38
9    39
dtype: int64


True

In [9]:
print(f"Seriens minsta v√§rde: {my_first_series.min()}")
print(f"Seriens st√∂rsta v√§rde: {my_first_series.max()}")
print(f"Seriens medelv√§rde: {my_first_series.mean()}")
print(f"Seriens standard avvikelse: {my_first_series.std():.3f}")

Seriens minsta v√§rde: 30
Seriens st√∂rsta v√§rde: 39
Seriens medelv√§rde: 34.5
Seriens standard avvikelse: 3.028


### Interaction with other Series object
** How does a Series interact with other objects of the same class under e.g., addition or multiplication

In [18]:
list_one  = [1,2,3]
list_two  = [5,6,7]
list_3 =list_one+list_two
print(list_3)
print(pd.Series(list_one))
print(pd.Series(list_two))
# Elementwise addition 
list_se = pd.Series(list_one)+pd.Series(list_two)
# Note : returns a new Series-type object
print(list_se)
list_mul = pd.Series(list_one)*pd.Series(list_two)
print(list_mul)
list_div = pd.Series(list_one)/pd.Series(list_two)
print(list_div.round(2))

[1, 2, 3, 5, 6, 7]
0    1
1    2
2    3
dtype: int64
0    5
1    6
2    7
dtype: int64
0     6
1     8
2    10
dtype: int64
0     5
1    12
2    21
dtype: int64
0    0.20
1    0.33
2    0.43
dtype: float64


In [23]:
my_first_series[1]+2

np.int64(33)

We can also extract multiple elements simultaneously:

In [26]:
my_first_series[:4]
my_first_series[:4].max()


np.int64(33)

## Pandas DataFrame
a dataframe is a 2d labeled data structure in pandas , similar to a table or a spreadsheet
Each column might hold different types of data (integers, floats, strings etc)


In [31]:
names = ['Amir', 'Swash', 'Rozann', 'Ali']
ages = [2.5, 5, 31, 35]
eye_colors = ['blue' , 'orange' , 'brown' , 'green']

# We can with ease create Dataframes using dictionaries

family_dict = { 'name' : names, 'age' : ages , 'eye_color' : eye_colors }
family_df = pd.DataFrame(family_dict)
family_df

Unnamed: 0,name,age,eye_color
0,Amir,2.5,blue
1,Swash,5.0,orange
2,Rozann,31.0,brown
3,Ali,35.0,green


We can select specific columns from our Dataframe using the column names:

In [35]:
# note that the returned column is given as a series object:
family_df['name']

family_df[['name', 'age']]            
family_df[['name', 'age', 'name']]            

Unnamed: 0,name,age,name.1
0,Amir,2.5,Amir
1,Swash,5.0,Swash
2,Rozann,31.0,Rozann
3,Ali,35.0,Ali


## Further Indexing
In pandas, selecting  specific rows and columns is essential for analyzing data. Pandas offers two primary methods to do this :\
.iloc() # index location\
.loc() # location


In [41]:
data = {
    'Name' : ['Alice' , 'Bob' , 'Charlie'  , 'David' , 'Eva' , 'Frank' , 'Grace' , 'Helen' , 'Ian'],
    'Age' : [25,30,35,40,22,29,28,22,31],
    'Salary' : [50000,60000,70000,800000,52000,62000,75000,55000,100000]

}
data_df = pd.DataFrame(data)
data_df
data_df['Salary'].mean().round(3)

np.float64(147111.111)

In [42]:
# we can use slicing to select multiple rows
data_df[1:5] # rows 1 to 4 and 5 not included

Unnamed: 0,Name,Age,Salary
1,Bob,30,60000
2,Charlie,35,70000
3,David,40,800000
4,Eva,22,52000


In [47]:
data_df.iloc[3,2] # select the value in the row with index 3, and column with index 2(salary)
data_df

Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
1,Bob,30,60000
2,Charlie,35,70000
3,David,40,800000
4,Eva,22,52000
5,Frank,29,62000
6,Grace,28,75000
7,Helen,22,55000
8,Ian,31,100000


In [48]:
data_df.iloc[1:4, :2]

Unnamed: 0,Name,Age
1,Bob,30
2,Charlie,35
3,David,40


## Masking
masking is powerful feature in pandas that allows you to filter data based on certain conditions.
Masking is often used to filter data, perform calculations or create subsets of data for further analysis

In [49]:
family_df

Unnamed: 0,name,age,eye_color
0,Amir,2.5,blue
1,Swash,5.0,orange
2,Rozann,31.0,brown
3,Ali,35.0,green


In [50]:
# the above df has 4 rows., so let us create a list of booleans of the same size
my_mask = [True,False, True, False]
family_df[my_mask]

Unnamed: 0,name,age,eye_color
0,Amir,2.5,blue
2,Rozann,31.0,brown


In [57]:
#we have extreme flexibility here, and can use any condition we want to create our 
my_mask = [color =='brown' for color in family_df['eye_color']]
family_df[my_mask]
# another way i s:
my_color_mask = family_df['eye_color'] == 'brown'
family_df[my_color_mask]
#or even like this way 
family_df[family_df['eye_color'] == 'brown']

Unnamed: 0,name,age,eye_color
2,Rozann,31.0,brown


In [64]:
pd.Series(family_df['age'])


family_df['age'] < 25
age_mask = family_df['age'] < 25

kids_df = family_df[age_mask]
print (kids_df)

    name  age eye_color
0   Amir  2.5      blue
1  Swash  5.0    orange


### we can negate a mask by using ~ (tilda) symbol. Negating means that we get the opposite

In [73]:
~age_mask  # we get the opposite
                    #only works if your mask is a series
adults_df = family_df[~age_mask]
adults_df

Unnamed: 0,name,age,eye_color
2,Rozann,31.0,brown
3,Ali,35.0,green


In [75]:
print (age_mask)
print(my_color_mask)

0     True
1     True
2    False
3    False
Name: age, dtype: bool
0    False
1    False
2     True
3    False
Name: eye_color, dtype: bool


In [68]:
family_df[age_mask & my_color_mask]

Unnamed: 0,name,age,eye_color


In [72]:
short_names_mask = [len(name) < 4 for name in data_df['Name']]
short_names_mask
data_df[short_names_mask]

Unnamed: 0,Name,Age,Salary
1,Bob,30,60000
4,Eva,22,52000
8,Ian,31,100000


## Read excel 

In [102]:


#calories_df =pd.read_excel('Project-Management-Sample-Data.xlsx',)
calories_df = pd.read_excel('calories.xlsx')
calories_df.head(10) # shows the first 5 rows by default

Unnamed: 0,FoodCategory,FoodItem,per100grams,Cals_per100grams,KJ_per100grams
0,CannedFruit,Applesauce,100g,62 cal,260 kJ
1,CannedFruit,Canned Apricots,100g,48 cal,202 kJ
2,CannedFruit,Canned Blackberries,100g,92 cal,386 kJ
3,CannedFruit,Canned Blueberries,100g,88 cal,370 kJ
4,CannedFruit,Canned Cherries,100g,54 cal,227 kJ
5,CannedFruit,Canned Cranberries,100g,178 cal,748 kJ
6,CannedFruit,Canned Crushed Pineapple,100g,53 cal,223 kJ
7,CannedFruit,Canned Figs,100g,107 cal,449 kJ
8,CannedFruit,Canned Fruit Cocktail,100g,81 cal,340 kJ
9,CannedFruit,Canned Fruit Salad,100g,50 cal,210 kJ


In [112]:
import pandas as pd

# Option A: Filter columns at read-time (fast if file is large)
df = pd.read_excel('calories.xlsx')
df_cols = df[['FoodCategory','FoodItem']]
df_cols




Unnamed: 0,FoodCategory,FoodItem
0,CannedFruit,Applesauce
1,CannedFruit,Canned Apricots
2,CannedFruit,Canned Blackberries
3,CannedFruit,Canned Blueberries
4,CannedFruit,Canned Cherries
...,...,...
2220,Spreads,Sunflower Butter
2221,Spreads,Tapenade
2222,Spreads,Unsalted Butter
2223,Spreads,Vegemite


## Skip Rows
Skip rows at the yop(useful if file has headers or notes)

In [116]:
df_skip = pd.read_excel('calories.xlsx', skiprows = 1)
df_skip

Unnamed: 0,CannedFruit,Applesauce,100g,62 cal,260 kJ
0,CannedFruit,Canned Apricots,100g,48 cal,202 kJ
1,CannedFruit,Canned Blackberries,100g,92 cal,386 kJ
2,CannedFruit,Canned Blueberries,100g,88 cal,370 kJ
3,CannedFruit,Canned Cherries,100g,54 cal,227 kJ
4,CannedFruit,Canned Cranberries,100g,178 cal,748 kJ
...,...,...,...,...,...
2219,Spreads,Sunflower Butter,100g,617 cal,2591 kJ
2220,Spreads,Tapenade,100g,233 cal,979 kJ
2221,Spreads,Unsalted Butter,100g,717 cal,3011 kJ
2222,Spreads,Vegemite,100g,180 cal,756 kJ


## üß© d) nrows

Read only a certain number of rows (helpful for testing large files).

In [123]:
df_nrows = pd.read_excel('calories.xlsx', skiprows=1, nrows = 10, header = 1, index_col ='CannedFruit').dropna()
df_nrows

Unnamed: 0_level_0,Canned Apricots,100g,48 cal,202 kJ
CannedFruit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CannedFruit,Canned Blackberries,100g,92 cal,386 kJ
CannedFruit,Canned Blueberries,100g,88 cal,370 kJ
CannedFruit,Canned Cherries,100g,54 cal,227 kJ
CannedFruit,Canned Cranberries,100g,178 cal,748 kJ
CannedFruit,Canned Crushed Pineapple,100g,53 cal,223 kJ
CannedFruit,Canned Figs,100g,107 cal,449 kJ
CannedFruit,Canned Fruit Cocktail,100g,81 cal,340 kJ
CannedFruit,Canned Fruit Salad,100g,50 cal,210 kJ
CannedFruit,Canned Gooseberries,100g,73 cal,307 kJ
CannedFruit,Canned Grapefruit,100g,37 cal,155 kJ


In [125]:
names = ['Ruul' , 'Rahul' , 'Rohaan' , 'Ali']
ages = [2.5, 5 , 31 , 35]
eyes_colors = ['blue' , 'brown' , 'green' , 'brown']

new_family_dict = {'name': names , 'age': ages , 'eye_color': eye_colors}
new_family_df = pd.DataFrame(new_family_dict)
new_family_df

Unnamed: 0,name,age,eye_color
0,Ruul,2.5,blue
1,Rahul,5.0,orange
2,Rohaan,31.0,brown
3,Ali,35.0,green


In [None]:
eye_mask = new_family_df['eye_color'] == 'brown'
new_family_df[eye_mask]




Unnamed: 0,name,age,eye_color
2,Rohaan,31.0,brown


In [None]:
new_family_df['age'] >=5 # this is a mask
age_mask= new_family_df['age'] >= 5
kids_mask = new_family_df[~age_mask]
new_family_df[age_mask]

Unnamed: 0,name,age,eye_color
1,Rahul,5.0,orange
2,Rohaan,31.0,brown
3,Ali,35.0,green
