## Pandas series and dataframe

In [1]:
import pandas as pd

# Student count per program, keyed by program name.
students_data = {"business": 25, "AI": 30, "JS": 30, "JAVA": 27}
students_data

{'business': 25, 'AI': 30, 'JS': 30, 'JAVA': 27}

In [2]:
# Build a Series from the dict: the keys become the index labels, the values the data.
series_program = pd.Series(students_data)
series_program

business    25
AI          30
JS          30
JAVA        27
dtype: int64

In [3]:
# print() shows the plain-text representation of the Series.
print(series_program)

business    25
AI          30
JS          30
JAVA        27
dtype: int64


In [5]:
# .iloc indexes by integer position: 0 is the first element, -1 the last.
series_program.iloc[0], series_program.iloc[-1]

(np.int64(25), np.int64(27))

In [6]:
# keys() returns the index labels of the Series.
series_program.keys()

Index(['business', 'AI', 'JS', 'JAVA'], dtype='object')

In [7]:
# Look up a single value by its index label; the result is a NumPy scalar.
series_program["AI"]

np.int64(30)

In [8]:
# Printing the looked-up scalar shows just the number.
print(series_program["AI"])

30


In [12]:
# The looked-up value supports ordinary arithmetic.
series_program["AI"] + 50

np.int64(80)

## Another series
- using a list instead of a dictionary

In [18]:
import random as rnd

# Seed the generator so the same rolls come out on every run.
rnd.seed(42)

# Simulate a handful of rolls of a standard die.
NUM_ROLLS = 5
DIE_SIDES = 6
dice_list = [rnd.randint(1, DIE_SIDES) for _ in range(NUM_ROLLS)]
dice_list

[6, 1, 1, 6, 3]

In [19]:
# A Series built from a list gets a default integer index starting at 0.
dice_series = pd.Series(dice_list)
dice_series

0    6
1    1
2    1
3    6
4    3
dtype: int64

In [22]:
# Do calculations with our Series: smallest value, largest value and mean.
dice_series.min(), dice_series.max(), dice_series.mean()

(np.int64(1), np.int64(6), np.float64(3.4))

## Dataframe
- analog of a 2D NumPy array with flexible row indices and column names

In [23]:
# Show the program Series again before turning it into a DataFrame.
series_program

business    25
AI          30
JS          30
JAVA        27
dtype: int64

In [27]:
# Wrap the Series in a one-column DataFrame; the Series index becomes the row index.
df_programs = pd.DataFrame(series_program, columns=["Num students"])
df_programs

Unnamed: 0,Num students
business,25
AI,30
JS,30
JAVA,27


In [28]:
# Create two Series from dictionaries; both use the same program labels as index.
# NOTE(review): "pyhton" looks like a typo for "python"; it is kept unchanged here
# so the values match the outputs recorded in the following cells.
students = pd.Series({"AI": 25, "NET": 30, "APP": 30, "Java": 27})
language = pd.Series({"AI": "pyhton", "NET": "C#", "APP": "Kotlin", "Java": "Java"})
students

AI      25
NET     30
APP     30
Java    27
dtype: int64

In [29]:
# The string values are stored with dtype object.
language

AI      pyhton
NET         C#
APP     Kotlin
Java      Java
dtype: object

In [31]:
# Combine the two Series into one DataFrame: each dict key becomes a column name
# and the shared index labels become the row labels. This rebinds df_programs,
# replacing the one-column frame created earlier.
# NOTE(review): the column name "Langueage" is misspelled; later cells refer to
# the column by this exact name, so renaming it means updating those cells too.
df_programs = pd.DataFrame({"Students": students, "Langueage": language})
df_programs

Unnamed: 0,Students,Langueage
AI,25,pyhton
NET,30,C#
APP,30,Kotlin
Java,27,Java


In [34]:
import numpy as np

# Build a similar table directly: column data given as an array and a list,
# row labels supplied through `index`.
pd.DataFrame(
    data={
        "Students": np.array([25, 30, 30, 27]),
        "Language": ["python", "C#", "Kotlin", "Java"],
    },
    index=["AI", ".NET", "APP", "Java"],
)

Unnamed: 0,Students,Language
AI,25,python
.NET,30,C#
APP,30,Kotlin
Java,27,Java


In [35]:
# The row labels of the DataFrame.
df_programs.index

Index(['AI', 'NET', 'APP', 'Java'], dtype='object')

## Data selection

In [36]:
# A DataFrame is made up of one or more Series; selecting a single column gives a Series back.
df_programs["Students"]

AI      25
NET     30
APP     30
Java    27
Name: Students, dtype: int64

In [37]:
# A list of column names selects several columns, in the listed order, as a DataFrame.
df_programs[["Langueage", "Students"]]

Unnamed: 0,Langueage,Students
AI,pyhton,25
NET,C#,30
APP,Kotlin,30
Java,Java,27


In [38]:
# Attribute access is an alternative to df_programs["Langueage"] for selecting a column.
df_programs.Langueage

AI      pyhton
NET         C#
APP     Kotlin
Java      Java
Name: Langueage, dtype: object

In [40]:
# To pick a single value out of a Series: select the column first, then index it by row label.
df_programs["Langueage"]["NET"]

'C#'

## Indexers

In [41]:
# .loc selects by index label; here the row labeled "Java" is returned as a Series.
df_programs.loc["Java"]

Students       27
Langueage    Java
Name: Java, dtype: object

In [42]:
# Label lookup for the row labeled "AI".
df_programs.loc["AI"]

Students         25
Langueage    pyhton
Name: AI, dtype: object

In [43]:
# A list of labels returns the matching rows as a DataFrame, in the order listed.
df_programs.loc[["Java", "APP"]]

Unnamed: 0,Students,Langueage
Java,27,Java
APP,30,Kotlin


In [45]:
# Without .loc this raises a KeyError, because plain [] with a list looks the
# names up among the columns instead of the row labels.
try:
    df_programs[["Java", "APP"]]
except KeyError as missing_columns:
    print(missing_columns)

"None of [Index(['Java', 'APP'], dtype='object')] are in the [columns]"


In [46]:
# A label slice with .loc includes both endpoints ("AI" through "APP").
df_programs.loc["AI": "APP"]

Unnamed: 0,Students,Langueage
AI,25,pyhton
NET,30,C#
APP,30,Kotlin


In [48]:
# .iloc selects by integer position; the end of the slice is excluded, so this
# returns the rows at positions 1 and 2. Prefer .loc in most cases.
df_programs.iloc[1:3]

Unnamed: 0,Students,Langueage
NET,30,C#
APP,30,Kotlin


## Masking

In [50]:
# Comparing a column with a value yields a boolean Series (a mask), one entry per row.
df_programs["Students"] > 25

AI      False
NET      True
APP      True
Java     True
Name: Students, dtype: bool

In [51]:
# Use a boolean mask to keep only the rows that satisfy a condition.
more_than_25 = df_programs["Students"] > 25
df_programs[more_than_25]

Unnamed: 0,Students,Langueage
NET,30,C#
APP,30,Kotlin
Java,27,Java


In [52]:
# The same filter can also be written with query().
df_programs.query("Students > 25")

Unnamed: 0,Students,Langueage
NET,30,C#
APP,30,Kotlin
Java,27,Java
