In [None]:
!pip install pandas

# Intro to Data Science

## Series Data

In [2]:
import pandas as pd

# A list of strings becomes a Series with the generic `object` dtype.
students = ["Alice", "Jack", "Molly"]
pd.Series(data=students)

Collecting pandas
  Downloading pandas-1.3.0-cp39-cp39-win_amd64.whl (10.2 MB)
Collecting pytz>=2017.3
  Downloading pytz-2021.1-py2.py3-none-any.whl (510 kB)
Installing collected packages: pytz, pandas
Successfully installed pandas-1.3.0 pytz-2021.1


0    Alice
1     Jack
2    Molly
dtype: object

In [3]:
# A list of whole numbers becomes an int64 Series.
numbers = [1, 2, 3]
pd.Series(data=numbers)

0    1
1    2
2    3
dtype: int64

In [4]:
# A missing string is kept as None and the Series stays `object` dtype.
students = ["Alice", "Jack", None]
pd.Series(data=students)

0    Alice
1     Jack
2     None
dtype: object

In [5]:
# With numeric data the missing entry (label 2) is stored as NaN rather than
# None, which turns the whole Series into float64.
numbers = [1, 2, None]
pd.Series(data=numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [6]:
import numpy as np

# NaN is a floating-point value, not Python's None: the comparison is False.
np.nan == None

False

In [7]:
# NaN does not compare equal to anything, not even to itself.
np.nan == np.nan

False

In [8]:
# Because == cannot detect NaN, test for it with np.isnan instead.
np.isnan(np.nan)

True

In [12]:
# Building a Series from a dict: the keys become the index labels
# (instead of the default 0, 1, 2) and the values become the data.
gpas = {"Alice": 4.0, "Jack": 3.7, "Molly": 2.5}
gpaSeries = pd.Series(data=gpas)
gpaSeries

Alice    4.0
Jack     3.7
Molly    2.5
dtype: float64

In [13]:
# The index holds the labels of the Series: here, the keys of the source dict.
gpaSeries.index

Index(['Alice', 'Jack', 'Molly'], dtype='object')

In [15]:
# Each tuple is kept whole as a single element, so the dtype is `object`.
students = [
    ("Alice", "Brown"),
    ("Jack", "White"),
    ("Molly", "Green"),
]
pd.Series(data=students)

0    (Alice, Brown)
1     (Jack, White)
2    (Molly, Green)
dtype: object

In [16]:
# Data and labels can also be supplied separately: the first list is the
# values, and index= gives the label for each value, in the same order.
pd.Series(['Physics', 'Chemistry', 'English'], index=['Alice', 'Jack', 'Molly'])

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [17]:
# An explicit index selects and orders the dict entries: keys that are not
# listed (Jack) are left out, and labels with no matching key (Sam) get NaN.
student_scores = {"Alice": "Physics", "Jack": "Chemistry", "Molly": "English"}
s = pd.Series(data=student_scores, index=["Alice", "Molly", "Sam"])
s

Alice    Physics
Molly    English
Sam          NaN
dtype: object

In [18]:
# Four students, each mapped to the class they take; the dict keys become
# the Series labels in insertion order.
student_classes = dict(
    Alice="Physics",
    Jack="Chemistry",
    Molly="English",
    Sam="History",
)
s = pd.Series(data=student_classes)
s

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [24]:
# .iloc looks up by integer position: position 3 is the fourth entry (Sam),
# so this returns 'History'.
s.iloc[3]

'History'

In [22]:
# .loc looks up by label: returns the value stored for 'Molly' ('English').
s.loc['Molly']

'English'

In [25]:
# Plain [] with an integer on a string-labelled Series is treated as a
# position here (an implicit .iloc), so this also returns 'History'.
# NOTE(review): recent pandas releases deprecate this positional fallback —
# confirm against the installed version; s.iloc[3] is the explicit form.
s[3]

'History'

In [26]:
# Plain [] with a string is treated as a label (an implicit .loc).
s['Molly']

'English'

In [28]:
# Integer keys become integer labels, so s[100] is a label lookup
# (an implicit .loc), not a positional one.
codes = {99: "Physics", 100: "Chemistry", 101: "English", 102: "History"}
s = pd.Series(data=codes)
s[100]


'Chemistry'

In [None]:
# Generate 10,000 random integer grades in [0, 1000): the upper bound is exclusive.
# A locally seeded RandomState makes the values identical on every run without
# touching NumPy's global random state.
grades = pd.Series(np.random.RandomState(42).randint(0, 1000, 10000))
grades.head()  # first 5 values

In [38]:
%%timeit -n 100
# Timed baseline: add up every element with a Python-level loop over the
# Series, then divide by the element count to get the mean.
total = 0
for grade in grades:
    total += grade

total / len(grades)

2.65 µs ± 811 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [39]:
%%timeit -n 100
# Timed alternative: let NumPy add up the whole Series in a single call,
# then divide by the element count to get the mean.
total = np.sum(grades)
total / len(grades)

47.5 µs ± 14.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [40]:
# Broadcasting - uses vectorization: the scalar is added to every element in
# a single operation, with no explicit loop.
# `numbers` is derived from `grades` rather than updated with `numbers += 2000`:
# the previous assignment to `numbers` in this notebook is a plain Python list,
# so the in-place form fails on a fresh kernel (Restart & Run All). Building a
# new Series also leaves `grades` unchanged and makes the cell safe to re-run.
numbers = grades + 2000
numbers.head()

0    2674
1    2788
2    2123
3    2485
4    2945
dtype: int32

In [44]:
# Element-by-element update, shown for contrast with broadcasting
# (`numbers += 3000`), which is the idiomatic and much faster way to do this.
# Series.items() is used instead of Series.iteritems(): it yields the same
# (label, value) pairs, and iteritems() no longer exists in pandas 2.x.
for label, value in numbers.items():
    # .at sets a single value by label (the old API was
    # numbers.set_value(label, value + 3000)); .iat is the positional equivalent.
    numbers.at[label] = value + 3000

numbers

0       5674
1       5788
2       5123
3       5485
4       5945
        ... 
9995    5691
9996    5288
9997    5353
9998    5489
9999    5632
Length: 10000, dtype: int32

In [46]:
# Assigning through .loc with a label that does not exist yet appends a new
# entry; the index then holds both integer and string labels.
s = pd.Series(data=[1, 2, 3])
s.loc["History"] = 102
s

0            1
1            2
2            3
History    102
dtype: int64