In [None]:
!pip install pandas

# Intro to Data Science

## Series Data

In [2]:
import pandas as pd

# A plain list of strings becomes a Series with the default 0..n-1 index.
students = ["Alice", "Jack", "Molly"]

# Strings are stored with dtype: object
pd.Series(students)

Collecting pandas
  Downloading pandas-1.3.0-cp39-cp39-win_amd64.whl (10.2 MB)
Collecting pytz>=2017.3
  Downloading pytz-2021.1-py2.py3-none-any.whl (510 kB)
Installing collected packages: pytz, pandas
Successfully installed pandas-1.3.0 pytz-2021.1


0    Alice
1     Jack
2    Molly
dtype: object

In [3]:
# Whole numbers are stored with an integer dtype (int64 in the output below).
numbers = [1, 2, 3]
pd.Series(numbers)

0    1
1    2
2    3
dtype: int64

In [4]:
# A missing entry among strings: the recorded output keeps it as None and the
# Series stays dtype: object.
students = ["Alice", "Jack", None]

pd.Series(students)

0    Alice
1     Jack
2     None
dtype: object

In [5]:
# A missing entry among numbers: the None at position 2 is converted to NaN
# (not kept as None), which turns the whole Series into float64.
numbers = [1, 2, None]

pd.Series(numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [6]:
import numpy as np

# NaN is not equal to None: this comparison evaluates to False.
np.nan == None

False

In [7]:
# NaN does not even compare equal to itself, so == cannot be used to detect it.
np.nan == np.nan

False

In [8]:
# np.isnan() is the reliable way to test whether a value is NaN.
np.isnan(np.nan)

True

In [12]:
gpas = {
    'Alice': 4.0,
    'Jack': 3.7,
    'Molly': 2.5,
}

# Built from a dict, the Series uses the dict keys as its index labels
# instead of the default 0, 1, 2 positions.
gpaSeries = pd.Series(gpas)
gpaSeries

Alice    4.0
Jack     3.7
Molly    2.5
dtype: float64

In [13]:
# The index attribute holds the labels (here, the keys of the source dict).
gpaSeries.index

Index(['Alice', 'Jack', 'Molly'], dtype='object')

In [15]:
# Each (first name, last name) tuple is stored as one object element.
students = [
    ("Alice", "Brown"),
    ("Jack", "White"),
    ("Molly", "Green"),
]
pd.Series(students)

0    (Alice, Brown)
1     (Jack, White)
2    (Molly, Green)
dtype: object

In [16]:
# Values and index labels can be supplied separately; they are paired up
# by position.
pd.Series(
    ['Physics', 'Chemistry', 'English'],
    index=['Alice', 'Jack', 'Molly'],
)

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [17]:
student_scores = {
    'Alice': 'Physics',
    'Jack': 'Chemistry',
    'Molly': 'English',
}

# An explicit index decides which dict entries are kept: Jack is left out
# because he is not listed, and Sam is included with NaN because the dict has
# no entry for him.
s = pd.Series(student_scores, index=['Alice', 'Molly', 'Sam'])
s

Alice    Physics
Molly    English
Sam          NaN
dtype: object

In [18]:
# Four students, labelled by name; used for the lookup examples that follow.
student_classes = {
    'Alice': 'Physics',
    'Jack': 'Chemistry',
    'Molly': 'English',
    'Sam': 'History',
}

s = pd.Series(student_classes)
s

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [24]:
# iloc selects by integer position: position 3 (the fourth entry) is 'History'.
s.iloc[3]

'History'

In [22]:
# loc selects by index label: the value stored under 'Molly' is 'English'.
s.loc['Molly']

'English'

In [25]:
# Position-based lookup. The original `s[3]` relied on [] falling back to
# positions when the index holds strings; that implicit fallback is
# deprecated in pandas 2.x, so the position is requested explicitly.
s.iloc[3]

'History'

In [26]:
# Plain [] with a string key is treated as a label lookup, giving the same
# result as s.loc['Molly'].
s['Molly']

'English'

In [28]:
# Integer dict keys become integer index labels.
codes = {
    99: 'Physics',
    100: 'Chemistry',
    101: 'English',
    102: 'History',
}
s = pd.Series(codes)

# With an integer index, [] looks the key up as a label (an implicit .loc),
# not as a position: 100 is the label of 'Chemistry'.
s[100]


'Chemistry'

In [None]:
# Seed NumPy's global generator so that Restart & Run All reproduces the
# same grades (and therefore the same outputs in the cells below).
np.random.seed(42)

# 10,000 random integers from 0 to 999 (randint's upper bound is exclusive).
grades = pd.Series(np.random.randint(0, 1000, 10000))
grades.head()  # preview the first 5 values

In [38]:
%%timeit -n 100
# Baseline for the timing comparison: add the values up one at a time in a
# Python loop, then divide by the count to get the mean.
total = 0
for grade in grades:
    total += grade

total / len(grades)

2.65 µs ± 811 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [39]:
%%timeit -n 100
# Same mean as the loop version, but the summation is delegated to np.sum
# instead of iterating element by element in Python.
total = np.sum(grades)
total / len(grades)

47.5 µs ± 14.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [40]:
# Broadcasting - uses vectorization
# The scalar is added to every element in one vectorized operation.
# The Series is named `grades`; `numbers` is still the plain Python list
# [1, 2, None] on a fresh kernel, so `numbers += 2000` raises a TypeError.
grades += 2000  # updates all values
grades.head()

0    2674
1    2788
2    2123
3    2485
4    2945
dtype: int32

In [44]:
# SIGNIFICANTLY slower than using broadcasting: every element is read and
# written back individually from Python.
# Series.items() replaces iteritems(), which was removed in pandas 2.0, and
# the loop runs over `grades` (the Series) rather than the list `numbers`.
for label, value in grades.items():
    # OLD way: grades.set_value(label, value + 3000)
    grades.at[label] = value + 3000  # .iat is the position-based equivalent

grades

0       5674
1       5788
2       5123
3       5485
4       5945
        ... 
9995    5691
9996    5288
9997    5353
9998    5489
9999    5632
Length: 10000, dtype: int32

In [46]:
s = pd.Series([1, 2, 3])

# Assigning through .loc to a label that does not exist yet appends a new
# entry, so the index ends up holding both integers and a string.
s.loc['History'] = 102
s

0            1
1            2
2            3
History    102
dtype: int64

In [47]:
# One class per student, labelled by student name.
student_classes = pd.Series(
    {
        'Alice': 'Physics',
        'Jack': 'Chemistry',
        'Molly': 'English',
        'Sam': 'History',
    }
)
student_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [49]:
# Index labels do not have to be unique: all three entries are labelled
# 'Kelly'.
kelly_classes = pd.Series(
    ['Philosophy', 'Arts', 'Math'],
    index=['Kelly', 'Kelly', 'Kelly'],
)
kelly_classes

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [51]:
# Merge the two series together into a new series.
# pd.concat replaces Series.append, which was removed in pandas 2.0. It
# returns a new Series and leaves both inputs unchanged; index labels are
# kept as they are, so the duplicate 'Kelly' labels are preserved.
all_student_classes = pd.concat([student_classes, kelly_classes])
all_student_classes

Alice       Physics
Jack      Chemistry
Molly       English
Sam         History
Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [52]:
# 'Kelly' appears three times in the index, so .loc returns a Series with
# all matching entries rather than a single value.
all_student_classes.loc['Kelly']

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

## Data Frames

In [56]:
# One Series per record; the dict keys become the column names.
grade1 = pd.Series({'Name': 'Alice', 'Class': 'Physics', 'Score': 85})
grade2 = pd.Series({'Name': 'Jack', 'Class': 'Chemistry', 'Score': 82})
grade3 = pd.Series({'Name': 'Helen', 'Class': 'Biology', 'Score': 90})

# Each Series becomes one row. Row labels may repeat ('school1' is used twice).
df = pd.DataFrame(
    [grade1, grade2, grade3],
    index=['school1', 'school2', 'school1'],
)
df

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


In [76]:
# The same table built from a list of dicts: one dict per row, with the keys
# as column names.
students = [
    {'Name': 'Alice', 'Class': 'Physics', 'Score': 85},
    {'Name': 'Jack', 'Class': 'Chemistry', 'Score': 82},
    {'Name': 'Helen', 'Class': 'Biology', 'Score': 90},
]
df = pd.DataFrame(
    students,
    index=['school1', 'school2', 'school1'],
)
df

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


In [60]:
# 'school1' labels two rows, so .loc returns a DataFrame containing both.
df.loc['school1']

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school1,Helen,Biology,90


In [62]:
# 'school2' labels exactly one row, so .loc returns that row as a Series.
df.loc['school2']

Name          Jack
Class    Chemistry
Score           82
Name: school2, dtype: object

In [66]:
# A row label plus a column label selects a single cell value.
df.loc['school2', 'Score']

82

In [68]:
# Transpose: rows become columns and columns become rows. The result is a
# new DataFrame.
df.T

Unnamed: 0,school1,school2,school1.1
Name,Alice,Jack,Helen
Class,Physics,Chemistry,Biology
Score,85,82,90


In [74]:
# After transposing, the former column names are the row labels, so
# .loc['Class'] returns the Class values keyed by school.
df.T.loc['Class']

school1      Physics
school2    Chemistry
school1      Biology
Name: Class, dtype: object

In [78]:
# Plain [] indexing on a DataFrame selects by column label.
df['Class']

school1      Physics
school2    Chemistry
school1      Biology
Name: Class, dtype: object

In [80]:
# .loc selects by label: a single key picks rows, and an optional second key
# (df.loc[row, column]) picks columns.
df.loc['school1']

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school1,Helen,Biology,90


In [82]:
# A label that matches only one row comes back as a Series, not a DataFrame.
df.loc['school2']

Name          Jack
Class    Chemistry
Score           82
Name: school2, dtype: object

In [85]:
# Chained indexing (df.loc['school1']['Name']) builds an intermediate
# DataFrame first, which can be expensive. Passing the row and column labels
# to a single .loc call returns the same result without the intermediate.
df.loc['school1', 'Name']

school1    Alice
school1    Helen
Name: Name, dtype: object

In [87]:
# .loc[rows, columns]:
#   ':'  as the row selector keeps every row
#   [..] lists the columns to return
df.loc[:, ['Name', 'Score']]

Unnamed: 0,Name,Score
school1,Alice,85
school2,Jack,82
school1,Helen,90


In [88]:
# drop() returns a new DataFrame without any row labelled 'school1';
# df itself is not modified.
df.drop('school1')

Unnamed: 0,Name,Class,Score
school2,Jack,Chemistry,82


In [91]:
# Work on a copy so that df keeps its 'Name' column.
copy_df = df.copy()

# inplace=True modifies copy_df itself instead of returning a new DataFrame.
# axis='columns' (the same as axis=1) makes drop() look for 'Name' among the
# column labels rather than the row labels.
copy_df.drop('Name', axis='columns', inplace=True)
copy_df

Unnamed: 0,Class,Score
school1,Physics,85
school2,Chemistry,82
school1,Biology,90


In [94]:
# Add a column by assigning to a new column name. The scalar (here None) is
# repeated for every row, and df is modified in place.
df['ClassRanking'] = None
df

Unnamed: 0,Name,Class,Score,ClassRanking
school1,Alice,Physics,85,
school2,Jack,Chemistry,82,
school1,Helen,Biology,90,


### Indexing and Loading

In [96]:
# Load the dataset from a CSV file given by a relative path.
# This rebinds df, replacing the small grades DataFrame used above.
df = pd.read_csv('StudentsPerformance.csv')
df.head()  # preview the first 5 rows

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [103]:
# Shorten the verbose CSV headers. The result is bound to newDf; columns not
# listed in the mapping (gender, lunch) keep their names.
newDf = df.rename(
    columns={
        'race/ethnicity': 'group',
        'parental level of education': 'parents edu',
        'test preparation course': 'course',
        'math score': 'math',
        'reading score': 'reading',
        'writing score': 'writing',
    }
)
newDf

TypeError: Cannot specify both 'mapper' and any of 'index' or 'columns'

In [104]:
# Inspect the column labels after the rename.
newDf.columns

Index(['gender', 'group', 'parents edu', 'lunch', 'course', 'math', 'reading',
       'writing'],
      dtype='object')

In [107]:
# rename() also accepts a function: str.strip is applied to every column
# label, removing any leading or trailing whitespace.
newDf = newDf.rename(columns=str.strip)
newDf

Unnamed: 0,gender,group,parents edu,lunch,course,math,reading,writing
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [111]:
# Assigning a list to .columns replaces every label by position, directly on
# newDf; the list must supply one name per existing column.
newDf.columns = [
    'Gender',
    'Group',
    'Parent EDU',
    'Lunch',
    'Course',
    'Math',
    'Reading',
    'Writing',
]
newDf

Unnamed: 0,Gender,Group,Parent EDU,Lunch,Course,Math,Reading,Writing
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77
