### Characteristics

Manipulating data through the dataframe structure.

- A dataframe structures data as a table :
    - row (horizontal)
    - column (vertical).
 
 A DataFrame is a collection of one or more **Series** (its columns) that share an index.
 

In [3]:
## Import libraries

import pandas as pd
import numpy as np



In [4]:
# Seed NumPy's legacy global random-number generator
# so that the random draws made after this cell are repeatable.

SEED = 123
np.random.seed(SEED)

In [21]:
# Student names, kept in a list; used to create the dataframe.

students = [
    'Serena', 'Johanna', 'Suzette', 'Boyan',
    'Ada', 'Yassen', 'Thomas', 'Mario',
    'Alberta', 'Ricardo', 'Isaak', 'Alain',
]

In [23]:
# Generate a random score for each student in 3 subjects,
# using random integers from NumPy.
# One score is drawn per student, so each array lines up with `students`.

n_students = len(students)

# math
math_grades = np.random.randint(low=60, high=100, size=n_students)

# english
english_grades = np.random.randint(low=60, high=100, size=n_students)

# reading
reading_grades = np.random.randint(low=60, high=100, size=n_students)

In [9]:

# Tell pandas which values to use in this dataframe.

# Use a dictionary (key-value pairs) to specify the columns: each key is a column name (i.e., name, math, ...) and each value holds that column's data.

In [24]:
# Each dictionary key becomes a column name; each value supplies that column's data.
column_data = {
    'name': students,
    'math': math_grades,
    'english': english_grades,
    'reading': reading_grades,
}
df = pd.DataFrame(column_data)

In [15]:
type(df)

# type() is a built-in Python function ;
# here it tells us that df is a pandas DataFrame

pandas.core.frame.DataFrame

In [25]:
# print() shows the plain-text representation of the dataframe
print(df)

       name  math  english  reading
0    Serena    90       80       99
1   Johanna    66       72       94
2   Suzette    86       78       63
3     Boyan    76       77       71
4       Ada    66       61       63
5    Yassen    74       87       90
6    Thomas    99       82       66
7     Mario    71       63       69
8   Alberta    67       63       83
9   Ricardo    61       71       74
10    Isaak    97       81       98
11    Alain    85       85       79


In [26]:
# a bare expression on the last line of a cell is displayed as a formatted table
df

Unnamed: 0,name,math,english,reading
0,Serena,90,80,99
1,Johanna,66,72,94
2,Suzette,86,78,63
3,Boyan,76,77,71
4,Ada,66,61,63
5,Yassen,74,87,90
6,Thomas,99,82,66
7,Mario,71,63,69
8,Alberta,67,63,83
9,Ricardo,61,71,74


In [28]:
# Helpful for larger dataframes : the .info() method.

# Prints a summary of the dataframe : index range, column names,
# non-null counts, dtypes and memory usage.

df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     12 non-null     object
 1   math     12 non-null     int64 
 2   english  12 non-null     int64 
 3   reading  12 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 512.0+ bytes


In [29]:
# Summary statistics of the numerical columns
# (count, mean, std, min, quartiles, max) :

df.describe()

Unnamed: 0,math,english,reading
count,12.0,12.0,12.0
mean,78.166667,75.0,79.083333
std,12.882923,8.923921,13.460098
min,61.0,61.0,63.0
25%,66.75,69.0,68.25
50%,75.0,77.5,76.5
75%,87.0,81.25,91.0
max,99.0,87.0,99.0


## Dataframe Attributes

- dtypes --> data types (the data types present in the data frame)
- shape --> number of rows x number of columns
- columns --> gives column names. Possible to set the column names.
- index --> the row labels (e.g., 0, 1, 2, ...) shared by all of the dataframe's columns (Series). Usually autogenerated.

In [41]:
# View the data type of each column and the shape of the dataframe.
# The two values are displayed together as a tuple ; the shape here is (12, 4).

df.dtypes, df.shape

(name       object
 math        int64
 english     int64
 reading     int64
 dtype: object,
 (12, 4))

In [40]:
# The column names (an Index object) and the row index (a RangeIndex),
# displayed together as a tuple :

(
    df.columns,
    df.index,
)

(Index(['name', 'math', 'english', 'reading'], dtype='object'),
 RangeIndex(start=0, stop=12, step=1))

In [182]:
# Rename columns : capitalise every column name

# Right side : df.columns holds the column names ; the .str accessor applies
# the string method .title() to every name at once.
# Left side : assigning to df.columns replaces the column names of df.
df.columns = df.columns.str.title()
df

Unnamed: 0,Name,Math,English,Reading,Passing_Math,Passing_English
0,Serena,90,80,99,True,True
1,Johanna,66,72,94,False,True
2,Suzette,86,78,63,True,True
3,Boyan,76,77,71,False,True
4,Ada,66,61,63,False,False
5,Yassen,74,87,90,False,True
6,Thomas,99,82,66,True,True
7,Mario,71,63,69,False,False
8,Alberta,67,63,83,False,False
9,Ricardo,61,71,74,False,True


In [50]:
# Use column names to pull the desired info : selecting multiple columns
# Pass a list of the desired column names (strings) inside the indexing brackets
# columns : Name, Math

df [['Name', 'Math']]

# Must use [[]] here : the outer brackets are the indexing operator,
# the inner brackets are the list of column names.
    # a single name in [] returns a Series ; a list of names in [[]] returns a DataFrame

Unnamed: 0,Name,Math
0,Serena,90
1,Johanna,66
2,Suzette,86
3,Boyan,76
4,Ada,66
5,Yassen,74
6,Thomas,99
7,Mario,71
8,Alberta,67
9,Ricardo,61


In [55]:
# How to select one column as a SERIES : index with a single column name

math_scores = df['Math']
math_scores

# This is a Series ; a dataframe is made of such Series sharing one index

0     90
1     66
2     86
3     76
4     66
5     74
6     99
7     71
8     67
9     61
10    97
11    85
Name: Math, dtype: int64

In [56]:
# confirm that selecting with a single column name produced a Series
type(math_scores)

pandas.core.series.Series

In [59]:
# Select one column, reading scores, as a DATAFRAME :
# putting the column name in a list ([[...]]) returns a DataFrame instead of a Series

reading_scores = df[['Reading']]
reading_scores

Unnamed: 0,Reading
0,99
1,94
2,63
3,71
4,63
5,90
6,66
7,69
8,83
9,74


In [60]:
# confirm that selecting with a list of column names produced a DataFrame
type(reading_scores)

pandas.core.frame.DataFrame

In [71]:
# Store the desired column names in a list variable.
# Changing the list in one place changes which columns are displayed.

columns = ['Name', 'Math', 'Reading']

# Index with the variable 'columns' : same as df[['Name', 'Math', 'Reading']]

df[columns]

Unnamed: 0,Name,Math,Reading
0,Serena,90,99
1,Johanna,66,94
2,Suzette,86,63
3,Boyan,76,71
4,Ada,66,63
5,Yassen,74,90
6,Thomas,99,66
7,Mario,71,69
8,Alberta,67,83
9,Ricardo,61,74


In [70]:
# the column selector is a plain Python list
type(columns)

list

In [77]:
# Accessing an individual column with attribute (dot) notation

df.Math

# returns a Series, the same as df['Math']

0     90
1     66
2     86
3     76
4     66
5     74
6     99
7     71
8     67
9     61
10    97
11    85
Name: Math, dtype: int64

In [80]:
df['Math']

# a single column name in [] returns a Series
# use [[]] (a list of column names) for a dataframe

0     90
1     66
2     86
3     76
4     66
5     74
6     99
7     71
8     67
9     61
10    97
11    85
Name: Math, dtype: int64

In [87]:
# Accessing rows at the top / bottom of a dataframe
# I.e., accessing row subsets
# .head() returns the first rows ; the default is 5 rows

# This cell and the next two show : the first 5 rows, the final 3 rows, a random sample of 4 rows

df.head()

Unnamed: 0,Name,Math,English,Reading
0,Serena,90,80,99
1,Johanna,66,72,94
2,Suzette,86,78,63
3,Boyan,76,77,71
4,Ada,66,61,63


In [88]:
# the final 3 rows
df.tail(3)


Unnamed: 0,Name,Math,English,Reading
9,Ricardo,61,71,74
10,Isaak,97,81,98
11,Alain,85,85,79


In [89]:
# 4 randomly chosen rows
df.sample(4)

Unnamed: 0,Name,Math,English,Reading
2,Suzette,86,78,63
10,Isaak,97,81,98
7,Mario,71,63,69
4,Ada,66,61,63


## Using Boolean Values

In [91]:
# Comparing a column with a value returns a boolean Series :
# True where the condition holds, False otherwise
df.Math < 83

0     False
1      True
2     False
3      True
4      True
5      True
6     False
7      True
8      True
9      True
10    False
11    False
Name: Math, dtype: bool

In [92]:
# Find rows that match a condition :
# index the dataframe with a boolean Series to keep only the rows where it is True

df[df.Math < 80]

Unnamed: 0,Name,Math,English,Reading
1,Johanna,66,72,94
3,Boyan,76,77,71
4,Ada,66,61,63
5,Yassen,74,87,90
7,Mario,71,63,69
8,Alberta,67,63,83
9,Ricardo,61,71,74


In [106]:
# Dropping a column
# `columns` takes a single column name (as here) or a list of column names
df.drop(columns = 'Math')

Unnamed: 0,Name,English,Reading
0,Serena,80,99
1,Johanna,72,94
2,Suzette,78,63
3,Boyan,77,71
4,Ada,61,63
5,Yassen,87,90
6,Thomas,82,66
7,Mario,63,69
8,Alberta,63,83
9,Ricardo,71,74


In [111]:
# Drop several columns at once by passing a list of column names
no_me = df.drop(columns = ['Math', 'English'])
no_me

# Assigning the result to a variable is optional, but it lets us reference the altered dataframe at a later point.

Unnamed: 0,Name,Reading
0,Serena,99
1,Johanna,94
2,Suzette,63
3,Boyan,71
4,Ada,63
5,Yassen,90
6,Thomas,66
7,Mario,69
8,Alberta,83
9,Ricardo,74


In [105]:
# Does '.drop' alter the original dataframe?
# No; it returns an altered copy and leaves the original unchanged. Assign the result to a new variable to keep it.


In [131]:
# To RENAME columns, pair each original name with its new name
# in a dictionary ({original: new}) and pass it as `columns`.
# The renamed dataframe is stored in a new variable.

french_names = {
    'Name': 'Étudiants',
    'Math': "Les_Maths",
    'English': 'L\'Anglais',
    'Reading': 'La_Lecture',
}
en_francais = df.rename(columns=french_names)
en_francais

Unnamed: 0,Étudiants,Les_Maths,L'Anglais,La_Lecture
0,Serena,90,80,99
1,Johanna,66,72,94
2,Suzette,86,78,63
3,Boyan,76,77,71
4,Ada,66,61,63
5,Yassen,74,87,90
6,Thomas,99,82,66
7,Mario,71,63,69
8,Alberta,67,63,83
9,Ricardo,61,71,74


In [148]:
# Remove and rename columns AT THE SAME TIME, in one chained expression.
# Putting one method per line keeps the chain readable.

new = (
    df
    .drop(columns=['English', 'Reading', 'Math'])
    .rename(columns={'Name': 'Prénom'})
)
new



Unnamed: 0,Prénom
0,Serena
1,Johanna
2,Suzette
3,Boyan
4,Ada
5,Yassen
6,Thomas
7,Mario
8,Alberta
9,Ricardo


## Creating Columns



In [151]:
df.Math >= 79

# gives a Series of Boolean values : True where the math grade is at least 79

0      True
1     False
2      True
3     False
4     False
5     False
6      True
7     False
8     False
9     False
10     True
11     True
Name: Math, dtype: bool

In [160]:
# Store the Boolean result as a new column

# Assign the values to a column named Passing_Math

# First way : bracket assignment, dataframe['new_column'] = values

en_francais['Passing_Math'] = en_francais.Les_Maths >= 79
en_francais

Unnamed: 0,Étudiants,Les_Maths,L'Anglais,La_Lecture,Passing_Math
0,Serena,90,80,99,True
1,Johanna,66,72,94,False
2,Suzette,86,78,63,True
3,Boyan,76,77,71,False
4,Ada,66,61,63,False
5,Yassen,74,87,90,False
6,Thomas,99,82,66,True
7,Mario,71,63,69,False
8,Alberta,67,63,83,False
9,Ricardo,61,71,74,False


In [161]:
df['Passing_Math'] = df.Math >= 79
df

# Bracket assignment adds the column to df itself, so it is still there in later cells.

Unnamed: 0,Name,Math,English,Reading,Passing_Math
0,Serena,90,80,99,True
1,Johanna,66,72,94,False
2,Suzette,86,78,63,True
3,Boyan,76,77,71,False
4,Ada,66,61,63,False
5,Yassen,74,87,90,False
6,Thomas,99,82,66,True
7,Mario,71,63,69,False
8,Alberta,67,63,83,False
9,Ricardo,61,71,74,False


In [170]:
# Second way :

# Use .assign, passing the new column as new_column_name = values.

# .assign returns a NEW dataframe that includes the extra column ;
# it does not modify the dataframe it is called on.
# To keep the column, the result must be assigned to a variable.
# It is assigned back to df here because later cells display df
# with the Passing_English column.

df = df.assign(Passing_English = df.English >= 70)
df

Unnamed: 0,Name,Math,English,Reading,Passing_Math,Passing_English
0,Serena,90,80,99,True,True
1,Johanna,66,72,94,False,True
2,Suzette,86,78,63,True,True
3,Boyan,76,77,71,False,True
4,Ada,66,61,63,False,False
5,Yassen,74,87,90,False,True
6,Thomas,99,82,66,True,True
7,Mario,71,63,69,False,False
8,Alberta,67,63,83,False,False
9,Ricardo,61,71,74,False,True


In [171]:
# Sort the rows by the values in a column, given by its name

df.sort_values(by = 'English')

# Ascending order is the default



Unnamed: 0,Name,Math,English,Reading,Passing_Math,Passing_English
4,Ada,66,61,63,False,False
7,Mario,71,63,69,False,False
8,Alberta,67,63,83,False,False
9,Ricardo,61,71,74,False,True
1,Johanna,66,72,94,False,True
3,Boyan,76,77,71,False,True
2,Suzette,86,78,63,True,True
0,Serena,90,80,99,True,True
10,Isaak,97,81,98,True,True
6,Thomas,99,82,66,True,True


In [174]:
# Pass ascending = False to sort in descending order

df.sort_values(by = 'English', ascending = False)

Unnamed: 0,Name,Math,English,Reading,Passing_Math,Passing_English
5,Yassen,74,87,90,False,True
11,Alain,85,85,79,True,True
6,Thomas,99,82,66,True,True
10,Isaak,97,81,98,True,True
0,Serena,90,80,99,True,True
2,Suzette,86,78,63,True,True
3,Boyan,76,77,71,False,True
1,Johanna,66,72,94,False,True
9,Ricardo,61,71,74,False,True
7,Mario,71,63,69,False,False


In [181]:
# Multiple methods combined

# Name of the student with the lowest English grade that is greater than 86 :
# filter the rows, sort them by English (ascending), keep the first row, select Name.
(
    df[df.English > 86]
    .sort_values(by='English')
    .head(1)
    .Name
)

5    Yassen
Name: Name, dtype: object