## Pandas Basics

In [6]:
import pandas as pd

## creating DataFrame

In [None]:
# list of lists, each list is a row
data = [["tom", 10], ["nick", 15], ["juli", 14]]
df = pd.DataFrame(data, columns=["Name", "Age"])
df

Unnamed: 0,Name,Age
0,tom,10
1,nick,15
2,juli,14


In [None]:
# with index
df = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    index=['a', 'b', 'c'], # row names
    columns=['x', 'y', 'z'])   # column names

print(df)

   x  y  z
a  1  2  3
b  4  5  6
c  7  8  9


In [None]:
# dictionary column name: data
data = {"Name": ["Tom", "Nick", "Krish", "Jack"], 
        "Age": [20, 21, 19, 18]
        }

# no index needed
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age
0,Tom,20
1,Nick,21
2,Krish,19
3,Jack,18


In [None]:
# column wise
df = pd.DataFrame(
{
 "a" : [1 ,4, 7],       # col a
 "b" : [2, 5, 8],       # col b
 "c" : [3, 6, 9],
 "d" : [10, 11, 12],
 },
index = ["1", "2", "3"])      # row index

print(df)

   a  b  c   d
1  1  2  3  10
2  4  5  6  11
3  7  8  9  12


In [7]:
df = pd.read_csv( "https://drive.google.com/uc?id=18xhPLKRtaCHsMKYpzb_G8JENj9HesiDC" )
df

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,grade_math_t1,grade_language_t1,grade_science_t1,grade_math_t2,grade_language_t2,grade_science_t2,treatment,date_of_birth
0,57,6,A,wvqgd@cunb.edu,1,1,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27
1,57,6,A,j0ihe@cunb.edu,1,2,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190,1,1997-06-24
2,57,6,A,wcjgk@cunb.edu,2,1,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253,1,1997-04-23
3,57,6,A,mzmqb@cunb.edu,1,1,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812,1,1997-02-24
4,57,6,A,s6n0y@cunb.edu,1,1,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504,1,1996-09-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,946,8,D,jj36e@cunb.edu,1,1,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960,0,1995-07-04
1506,946,8,D,ca1dg@cunb.edu,1,1,7.661931,8.360153,7.937165,8.179364,8.877586,8.454598,0,1995-08-23
1507,946,8,D,amdrx@cunb.edu,1,1,9.248758,8.534791,8.056641,8.773601,8.059634,7.581484,0,1994-12-15
1508,946,8,D,yn5ug@cunb.edu,1,2,5.689228,5.071503,5.338547,6.382995,5.765270,6.032315,0,1994-09-18


## Basic inspection 
`head(), tail(), shape, info(), describe(), dtypes`


In [None]:
# dataframe methods
df.head()
df.head(2)
df.tail()
df.info()
df.describe()

# dataframe attributes
df.shape
df.dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1510 entries, 0 to 1509
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   school_id          1510 non-null   int64  
 1   grade              1510 non-null   int64  
 2   class              1510 non-null   object 
 3   student_id         1510 non-null   object 
 4   sex                1510 non-null   int64  
 5   nationality        1510 non-null   int64  
 6   grade_math_t1      1510 non-null   float64
 7   grade_language_t1  1510 non-null   float64
 8   grade_science_t1   1510 non-null   float64
 9   grade_math_t2      1508 non-null   float64
 10  grade_language_t2  1507 non-null   float64
 11  grade_science_t2   1508 non-null   float64
 12  treatment          1510 non-null   int64  
 13  date_of_birth      1510 non-null   object 
dtypes: float64(6), int64(5), object(3)
memory usage: 165.3+ KB


school_id              int64
grade                  int64
class                 object
student_id            object
sex                    int64
nationality            int64
grade_math_t1        float64
grade_language_t1    float64
grade_science_t1     float64
grade_math_t2        float64
grade_language_t2    float64
grade_science_t2     float64
treatment              int64
date_of_birth         object
dtype: object

## Basic statistics 
`mean(), median(), std(), count(), min(), max()`


In [None]:
# series methods
df["grade_language_t1"].mean().round(2)
df["grade_math_t1"].median().round(2)
df["grade_math_t1"].std().round(2)

# works on whole df and not just a column
df.count()
df.min()
df.max()


school_id                       946
grade                             8
class                             D
student_id           zyjey@cunb.edu
sex                               2
nationality                       2
grade_math_t1                  10.0
grade_language_t1              10.0
grade_science_t1               10.0
grade_math_t2               99999.0
grade_language_t2           99999.0
grade_science_t2            99999.0
treatment                         1
date_of_birth            1997.03.09
dtype: object

## Reading/Writing data 
`read_csv(), to_csv(), read_excel(), to_json()`


### reading / importing data

In [None]:
pd.read_csv(filename) # Reads a CSV file
pd.read_table(filename) # Reads a delimited text file (like TSV)
pd.read_excel(filename) # Reads an Excel file
pd.read_sql(query, connection_object) # Reads a SQL table/database
pd.read_json(json_string) # Reads a JSON formatted string, URL or file
pd.read_html(url) # Parses an html, string or file and extracts tables to a list of df
pd.read_clipboard() # Reads the contents of your clipboard
pd.DataFrame(dict) # Reads from a dict; keys for columns names, values for data as lists

In [3]:
### writing / exporting data

In [None]:
df.to_csv(filename) # Writes to a CSV file
df.to_excel(filename) # Writes to an Excel file
df.to_sql(table_name, connection_object) # Writes to a SQL table
df.to_json(filename) # Writes to a file in JSON format
df.to_html(filename) # Writes to an HTML table
df.to_clipboard() # Writes to the clipboard

## Display options
`set_option(), option_context()`

In [None]:
# see default rows and columns displayed
pd.get_option("display.max_rows") # 60
pd.get_option("display.max_columns") # 20

In [13]:
# set default rows and columns
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 25)
pd.get_option("display.max_rows")
pd.get_option("display.max_columns")

25

In [15]:
# reset defaults
pd.reset_option("display.max_rows")
pd.reset_option("display.max_columns")
pd.get_option("display.max_columns")
pd.get_option("display.max_rows")

60

### temporary options 

The option_context() function allows you to set an option temporarily within a with statement. Once the context is exited, the option is automatically reverted to its previous value.

In [23]:
with pd.option_context("display.max_columns", 4):
    print(pd.get_option("display.max_columns"))
    print(df)


4
      school_id  grade  ... treatment date_of_birth
0            57      6  ...         1    1997-07-27
1            57      6  ...         1    1997-06-24
2            57      6  ...         1    1997-04-23
3            57      6  ...         1    1997-02-24
4            57      6  ...         1    1996-09-05
...         ...    ...  ...       ...           ...
1505        946      8  ...         0    1995-07-04
1506        946      8  ...         0    1995-08-23
1507        946      8  ...         0    1994-12-15
1508        946      8  ...         0    1994-09-18
1509        946      8  ...         0    1994-12-19

[1510 rows x 14 columns]
