In [2]:
import pandas as pd
import numpy as np

## Crating a dataframe

In [3]:
# Array with numpy
data = np.array([[1,4],[2,6],[3,6]])

In [4]:
# Dataframe with pandas
df = pd.DataFrame(data, index=['row1','row2','row3'],columns=['col1','col2'])

In [5]:
df

Unnamed: 0,col1,col2
row1,1,4
row2,2,6
row3,3,6


## Creating a dataframe from a dictionary

In [6]:
states = ['California', 'Texas', 'Florida']
population = [39613493,29730311,21944577]

In [7]:
dictStates = {'States':states,'Population':population}

In [8]:
df_population = pd.DataFrame(dictStates)

In [9]:
df_population

Unnamed: 0,States,Population
0,California,39613493
1,Texas,29730311
2,Florida,21944577


## Creating a dataframe from  a csv file

In [10]:
df_exams = pd.read_csv('StudentsPerformance.csv')

In [11]:
df_exams.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [12]:
df_exams

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [13]:
df_exams.tail(10)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
990,male,group E,high school,free/reduced,completed,86,81,75
991,female,group B,some high school,standard,completed,65,82,78
992,female,group D,associate's degree,free/reduced,none,55,76,76
993,female,group D,bachelor's degree,free/reduced,none,62,72,74
994,male,group A,high school,standard,none,63,63,62
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77
999,female,group D,some college,free/reduced,none,77,86,86


## Attributes

In [14]:
#Access to the shape attribute
df_exams.shape

(1000, 8)

In [15]:
#Index atribute
df_exams.index

RangeIndex(start=0, stop=1000, step=1)

In [16]:
#Column
df_exams.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

In [17]:
#Type of each column
df_exams.dtypes

gender                         object
race/ethnicity                 object
parental level of education    object
lunch                          object
test preparation course        object
math score                      int64
reading score                   int64
writing score                   int64
dtype: object

## Methods

In [18]:
#Info
df_exams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [19]:
# Describing 
df_exams.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


## Functions

In [20]:
#Length
len(df_exams)

1000

In [21]:
# Highest index of the dataframe
max(df_exams.index)

999

## Selecting one column

In [22]:
# Select a column with []
df_exams['gender']

0      female
1      female
2      female
3        male
4        male
        ...  
995    female
996      male
997    female
998    female
999    female
Name: gender, Length: 1000, dtype: object

## Add a new Column

##### Assign()

In [23]:
# Create random number for our new score columns
score_one = np.random.randint(1,100, size=1000)

In [24]:
# Create a series using random number
serie_one = pd.Series(score_one, index=np.arange(0,1000))

In [26]:
#Add multiple columns
df_exams.assign(score_one=serie_one)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,score_one
0,female,group B,bachelor's degree,standard,none,72,72,74,58
1,female,group C,some college,standard,completed,69,90,88,33
2,female,group B,master's degree,standard,none,90,95,93,73
3,male,group A,associate's degree,free/reduced,none,47,57,44,66
4,male,group C,some college,standard,none,76,78,75,83
...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,47
996,male,group C,high school,free/reduced,none,62,55,55,92
997,female,group C,high school,free/reduced,completed,59,71,65,68
998,female,group D,some college,standard,completed,68,78,77,86


#### Insert()

In [27]:
#Insert a new column at a specific position
df_exams.insert(1, "test", serie_one)

In [28]:
df_exams

Unnamed: 0,gender,test,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,58,group B,bachelor's degree,standard,none,72,72,74
1,female,33,group C,some college,standard,completed,69,90,88
2,female,73,group B,master's degree,standard,none,90,95,93
3,male,66,group A,associate's degree,free/reduced,none,47,57,44
4,male,83,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...,...
995,female,47,group E,master's degree,standard,completed,88,99,95
996,male,92,group C,high school,free/reduced,none,62,55,55
997,female,68,group C,high school,free/reduced,completed,59,71,65
998,female,86,group D,some college,standard,completed,68,78,77


# Math Operations

### Columns

In [29]:
# Select a column and calculate total sum
df_exams['math score'].sum()

66089

In [31]:
# Count, mean, std, max and min
df_exams['math score'].count()
df_exams['math score'].mean()
df_exams['math score'].std()
df_exams['math score'].max()
df_exams['math score'].min()

0

In [32]:
# Describe()
df_exams.describe()

Unnamed: 0,test,math score,reading score,writing score
count,1000.0,1000.0,1000.0,1000.0
mean,51.289,66.089,69.169,68.054
std,28.16851,15.16308,14.600192,15.195657
min,1.0,0.0,17.0,10.0
25%,27.0,57.0,59.0,57.75
50%,52.0,66.0,70.0,69.0
75%,76.0,77.0,79.0,79.0
max,99.0,100.0,100.0,100.0


### Rows

In [33]:
# Sum in a row
df_exams['math score'] + df_exams['reading score'] + df_exams['writing score']

0      218
1      247
2      278
3      148
4      229
      ... 
995    282
996    172
997    195
998    223
999    249
Length: 1000, dtype: int64

# Value counts

In [34]:
#len function
len(df_exams['gender'])
#count method
df_exams['gender'].count()

1000

In [35]:
# Gender elements by category
df_exams['gender'].value_counts()

female    518
male      482
Name: gender, dtype: int64

In [36]:
# Relative frequency
df_exams['gender'].value_counts(normalize=True)

female    0.518
male      0.482
Name: gender, dtype: float64

# Sort a dataframe

In [39]:
# sort by one column
df_exams.sort_values('math score', ascending=False) #True as defalut

Unnamed: 0,gender,test,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
962,female,73,group E,associate's degree,standard,none,100,100,100
625,male,45,group D,some college,standard,completed,100,97,99
458,female,43,group E,bachelor's degree,standard,none,100,100,100
623,male,49,group A,some college,standard,completed,100,96,86
451,female,2,group E,some college,standard,none,100,92,97
...,...,...,...,...,...,...,...,...,...
145,female,60,group C,some college,free/reduced,none,22,39,33
787,female,52,group B,some college,standard,none,19,38,32
17,female,73,group B,some high school,free/reduced,none,18,32,28
980,female,56,group B,high school,free/reduced,none,8,24,23


# Create Index

In [49]:
import random

In [50]:
new_index = np.arange(0, 1000)

In [51]:
random.shuffle(new_index)

In [57]:
df_exams['new_index'] = new_index

In [58]:
df_exams.set_index('new_index')

Unnamed: 0_level_0,gender,test,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
735,female,58,group B,bachelor's degree,standard,none,72,72,74
929,female,33,group C,some college,standard,completed,69,90,88
887,female,73,group B,master's degree,standard,none,90,95,93
670,male,66,group A,associate's degree,free/reduced,none,47,57,44
11,male,83,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...,...
138,female,47,group E,master's degree,standard,completed,88,99,95
32,male,92,group C,high school,free/reduced,none,62,55,55
17,female,68,group C,high school,free/reduced,completed,59,71,65
565,female,86,group D,some college,standard,completed,68,78,77


In [59]:
# Rename columns
df_exams.rename(columns = {
   'maths score':'MS',
    'reading score': 'RS',
    'writing score': 'WS'
}, inplace=True)

In [60]:
df_exams

Unnamed: 0_level_0,gender,test,race/ethnicity,parental level of education,lunch,test preparation course,math score,RS,WS,new_index
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
735,female,58,group B,bachelor's degree,standard,none,72,72,74,735
929,female,33,group C,some college,standard,completed,69,90,88,929
887,female,73,group B,master's degree,standard,none,90,95,93,887
670,male,66,group A,associate's degree,free/reduced,none,47,57,44,670
11,male,83,group C,some college,standard,none,76,78,75,11
...,...,...,...,...,...,...,...,...,...,...
138,female,47,group E,master's degree,standard,completed,88,99,95,138
32,male,92,group C,high school,free/reduced,none,62,55,55,32
17,female,68,group C,high school,free/reduced,completed,59,71,65,17
565,female,86,group D,some college,standard,completed,68,78,77,565
