In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the students-performance CSV into a DataFrame.
# NOTE(review): this absolute path only resolves where the file has been placed
# under /content (looks like a Colab session — confirm); consider making the
# data directory configurable.
csv_path = '/content/StudentsPerformance.csv'
df = pd.read_csv(csv_path)

In [3]:
# confirm that the loaded object is a pandas DataFrame
type(df)

pandas.core.frame.DataFrame

In [4]:
# first 5 rows
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
# last 5 rows
df.tail()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77
999,female,group D,some college,free/reduced,none,77,86,86


In [7]:
# number of rows and columns, as a (rows, columns) tuple
df.shape

(1000, 8)

In [8]:
# column labels of the DataFrame
df.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

In [9]:
# check for duplicated rows: count the rows that repeat an earlier row
df.duplicated().sum()

0

In [10]:
# concise summary: index range, per-column non-null counts and dtypes, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [13]:
# check for missing values: count of NaN entries per column
df.isna().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

In [14]:
# statistical summary of the numeric columns
df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [15]:
# statistical summary of every column, including the categorical ones
# (categorical columns get count/unique/top/freq; numeric ones get mean/std/quantiles)
df.describe(include = 'all')

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
count,1000,1000,1000,1000,1000,1000.0,1000.0,1000.0
unique,2,5,6,2,2,,,
top,female,group C,some college,standard,none,,,
freq,518,319,226,645,642,,,
mean,,,,,,66.089,69.169,68.054
std,,,,,,15.16308,14.600192,15.195657
min,,,,,,0.0,17.0,10.0
25%,,,,,,57.0,59.0,57.75
50%,,,,,,66.0,70.0,69.0
75%,,,,,,77.0,79.0,79.0


In [16]:
# number of distinct values in each column
df.nunique()

gender                          2
race/ethnicity                  5
parental level of education     6
lunch                           2
test preparation course         2
math score                     81
reading score                  72
writing score                  77
dtype: int64

In [17]:
# distinct values of the 'parental level of education' column
df['parental level of education'].unique()

array(["bachelor's degree", 'some college', "master's degree",
       "associate's degree", 'high school', 'some high school'],
      dtype=object)

In [18]:
# frequency of each gender
df['gender'].value_counts()

female    518
male      482
Name: gender, dtype: int64

In [19]:
# exam-score columns, reused below for sorting and aggregation
provas = [
    'math score',
    'reading score',
    'writing score',
]

In [None]:
# sort by math score only (ascending) and renumber the index;
# the result is only displayed here — df itself is not reassigned
df.sort_values(by='math score').reset_index(drop=True)

In [24]:
# sort the dataset by the three exam scores, highest first,
# then replace the old index with a fresh 0..n-1 range
df = (
    df
    .sort_values(by=provas, ascending=False)
    .reset_index(drop=True)
)

In [25]:
# display the sorted DataFrame (long frames are rendered truncated, with an
# ellipsis row between the first and last rows)
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group E,bachelor's degree,standard,none,100,100,100
1,male,group E,bachelor's degree,standard,completed,100,100,100
2,female,group E,associate's degree,standard,none,100,100,100
3,male,group E,associate's degree,free/reduced,completed,100,100,93
4,male,group D,some college,standard,completed,100,97,99
...,...,...,...,...,...,...,...,...
995,female,group C,some college,free/reduced,none,22,39,33
996,female,group B,some college,standard,none,19,38,32
997,female,group B,some high school,free/reduced,none,18,32,28
998,female,group B,high school,free/reduced,none,8,24,23


In [26]:
# new column holding each student's average over the three exam scores (row-wise mean)
# note: the label 'mean' coincides with the DataFrame.mean method, so this
# column has to be read as df['mean'], not as the attribute df.mean
df['mean'] = df[provas].mean(axis='columns')

In [27]:
# the new column is placed at the end (rightmost position)
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,mean
0,female,group E,bachelor's degree,standard,none,100,100,100,100.0
1,male,group E,bachelor's degree,standard,completed,100,100,100,100.0
2,female,group E,associate's degree,standard,none,100,100,100,100.0
3,male,group E,associate's degree,free/reduced,completed,100,100,93,97.666667
4,male,group D,some college,standard,completed,100,97,99,98.666667


In [34]:
# query: male students who did not take the test preparation course and
# scored at least 70 in math; backticks quote column names that contain spaces
df.query('(gender == "male") & (`test preparation course` == "none") & (`math score` >= 70)')

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,mean
15,male,group C,associate's degree,standard,none,97,93,91,93.666667
17,male,group E,some college,standard,none,97,87,82,88.666667
23,male,group D,master's degree,standard,none,95,81,84,86.666667
26,male,group E,some high school,standard,none,94,88,78,86.666667
30,male,group E,high school,standard,none,94,73,71,79.333333
...,...,...,...,...,...,...,...,...,...
401,male,group C,high school,standard,none,70,70,65,68.333333
403,male,group B,high school,standard,none,70,65,60,65.000000
406,male,group D,some college,free/reduced,none,70,63,58,63.666667
407,male,group C,high school,standard,none,70,56,51,59.000000


In [36]:
# boolean-mask filter: male students, no test preparation course, math score >= 70
is_male = df['gender'] == 'male'
no_prep_course = df['test preparation course'] == 'none'
math_at_least_70 = df['math score'] >= 70
df[is_male & no_prep_course & math_at_least_70]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,mean
15,male,group C,associate's degree,standard,none,97,93,91,93.666667
17,male,group E,some college,standard,none,97,87,82,88.666667
23,male,group D,master's degree,standard,none,95,81,84,86.666667
26,male,group E,some high school,standard,none,94,88,78,86.666667
30,male,group E,high school,standard,none,94,73,71,79.333333
...,...,...,...,...,...,...,...,...,...
401,male,group C,high school,standard,none,70,70,65,68.333333
403,male,group B,high school,standard,none,70,65,60,65.000000
406,male,group D,some college,free/reduced,none,70,63,58,63.666667
407,male,group C,high school,standard,none,70,56,51,59.000000


In [37]:
# label-based row selection with .loc: male students, no test preparation
# course, math score >= 70
row_mask = (
    df['gender'].eq('male')
    & df['test preparation course'].eq('none')
    & df['math score'].ge(70)
)
df.loc[row_mask]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,mean
15,male,group C,associate's degree,standard,none,97,93,91,93.666667
17,male,group E,some college,standard,none,97,87,82,88.666667
23,male,group D,master's degree,standard,none,95,81,84,86.666667
26,male,group E,some high school,standard,none,94,88,78,86.666667
30,male,group E,high school,standard,none,94,73,71,79.333333
...,...,...,...,...,...,...,...,...,...
401,male,group C,high school,standard,none,70,70,65,68.333333
403,male,group B,high school,standard,none,70,65,60,65.000000
406,male,group D,some college,free/reduced,none,70,63,58,63.666667
407,male,group C,high school,standard,none,70,56,51,59.000000


In [38]:
# Grouping: group the rows by gender and compute the mean and median of each
# exam score; .T transposes the result so that each gender becomes a column.
# The aggregations are given by name ('mean', 'median') instead of as NumPy
# callables: recent pandas versions emit a FutureWarning when np.mean/np.median
# are passed to agg, while the string form keeps the same result and labels.
df.groupby(by='gender')[provas].agg(['mean', 'median']).T

Unnamed: 0,gender,female,male
math score,mean,63.633205,68.728216
math score,median,65.0,69.0
reading score,mean,72.608108,65.473029
reading score,median,73.0,66.0
writing score,mean,72.467181,63.311203
writing score,median,74.0,64.0
