## Instalación de pptx para presentaciones

In [1]:
# Instalación de pptx
!pip install python-pptx



In [2]:
# Instalación de dataframe_image
# Para pasar un dataframe a una imagen

!pip install dataframe-image



## Librerías

In [3]:
# Librerías típicas 
import numpy as np
import pandas as pd
import os

# Matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Seaborn
import seaborn as sns

# Plotly
import chart_studio.plotly as py
import cufflinks as cf
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot # para que funcione plotly en jupyter notebook
#init_notebook_mode(connected=True)
init_notebook_mode(connected=True)
cf.go_offline()

# pptx
from pptx import Presentation
from pptx.util import Inches
from pptx.chart.data import CategoryChartData
from pptx.enum.chart import XL_CHART_TYPE

# dataframe image
import dataframe_image as dfi

## Importación de datos

El contexto de los datos utilizados puede revisarse en [Kaggle](https://www.kaggle.com/datasets/spscientist/students-performance-in-exams/data).

Se tiene información de las notas de los estudiantes en varias materias.


In [4]:
# Load the students-performance dataset and preview its first rows
df = pd.read_csv(filepath_or_buffer='../Datasets/StudentsPerformance.csv')
df.head(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
# Print the column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [6]:
# Descriptive statistics of the numeric (score) columns
df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [7]:
# Show the distinct values of the categorical columns used for grouping below
for column in ('gender', 'race/ethnicity', 'parental level of education'):
    print(df[column].unique())

['female' 'male']
['group B' 'group C' 'group A' 'group D' 'group E']
["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']


## Modificaciones y agrupaciones

* Definimos objetivos y preparamos los datos, haciendo las agrupaciones correspondientes.

#### Exportación a imágenes
 * La exportación del DataFrame como imagen se realiza con la librería dataframe_image. La documentación se puede encontrar en el siguiente [link](https://pypi.org/project/dataframe-image/).

In [95]:
# Goal: mean score per subject by gender, by race/ethnicity and by parental
# level of education. That takes three separate groupings, kept here as
# GroupBy objects and aggregated in the following cells.
gr_gender, gr_race, gr_education = (
    df.groupby(key)
    for key in ("gender", "race/ethnicity", "parental level of education")
)

In [110]:
# Mean score per subject for each gender.
# The index is reset so the group key becomes a regular column, which makes
# the table easier to export and plot later.
gender_dict = {"math score": "mean",
               "reading score": "mean",
               "writing score": "mean"}
df_gender = gr_gender.agg(gender_dict).reset_index()

# The exports below write into Output/; create it so a fresh checkout does not fail
os.makedirs('Output', exist_ok=True)
df_gender.to_csv('Output/score_mean_gender.csv', index=False)  # export to a .csv file

# Styled version (background gradient highlights the larger values),
# exported as a PNG image and displayed as the cell output
df_gender_styled = df_gender.style.background_gradient()
dfi.export(df_gender_styled, "Output/df_gender.png")
df_gender_styled

Unnamed: 0,gender,math score,reading score,writing score
0,female,63.633205,72.608108,72.467181
1,male,68.728216,65.473029,63.311203


In [111]:
# Mean score per subject for each race/ethnicity group.
race_dict = {"math score": "mean",
             "reading score": "mean",
             "writing score": "mean"}
# Aggregate with this cell's own spec (race_dict) so the cell does not depend
# on gender_dict having been defined by another cell.
df_race = gr_race.agg(race_dict).reset_index()  # group key back as a column
df_race.to_csv('Output/score_mean_race.csv', index=False)  # export to a .csv file

# Styled version (background gradient highlights the larger values),
# exported as a PNG image and displayed as the cell output
df_race_styled = df_race.style.background_gradient()
dfi.export(df_race_styled, "Output/df_race.png")
df_race_styled


Unnamed: 0,race/ethnicity,math score,reading score,writing score
0,group A,61.629213,64.674157,62.674157
1,group B,63.452632,67.352632,65.6
2,group C,64.46395,69.103448,67.827586
3,group D,67.362595,70.030534,70.145038
4,group E,73.821429,73.028571,71.407143


In [112]:
# Mean score per subject for each parental level of education.
education_dict = {"math score": "mean",
                  "reading score": "mean",
                  "writing score": "mean"}
# Aggregate with this cell's own spec (education_dict) so the cell does not
# depend on gender_dict having been defined by another cell.
df_education = gr_education.agg(education_dict).reset_index()  # group key back as a column
df_education.to_csv('Output/score_mean_education.csv', index=False)  # export to a .csv file

# Styled version (background gradient highlights the larger values),
# exported as a PNG image and displayed as the cell output
df_education_styled = df_education.style.background_gradient()
dfi.export(df_education_styled, "Output/df_education.png")
df_education_styled


Unnamed: 0,parental level of education,math score,reading score,writing score
0,associate's degree,67.882883,70.927928,69.896396
1,bachelor's degree,69.389831,73.0,73.381356
2,high school,62.137755,64.704082,62.44898
3,master's degree,69.745763,75.372881,75.677966
4,some college,67.128319,69.460177,68.840708
5,some high school,63.497207,66.938547,64.888268


In [113]:
## Export the three summary DataFrames into a single Excel workbook, one sheet each

sheets = {"Gender": df_gender, "Race": df_race, "Education": df_education}

with pd.ExcelWriter('Output/Means_table.xlsx') as excel_file:
    for sheet_name, frame in sheets.items():
        frame.to_excel(excel_file, sheet_name=sheet_name, index=False, header=True)

Entonces ya tengo tres dataframes que me permiten ver cómo varía la media de scores por materia, agrupando por género, raza y educación de los padres, por separado.

Estas tablas las podemos exportar a excel o a csv.

El dataframe original puede utilizarse para hacer gráficos de barras. Sin embargo, por su formato, es mejor reestructurarlo: poner el valor del score en una sola columna y crear otra columna que indique la materia (subject).

In [10]:
# List the current column names before renaming the score columns
df.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

In [13]:
# Reload the raw data and shorten the three score column names so they read
# well as values of the "subject" column created by the melt that follows.
# Renaming through an explicit mapping (instead of overwriting every column
# name by position) stays correct if the CSV column order changes, and
# errors="raise" fails loudly if an expected score column is missing.
# NOTE: this rebinds `df`; cells that use the original "... score" column
# names must run before this one.
df = pd.read_csv('../Datasets/StudentsPerformance.csv').rename(
    columns={"math score": "math",
             "reading score": "reading",
             "writing score": "writing"},
    errors="raise",
)
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math,reading,writing
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [16]:
# Reshape from wide to long: one row per (student, subject), with the subject
# name in "subject" and its grade in "score". The "lunch" column is not listed
# in id_vars, so it is dropped from the long frame.
id_columns = ["gender", "race/ethnicity", "parental level of education", "test preparation course"]
subject_columns = ["math", "reading", "writing"]

df_alt = df.melt(id_vars=id_columns,
                 value_vars=subject_columns,
                 var_name='subject',
                 value_name="score")
df_alt.tail()


Unnamed: 0,gender,race/ethnicity,parental level of education,test preparation course,subject,score
2995,female,group E,master's degree,completed,writing,95
2996,male,group C,high school,none,writing,55
2997,female,group C,high school,completed,writing,65
2998,female,group D,some college,completed,writing,77
2999,female,group D,some college,none,writing,86


In [62]:
# Mean score (over all subjects) for every combination of parental education,
# race/ethnicity and gender, with the group keys returned as regular columns.
gr_education_alt = df_alt.groupby(
    ["parental level of education", "race/ethnicity", "gender"]
)
education_dict_alt = {"score": "mean"}
df_education_alt = gr_education_alt.agg(education_dict_alt).reset_index()
df_education_alt.head()

Unnamed: 0,parental level of education,race/ethnicity,gender,score
0,associate's degree,group A,female,65.277778
1,associate's degree,group A,male,62.833333
2,associate's degree,group B,female,68.84058
3,associate's degree,group B,male,66.87037
4,associate's degree,group C,female,70.785185


In [122]:
# Grouped bar chart of the mean score by parental education, coloured by gender.
# NOTE(review): df_education_alt has one row per (education, race, gender), so
# several rows share each (education, gender) pair; confirm how plotly draws
# those duplicate bars in 'group' mode. Aggregating over race first may be
# what is intended.
fig = px.bar(
    df_education_alt,
    x='parental level of education',
    y='score',
    color='gender',
    barmode='group',
)

# Save an interactive HTML copy and a static PNG, then display the figure
fig.write_html('Output/histogram_education.html')
fig.write_image('Output/histogram_education.png')
fig

## Exportación a un archivo .pptx

Más información sobre esta librería puede encontrarse en este [link](https://python-pptx.readthedocs.io/en/latest/).

Un código rápido puede encontrarse en el siguiente [link](https://www.geeksforgeeks.org/creating-and-updating-powerpoint-presentations-in-python-using-python-pptx/).

In [134]:
## Build the PowerPoint presentation
# Slides are appended to `pr` one at a time and the deck is saved at the end.

# Layouts available in the template loaded by Presentation(), in order:
# 0 Title (presentation title slide)
# 1 Title and Content
# 2 Section Header (sometimes called Segue)
# 3 Two Content (side by side bullet textboxes)
# 4 Comparison (same but additional title for each side by side content box)
# 5 Title Only
# 6 Blank
# 7 Content with Caption
# 8 Picture with Caption
TITLE_LAYOUT = 0
TITLE_AND_CONTENT_LAYOUT = 1
TITLE_ONLY_LAYOUT = 5


def _add_title_only_slide(presentation, title):
    """Append a slide using the "Title Only" layout and set its title text.

    Returns the newly created slide.
    """
    slide = presentation.slides.add_slide(presentation.slide_layouts[TITLE_ONLY_LAYOUT])
    slide.shapes.title.text = title
    return slide


def _place_picture(slide, path, left_in, top_in, width_in=None, height_in=None):
    """Add the image stored at `path` to `slide`.

    Offsets and sizes are given in inches. A size left as None is forwarded
    to add_picture as None instead of being converted to a length.
    Returns the picture shape created by add_picture.
    """
    width = None if width_in is None else Inches(width_in)
    height = None if height_in is None else Inches(height_in)
    return slide.shapes.add_picture(path, Inches(left_in), Inches(top_in),
                                    width=width, height=height)


pr = Presentation()

##### SLIDE 0 ##### Title
slide1 = pr.slides.add_slide(pr.slide_layouts[TITLE_LAYOUT])
slide1.shapes.title.text = 'Notas de alumnos de bachillerato'
slide1.placeholders[1].text = 'Comparación de datos de género, raza y nivel educativo de los padres'

##### SLIDE 1 ##### Title and content
slide2 = pr.slides.add_slide(pr.slide_layouts[TITLE_AND_CONTENT_LAYOUT])
slide2.shapes.title.text = "Contenido"
# Bullet list: one first-level entry followed by three second-level entries
bullet_body = slide2.placeholders[1]
bullet_body.text = "Estadísticas por categoría:"
for label in ("Género", "Raza", "Nivel educativo de los padres"):
    paragraph = bullet_body.text_frame.add_paragraph()
    paragraph.text = label
    paragraph.level = 1  # second level of the list

##### SLIDE 2 ##### Title only (and image)
slide3 = _add_title_only_slide(pr, "Pandas")
_place_picture(slide3, "Imagenes/pandas.png", 3, 3)

##### SLIDE 3 ##### Title only (and Pandas DataFrames exported as images)
slide4 = _add_title_only_slide(
    pr, "Pandas DataFrame: media de calificación en exámenes, por género y raza.")
_place_picture(slide4, "Output/df_gender.png", 1, 2)
_place_picture(slide4, "Output/df_race.png", 1, 4)

##### SLIDE 4 ##### Title only (and Pandas DataFrame exported as image)
slide5 = _add_title_only_slide(
    pr, "Pandas DataFrame: media de calificación en exámenes, por nivel educativo de padres.")
_place_picture(slide5, "Output/df_education.png", 1, 2)

##### SLIDE 5 ##### Title only (and Plotly bar chart)
slide6 = _add_title_only_slide(pr, "Calificaciones por nivel educativo y género.")
# Width and height are both set explicitly, so the picture is scaled to 9 x 6 inches
_place_picture(slide6, "Output/histogram_education.png", 1, 1, width_in=9, height_in=6)

## Save the presentation
pr.save('Output/presentation.pptx')