In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_absolute_error, r2_score
import altair as alt

# Enable the Altair renderer for Jupyter Notebook (or your environment)
alt.renderers.enable('notebook')  

# Ler os arquivos CSV em Dataframes Pandas
try:
    df_performance = pd.read_csv('/home/rhudson/Documentos/PUC/Projeto em ciência de dados I/ppl-cd-pcd-sist-int-2024-1-sleepresearch-2024-1/assets/data/Student_Performance.csv', on_bad_lines='warn')
    df_stress = pd.read_csv('/home/rhudson/Documentos/PUC/Projeto em ciência de dados I/ppl-cd-pcd-sist-int-2024-1-sleepresearch-2024-1/assets/data/Student Stress Factors.csv', on_bad_lines='warn')
except FileNotFoundError as e:
    print(f"Error loading CSV files: {e}")
    exit(1)  # Exit if files are not found

# Criar uma nova coluna `ID do Estudante` no `df_performance`, atribuindo um ID único inteiro para cada estudante (de 1 a 10000).
df_performance['ID do Estudante'] = range(1, len(df_performance) + 1)

# Randomiza 520 linhas do `df_performance` e armazená-las em `df_performance_sample`.
df_performance_sample = df_performance.sample(n=520, random_state=1)

# Combinar `df_performance_sample` com `df_stress` com base no índice.
try:
    df_combined = pd.merge(df_performance_sample, df_stress, left_index=True, right_index=True)
except KeyError as e:
    print(f"Error merging DataFrames: {e}")
    exit(1)  # Exit if there's a merge error

# Identify categorical columns
categorical_cols = df_combined.select_dtypes(include='object').columns

try:
    # One-Hot Encoding
    ohe = OneHotEncoder(drop='first')
    df_combined_encoded = pd.DataFrame(ohe.fit_transform(df_combined[categorical_cols]), columns=ohe.get_feature_names_out(categorical_cols))
    df_combined = pd.concat([df_combined.drop(categorical_cols, axis=1), df_combined_encoded], axis=1)
except ValueError as e:
    print(f"Error during OneHotEncoding: {e}")
    print(df_combined[categorical_cols].head())
    exit(1)  # Exit if there's an error during encoding

# Separar os recursos (X) e a variável alvo (y) do `df_combined`.
X = df_combined.drop('Performance Index', axis=1)
y = df_combined['Performance Index']

# Dividir os dados em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Normalizar os recursos usando MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Construir, compilar e treinar o modelo de rede neural
model = Sequential()
model.add(Dense(X.shape[1], input_dim=X.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_absolute_error'])
history = model.fit(X_train, y_train, epochs=100, validation_split=0.2)

# Prever no conjunto de teste e calcular métricas
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Absolute Error on Test Set: {mae:.2f}')
print(f'R-squared on Test Set: {r2:.2f}')


# Criar o gráfico de dispersão de valores reais vs. preditos
df_plot = pd.DataFrame({'Real Values': y_test, 'Predicted Values': y_pred.flatten()})
scatter_plot = alt.Chart(df_plot).mark_circle(size=60).encode(
    x=alt.X('Real Values', title='Valores Reais'),
    y=alt.Y('Predicted Values', title='Valores Preditos'),
    tooltip=['Real Values', 'Predicted Values']
).properties(
    title='Valores Reais vs. Valores Preditos'
).interactive()

# Adicionar a linha x=y ao gráfico
line = alt.Chart(pd.DataFrame({'x': [y.min(), y.max()], 'y': [y.min(), y.max()]})).mark_line().encode(
    x='x', y='y', color=alt.value('red')
)

# Combinar o gráfico de dispersão e a linha
chart = scatter_plot + line

# Display the chart
chart


ValueError: 
To use the 'notebook' renderer, you must install the vega package
and the associated Jupyter extension.
See https://altair-viz.github.io/getting_started/installation.html
for more information.
