# **Cargar librerías y dataset**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler

# Mount Google Drive so the dataset stored there is reachable from Colab
from google.colab import drive

drive.mount('/content/drive')

# Read the student performance CSV from the mounted Drive folder
file_path = '/content/drive/MyDrive/Colab Notebooks/Talento Tech/Proyecto/01_Regresión_nota_de_examen/student_performance_factors.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows of the dataframe
display(df.head(n=5))

Mounted at /content/drive


Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


# **Análisis Exploratorio de Datos (EDA)**

In [6]:
# Print a summary of the dataframe: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6529 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

In [7]:
# Count the missing values in each column
df.isna().sum()

Unnamed: 0,0
Hours_Studied,0
Attendance,0
Parental_Involvement,0
Access_to_Resources,0
Extracurricular_Activities,0
Sleep_Hours,0
Previous_Scores,0
Motivation_Level,0
Internet_Access,0
Tutoring_Sessions,0


In [13]:
# Fill missing values in these categorical columns with each column's
# most frequent value (its mode)
for column in ('Teacher_Quality', 'Parental_Education_Level', 'Distance_from_Home'):
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

In [14]:
# Count fully duplicated rows
df.duplicated().sum()

np.int64(0)

# **Procesamiento de datos**

In [15]:
# Column groups are captured BEFORE any encoding, so the label-encoded
# categorical columns (which become integers) are not picked up by the scaler.
numeric_features = df.select_dtypes(include=['int64']).columns
categorical_features = df.select_dtypes(include=['object']).columns

# Encode each categorical column as integer codes.
# A separate fitted encoder is kept per column in `label_encoders`, so every
# mapping can later be inspected (`.classes_`), inverted, or applied to new
# data. Refitting a single shared encoder would keep only the last column's
# mapping.
# NOTE(review): LabelEncoder numbers classes in sorted label order, so levels
# such as Low/Medium/High do not get codes in their natural order. Consider an
# explicit ordinal mapping for those columns.
label_encoders = {}
label_encoder = LabelEncoder()  # stays bound even if there are no object columns
for col in categorical_features:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

# Rescale the originally-numeric columns to the [0, 1] range.
# NOTE(review): `numeric_features` includes the target `Exam_Score`, so the
# target is scaled as well and any error metric computed from it is on the
# 0-1 scale rather than in exam points. The scaler is also fitted on the full
# dataset before the train/test split. Confirm both are intended.
minmax_scaler = MinMaxScaler(feature_range=(0, 1))
df[numeric_features] = minmax_scaler.fit_transform(df[numeric_features])

# Show the fully encoded and scaled frame (only a cell's last expression is displayed)
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,0.511628,0.6,1,0,0,0.5,0.46,1,1,0.0,1,2,1,2,0.5,0,1,2,1,0.26087
1,0.418605,0.1,1,2,0,0.666667,0.18,1,1,0.25,2,2,1,0,0.666667,0,0,1,0,0.130435
2,0.534884,0.95,2,2,1,0.5,0.82,2,1,0.25,2,2,1,1,0.666667,0,2,2,1,0.413043
3,0.651163,0.725,1,2,1,0.666667,0.96,2,1,0.125,2,2,1,0,0.666667,0,1,1,1,0.347826
4,0.418605,0.8,2,2,1,0.333333,0.3,2,1,0.375,2,0,1,1,0.666667,0,0,2,0,0.326087


In [16]:
# Separate the predictors (X) from the target column (y)
target_name = 'Exam_Score'
y = df[target_name]
X = df.drop(columns=[target_name])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# **Entrenamiento del modelo - Probar otra construcción**

In [17]:
from sklearn.linear_model import LinearRegression

# Create the linear regression estimator with its default settings
model_lr = LinearRegression()

In [18]:
# Fit the model on the training split
model_lr.fit(X_train, y_train)

In [19]:
# Predict the target for the held-out test features
y_pred = model_lr.predict(X_test)

In [20]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the test-set predictions with mean squared error and R-squared
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}", f"R-squared: {r2}", sep="\n")

Mean Squared Error: 0.0020790527122736074
R-squared: 0.6887688020307756
