# Regresión lineal múltiple

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Cargar los archivos CSV proporcionados
train_data = pd.read_csv('data/Finaltrain.csv')

# Seleccionar características relevantes para la regresión (excluyendo columnas que no aportan al análisis)
features = ['text_length', 'word_count', 'number_count', 'punctuation_count', 'stopword_count']
X = train_data[features]

# Variables objetivo
y_content = train_data['content']
y_wording = train_data['wording']

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_val, y_content_train, y_content_val, y_wording_train, y_wording_val = train_test_split(
    X, y_content, y_wording, test_size=0.2, random_state=42
)

# Crear y entrenar modelos de regresión lineal múltiple para ambas variables objetivo
model_content = LinearRegression()
model_wording = LinearRegression()

model_content.fit(X_train, y_content_train)
model_wording.fit(X_train, y_wording_train)

# Realizar predicciones en el conjunto de validación
y_content_pred = model_content.predict(X_val)
y_wording_pred = model_wording.predict(X_val)

# Evaluar los modelos
content_mse = mean_squared_error(y_content_val, y_content_pred)
content_r2 = r2_score(y_content_val, y_content_pred)

wording_mse = mean_squared_error(y_wording_val, y_wording_pred)
wording_r2 = r2_score(y_wording_val, y_wording_pred)

print("\nModelo para 'content':")
print(f"MSE: {content_mse}")
print(f"R^2: {content_r2}")

print("\nModelo para 'wording':")
print(f"MSE: {wording_mse}")
print(f"R^2: {wording_r2}")


Modelo para 'content':
MSE: 0.36674286901536673
R^2: 0.6613310992158707

Modelo para 'wording':
MSE: 0.6960540949495193
R^2: 0.31601047074818256


In [6]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Normalizar las características
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Agregar características polinómicas para mejorar el rendimiento del modelo
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(X_scaled)

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_val, y_content_train, y_content_val, y_wording_train, y_wording_val = train_test_split(
    X_poly, y_content, y_wording, test_size=0.2, random_state=42
)

# Crear y entrenar modelos de regresión lineal múltiple para ambas variables objetivo
model_content = LinearRegression()
model_wording = LinearRegression()

model_content.fit(X_train, y_content_train)
model_wording.fit(X_train, y_wording_train)

# Realizar predicciones en el conjunto de validación
y_content_pred = model_content.predict(X_val)
y_wording_pred = model_wording.predict(X_val)

# Evaluar los modelos
content_mse = mean_squared_error(y_content_val, y_content_pred)
content_r2 = r2_score(y_content_val, y_content_pred)

wording_mse = mean_squared_error(y_wording_val, y_wording_pred)
wording_r2 = r2_score(y_wording_val, y_wording_pred)

# Resultados de la evaluación
print("\nModelo para 'content':")
print(f"MSE: {content_mse}")
print(f"R^2: {content_r2}")

print("\nModelo para 'wording':")
print(f"MSE: {wording_mse}")
print(f"R^2: {wording_r2}")


Modelo para 'content':
MSE: 0.2731908887374702
R^2: 0.7477217260110334

Modelo para 'wording':
MSE: 0.6326484177824329
R^2: 0.378317150059585


In [13]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV

# Crear un pipeline para la regresión lineal con características polinómicas
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('poly_features', PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)),
    ('regressor', LinearRegression())
])

# Validación cruzada para el modelo de 'content'
cross_val_scores_content = cross_val_score(pipeline, X_train, y_content_train, cv=5, scoring='r2')
print("Validación cruzada (R^2) para 'content':", cross_val_scores_content)
print("Media de validación cruzada (R^2) para 'content':", cross_val_scores_content.mean())

# Validación cruzada para el modelo de 'wording'
cross_val_scores_wording = cross_val_score(pipeline, X_train, y_wording_train, cv=5, scoring='r2')
print("\nValidación cruzada (R^2) para 'wording':", cross_val_scores_wording)
print("Media de validación cruzada (R^2) para 'wording':", cross_val_scores_wording.mean())

# Búsqueda de hiperparámetros utilizando GridSearchCV
param_grid = {
    'poly_features__degree': [1, 2, 3, 4, 5],
    'poly_features__interaction_only': [True, False],
    'regressor__fit_intercept': [True, False],
    'regressor__copy_X': [True, False]
}

grid_search_content = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search_content.fit(X_train, y_content_train)
print("\nMejores hiperparámetros para 'content':", grid_search_content.best_params_)

# Ajustar el pipeline con los mejores parámetros
best_pipeline_content = grid_search_content.best_estimator_
best_pipeline_content.fit(X_train, y_content_train)

# Realizar predicciones y evaluar el modelo ajustado para 'content'
y_content_pred = best_pipeline_content.predict(X_val)
content_mse = mean_squared_error(y_content_val, y_content_pred)
content_r2 = r2_score(y_content_val, y_content_pred)

print("\nModelo ajustado para 'content':")
print(f"MSE: {content_mse}")
print(f"R^2: {content_r2}")

# Búsqueda de hiperparámetros para 'wording'
grid_search_wording = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search_wording.fit(X_train, y_wording_train)
print("\nMejores hiperparámetros para 'wording':", grid_search_wording.best_params_)

# Ajustar el pipeline con los mejores parámetros
best_pipeline_wording = grid_search_wording.best_estimator_
best_pipeline_wording.fit(X_train, y_wording_train)

# Realizar predicciones y evaluar el modelo ajustado para 'wording'
y_wording_pred = best_pipeline_wording.predict(X_val)
wording_mse = mean_squared_error(y_wording_val, y_wording_pred)
wording_r2 = r2_score(y_wording_val, y_wording_pred)

print("\nModelo ajustado para 'wording':")
print(f"MSE: {wording_mse}")
print(f"R^2: {wording_r2}")

Validación cruzada (R^2) para 'content': [0.73006456 0.7298759  0.73185989 0.72717356 0.72896152]
Media de validación cruzada (R^2) para 'content': 0.7295870840530473

Validación cruzada (R^2) para 'wording': [0.38871869 0.3759046  0.38241524 0.39263802 0.39125291]
Media de validación cruzada (R^2) para 'wording': 0.38618589116079277

Mejores hiperparámetros para 'content': {'poly_features__degree': 4, 'poly_features__interaction_only': True, 'regressor__copy_X': True, 'regressor__fit_intercept': True}

Modelo ajustado para 'content':
MSE: 0.2642607868463142
R^2: 0.7559682334332168

Mejores hiperparámetros para 'wording': {'poly_features__degree': 2, 'poly_features__interaction_only': False, 'regressor__copy_X': True, 'regressor__fit_intercept': True}

Modelo ajustado para 'wording':
MSE: 0.6326484177824341
R^2: 0.3783171500595838
