# Support Vector Regression (SVR)

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

### Carga de datos

In [21]:
# Read the training table from disk and preview its first rows.
train_csv_path = 'data/Finaltrain.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,text_length,word_count,number_count,punctuation_count,stopword_count
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \nThe Third Wave ex...,346,61,0,3,21
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \nThe Third Wave ex...,1225,203,3,29,79
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \nThe Third Wave ex...,345,60,0,9,24
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background \nThe Third Wave ex...,451,76,0,10,33
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background \nThe Third Wave ex...,145,27,0,2,10


In [22]:
# Predictors: the precomputed count/length columns of each summary.
feature_columns = [
    'text_length',
    'word_count',
    'number_count',
    'punctuation_count',
    'stopword_count',
]
X = df[feature_columns]

# Two regression targets, each modelled separately below.
y_content, y_wording = df['content'], df['wording']


### División en conjuntos de entrenamiento y prueba


In [23]:
# Split the features and BOTH targets in a single call so every array shares
# exactly the same 80/20 row partition. Previously the features were split
# twice and X_train/X_test were overwritten by the second call; alignment of the
# two targets relied on repeating identical arguments.
(
    X_train,
    X_test,
    y_content_train,
    y_content_test,
    y_wording_train,
    y_wording_test,
) = train_test_split(X, y_content, y_wording, test_size=0.2, random_state=42)

In [24]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Implementación de grid search

In [25]:
# Hyperparameter grid shared by both SVR searches:
# 4 values of C x 5 values of gamma x 1 kernel = 20 candidates.
param_grid = dict(
    C=[1, 10, 100, 1000],
    gamma=['scale', 'auto', 0.1, 0.01, 0.001],
    kernel=['rbf'],
)

### Entrenamiento para content

In [26]:
# Grid-search the SVR hyperparameters for the content target:
# 3-fold cross-validation, scored by negative MSE, using all available cores.
svr_content = SVR()
grid_search_content = GridSearchCV(
    estimator=svr_content,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
grid_search_content.fit(X_train_scaled, y_content_train)

### Predicción para content

In [27]:
# Take the best estimator found by the content grid search and measure its
# mean squared error on the held-out test split.
best_svr_content = grid_search_content.best_estimator_
y_content_pred = best_svr_content.predict(X_test_scaled)

mse_content = mean_squared_error(y_true=y_content_test, y_pred=y_content_pred)
print(f"MSE for content score: {mse_content}")

MSE for content score: 0.26390815822258995


### Entrenamiento para wording

In [28]:
# Grid-search the SVR hyperparameters for the wording target using 5-fold
# cross-validation scored by negative MSE.
# n_jobs=-1 spreads the 20 candidates x 5 folds over all available cores, as
# the content search does; it only affects wall-clock time, not the fitted model.
# NOTE(review): the content search uses cv=3 while this one uses cv=5 — confirm
# whether the difference is intentional.
svr_wording = SVR()
grid_search_wording = GridSearchCV(
    svr_wording,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search_wording.fit(X_train_scaled, y_wording_train)

# Estimator selected by the search, used for the test-set evaluation.
best_svr_wording = grid_search_wording.best_estimator_

### Predicción para wording

In [29]:
# Predict wording scores for the held-out test split and report the mean
# squared error against the true values.
y_wording_pred = best_svr_wording.predict(X_test_scaled)
mse_wording = mean_squared_error(y_true=y_wording_test, y_pred=y_wording_pred)

print(f"MSE for wording score: {mse_wording}")

MSE for wording score: 0.5934307074322386


In [30]:
# Report each tuned model's score() on the held-out test split.
r2_content = best_svr_content.score(X_test_scaled, y_content_test)
r2_wording = best_svr_wording.score(X_test_scaled, y_wording_test)

print(f"R^2 for content score: {r2_content}")
print(f"R^2 for wording score: {r2_wording}")

R^2 for content score: 0.7562938685265517
R^2 for wording score: 0.4168551077203454
