In [88]:
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures


### Attribute Information:

- RMSD-Size of the residue.
- F1 - Total surface area.
- F2 - Non polar exposed area.
- F3 - Fractional area of exposed non polar residue.
- F4 - Fractional area of exposed non polar part of residue.
- F5 - Molecular mass weighted exposed area.
- F6 - Average deviation from standard exposed area of residue.
- F7 - Euclidian distance.
- F8 - Secondary structure penalty.
- F9 - Spacial Distribution constraints (N,K Value)

In [89]:
columns = ["RMSD","F1","F2","F3","F4","F5","F6","F7","F8","F9"]
df = pd.read_csv("CASP.csv")

In [90]:
df.describe()

Unnamed: 0,RMSD,F1,F2,F3,F4,F5,F6,F7,F8,F9
count,45730.0,45730.0,45730.0,45730.0,45730.0,45730.0,45730.0,45730.0,45730.0,45730.0
mean,7.748528,9871.596995,3017.367175,0.302392,103.492433,1368299.0,145.638061,3989.75599,69.975071,34.523664
std,6.118312,4058.138034,1464.324663,0.062886,55.424985,564036.7,69.99923,1993.574575,56.493443,5.979755
min,0.0,2392.05,403.5,0.0925,10.3101,319490.2,31.9704,0.0,0.0,15.228
25%,2.305,6936.68,1979.045,0.25874,63.5639,953591.2,94.7575,3165.3225,31.0,30.424725
50%,5.03,8898.805,2668.155,0.30015,87.7408,1237219.0,126.176,3840.17,54.0,35.2993
75%,13.379,12126.15,3786.41,0.34289,133.64675,1690920.0,181.4685,4644.1925,91.0,38.8708
max,20.999,40034.9,15312.0,0.57769,369.317,5472011.0,598.408,105948.17,350.0,55.3009


In [91]:
features = columns[1:]
target = 'RMSD'

In [92]:
input = df[features]
output = df[target]

In [93]:
seed = (-1)*80538738812075974**3 + 80435758145817515**3 + 12602123297335631**3
input_train, input_test,output_train, output_test = train_test_split(input,output, test_size=0.3, random_state=seed)

In [94]:
regr = linear_model.LinearRegression()

In [95]:
regr.fit(input_train, output_train)

In [96]:
output_pred = regr.predict(input_test)

Analizando el dataset como viene sin tener en cuenta EDA se obtienen los siguientes resultados.

In [97]:
# The coefficients
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(output_test, output_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(output_test, output_pred))

Coefficients: 
 [ 1.67523149e-03  1.40033970e-03  1.82525088e+01 -1.08062674e-01
 -4.70064715e-06 -2.39016151e-02 -1.64317884e-04  1.52408263e-02
 -1.05161145e-01]
Mean squared error: 27.10
Coefficient of determination: 0.28


### A continuacion se busca mejorar la performance del predictor.

### Normalizando los valores del dataset

In [104]:
df_rmsd= df['RMSD']
df = df.drop(columns='RMSD')
df = (df-df.mean())/df.std()
df = pd.concat([df,df_rmsd],axis=1)
input = df[features]
output = df[target]
input_train, input_test,output_train, output_test = train_test_split(input,output, test_size=0.3, random_state=seed)
regr = linear_model.LinearRegression()
regr.fit(input_train, output_train)
output_pred = regr.predict(input_test)
# The coefficients
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(output_test, output_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(output_test, output_pred))

Coefficients: 
 [ 3.90549468  2.16078031  1.10580568 -5.93348126 -1.54787225 -0.32617782
  0.88654474 -0.58636628]
Mean squared error: 27.10
Coefficient of determination: 0.28


In [99]:
# df = (df-df.mean())/df.std()
# input = df[features]
# output = df[target]
# input_train, input_test,output_train, output_test = train_test_split(input,output, test_size=0.3, random_state=seed)
# regr = linear_model.LinearRegression()
# regr.fit(input_train, output_train)
# output_pred = regr.predict(input_test)
# # The coefficients
# print("Coefficients: \n", regr.coef_)
# # The mean squared error
# print("Mean squared error: %.2f" % mean_squared_error(output_test, output_pred))
# # The coefficient of determination: 1 is perfect prediction
# print("Coefficient of determination: %.2f" % r2_score(output_test, output_pred))

se observa que al normalizar, la metrica MSE no cambia

### Elimino el feature F5 debido a que en el EDA se aprecia que tiene una fuerte correlacion con F1 y probablemente no aporte informacion.

In [100]:
bad_col = ["F5"]
df = df.drop(bad_col, axis=1) #Elimino las conflictivas...
for col in bad_col:
    columns.remove(col)
features = columns[1:]
target = 'RMSD'
input = df[features]
output = df[target]
input_train, input_test,output_train, output_test = train_test_split(input,output, test_size=0.3, random_state=seed)
regr = linear_model.LinearRegression()
regr.fit(input_train, output_train)
output_pred = regr.predict(input_test)
# The coefficients
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(output_test, output_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(output_test, output_pred))

Coefficients: 
 [ 9.62385863e-04  1.47561559e-03  1.75843367e+01 -1.07054270e-01
 -2.21127039e-02 -1.63614557e-04  1.56928785e-02 -9.80585721e-02]
Mean squared error: 27.10
Coefficient of determination: 0.28


Efectivamente, no mejoro

### Aplico Feature Engineering
Busco aplicar features polinomiales de distinto orden para encontrar correlaciones no lineales entre las variables

### orden 2

In [101]:
poly = PolynomialFeatures(2,include_bias=False)
input = df[features]
output = df[target]
input = poly.fit_transform(input,)
input_train, input_test,output_train, output_test = train_test_split(input,output, test_size=0.3, random_state=seed)
regr = linear_model.LinearRegression()
regr.fit(input_train, output_train)
output_pred = regr.predict(input_test)
# The coefficients
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(output_test, output_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(output_test, output_pred))

Coefficients: 
 [ 8.80958146e-03  7.67103708e-01  2.24988850e+02 -9.98018177e-03
 -3.33016206e-01 -4.40913157e-04  5.11937644e-02  1.49806601e+00
  2.60034523e-07  1.55452679e-07 -7.84693542e-01 -1.46517463e-05
 -2.92846955e-05  3.16353063e-08 -6.53703302e-06 -1.20487157e-04
 -1.73838759e-07  1.40397916e-02  2.39122161e-05 -2.41920826e-05
  9.61479911e-08  6.79204982e-06  1.57652137e-04 -7.16840166e+01
 -3.61537564e-01  4.52374063e-01  1.39166866e-03 -1.29110786e-01
 -4.07639468e+00  1.26258920e-05  4.54203747e-04  5.38430596e-06
  3.62195278e-04 -1.51423972e-03  1.06909803e-03 -7.37302068e-06
  3.13306750e-04  5.05687228e-03  4.53336831e-09 -2.70389034e-06
 -6.40272452e-06 -2.87960763e-04  7.18269249e-04 -3.55433547e-03]
Mean squared error: 24.22
Coefficient of determination: 0.35


### orden 3

In [102]:
poly = PolynomialFeatures(3,include_bias=False)
input = df[features]
output = df[target]
input = poly.fit_transform(input,)
input_train, input_test,output_train, output_test = train_test_split(input,output, test_size=0.3, random_state=seed)
regr = linear_model.LinearRegression()
regr.fit(input_train, output_train)
output_pred = regr.predict(input_test)
# The coefficients
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(output_test, output_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(output_test, output_pred))

Coefficients: 
 [-2.60702730e-02 -8.96981017e-02  1.49142740e-01  1.13974534e+00
 -1.45680372e+00 -2.20714445e-02  2.85814569e+00 -2.48104788e+01
  6.48718940e-06  2.26526058e-04  1.64682874e-01 -3.39529451e-04
 -2.30880702e-04  1.83329384e-06 -2.55333979e-04  4.16894688e-04
 -1.59214668e-03 -1.12145129e-01 -7.96314726e-02  4.35777740e-02
  2.57185123e-03 -1.43295157e-03 -1.60524640e-01  5.31031844e-02
 -1.05624866e+01  2.07093084e+00  1.07845140e-01 -8.07585936e+00
 -2.28030378e+00  9.20300441e-03 -6.16045210e-04 -1.80068795e-04
  9.35492718e-04  3.97845387e-02  7.89656907e-03 -6.28210296e-06
  4.72531892e-03  3.98737670e-02  5.40934075e-08  4.32033188e-05
 -9.04517766e-05 -5.42883715e-03 -6.58516441e-02  5.60128105e-01
 -2.81008328e-10  1.23889832e-09 -2.54015581e-04  1.93044052e-08
  1.41687080e-08 -9.88285009e-11  1.56969326e-09 -4.39382930e-09
 -1.35412481e-09  1.62617098e-03 -6.38056941e-08 -7.14010708e-08
  3.93159283e-10 -1.27467281e-09  4.29014246e-08  7.65033273e-04
  8.15674

Se concluye que realizando el feature polinomial de orden 2 mejora el MSE, pero a partir de orden 3 se comienza a tener problema de overfitting