# Find an Optimal Model for Predicting the Critical Temperatures of Superconductors

<b> Load the necessary libraries </b>

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

<b> Read in the data from the superconduct folder </b>

In [2]:
# url path
url_path = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter07/Dataset/superconduct/train.csv'

In [3]:
df = pd.read_csv(url_path)
df.head()

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence,critical_temp
0,4,88.944468,57.862692,66.361592,36.116612,1.181795,1.062396,122.90607,31.794921,51.968828,...,2.257143,2.213364,2.219783,1.368922,1.066221,1,1.085714,0.433013,0.437059,29.0
1,5,92.729214,58.518416,73.132787,36.396602,1.449309,1.057755,122.90607,36.161939,47.094633,...,2.257143,1.888175,2.210679,1.557113,1.047221,2,1.128571,0.632456,0.468606,26.0
2,4,88.944468,57.885242,66.361592,36.122509,1.181795,0.97598,122.90607,35.741099,51.968828,...,2.271429,2.213364,2.232679,1.368922,1.029175,1,1.114286,0.433013,0.444697,19.0
3,4,88.944468,57.873967,66.361592,36.11956,1.181795,1.022291,122.90607,33.76801,51.968828,...,2.264286,2.213364,2.226222,1.368922,1.048834,1,1.1,0.433013,0.440952,22.0
4,4,88.944468,57.840143,66.361592,36.110716,1.181795,1.129224,122.90607,27.848743,51.968828,...,2.242857,2.213364,2.206963,1.368922,1.096052,1,1.057143,0.433013,0.428809,23.0


<b> Prepare the X and y variables </b>

In [4]:
X = df.drop(['critical_temp'], axis=1).values
y = df['critical_temp'].values

<b> Split the data into training and evaluation sets </b>

In [5]:
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, random_state=0)

<b> Create a baseline linear regression model </b>

In [6]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

<b> Print out the R2 score and MSE of the model </b>

In [7]:
print(f'lr R^2 score: {lr.score(X_eval, y_eval)}')

lr R^2 score: 0.7350976364618292


In [8]:
# make predictions
y_pred = lr.predict(X_eval)

In [9]:
print(f'lr MSE: {mean_squared_error(y_eval, y_pred)}')

lr MSE: 308.32127118918373


<b> Create a pipeline to engineer polynomial features and train a linear regression model </b>

In [10]:
steps = [
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures(interaction_only=True)),
    ('lr', LinearRegression())
    ]

In [11]:
lr_model_2 = Pipeline(steps)
lr_model_2.fit(X_train, y_train)

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('poly', PolynomialFeatures(interaction_only=True)),
                ('lr', LinearRegression())])

<b> Print out the R2 score and MSE </b>

In [12]:
print(f'lr_model_2 R^2 score: {lr_model_2.score(X_eval, y_eval)}')

lr_model_2 R^2 score: 0.6781960274985629


<b> Determine that this new model is overfitting </b>

The first model had a score of 0.73. You need a model with a higher score. The second model has a score of 0.67, which is significantly worse. The second model is overfitting.

<b> Create a pipeline to engineer polynomial features and train a ridge or lasso model </b>

In [13]:
steps = [
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures(interaction_only=True)),
    ('model', Lasso(alpha=0.001, max_iter=2000))
]

In [16]:
lasso_model = Pipeline(steps)
lasso_model.fit(X_train, y_train)

  positive)


Pipeline(steps=[('scaler', MinMaxScaler()),
                ('poly', PolynomialFeatures(interaction_only=True)),
                ('model', Lasso(alpha=0.001, max_iter=2000))])

<b> Print out the R2 score and MSE </b>

In [17]:
print(f'lr_model_2 R^2 score: {lasso_model.score(X_eval, y_eval)}')

lr_model_2 R^2 score: 0.8156621589070521


<b> Determine that this model is no longer overfitting. This is the model to put into production </b>

The score is now back up to 0.8157.