In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, RandomizedSearchCV
from xgboost import XGBRegressor

from sklearn.linear_model import ElasticNetCV, ElasticNet

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('train.csv')
train.head(5)

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature73,feature74,feature75,feature76,feature77,feature78,feature79,feature80,feature81,critical_temperature
0,4,95.95015,87.22194,89.534122,81.423258,1.31759,1.301059,96.119,20.2858,36.105357,...,4.1,4.0536,3.944244,1.339718,1.245504,3,1.4,1.299038,1.135782,21.5
1,5,83.24476,59.2244,59.506081,36.126175,1.31151,1.37205,192.981,25.192187,67.233835,...,2.4,2.402249,2.259897,1.519383,1.365077,3,0.933333,1.2,1.019804,62.0
2,2,23.03585,26.64751,21.192566,25.010514,0.614232,0.435817,18.0583,18.24349,9.02915,...,5.1,4.242641,4.873514,0.636514,0.465999,3,3.3,1.5,1.374773,0.29
3,4,81.756699,79.833804,76.282833,75.275775,1.312596,1.149324,81.482,28.789976,32.890369,...,3.62,3.309751,3.413039,1.333736,1.019322,3,1.92,1.118034,1.198165,19.5
4,2,67.48525,69.699016,57.442709,59.574464,0.548263,0.521345,70.8405,39.638078,35.42025,...,6.0,6.0,6.0,0.693147,0.691193,0,0.375,0.0,0.0,5.36


In [3]:
test = pd.read_csv('test.csv')

In [4]:
formula_train = pd.read_csv('formula_train.csv')
formula_train.head(5)

Unnamed: 0,H,He,Li,Be,B,C,N,O,F,Ne,...,Pt,Au,Hg,Tl,Pb,Bi,Po,At,Rn,material
0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,Eu1Fe1.5Ru0.5As2
1,0.0,0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,2.0,0,0,0,Bi2Sr2Ca1Cu2O8
2,0.0,0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,N0.3S0.7
3,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,Ba1Fe1.9Co0.1As2
4,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,Rh17S15


In [5]:
formula_test = pd.read_csv('formula_test.csv')

In [6]:
data_train = pd.concat([train, formula_train.drop('material', axis=1)], axis=1)

In [7]:
data_test = pd.concat([test, formula_test.drop('material', axis=1)], axis=1)

In [8]:
Y = data_train.critical_temperature

In [9]:
full_data = pd.concat([data_train.drop('critical_temperature', axis=1), data_test], axis=0)
full_data.head(5)

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,Ir,Pt,Au,Hg,Tl,Pb,Bi,Po,At,Rn
0,4,95.95015,87.22194,89.534122,81.423258,1.31759,1.301059,96.119,20.2858,36.105357,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
1,5,83.24476,59.2244,59.506081,36.126175,1.31151,1.37205,192.981,25.192187,67.233835,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0,0,0
2,2,23.03585,26.64751,21.192566,25.010514,0.614232,0.435817,18.0583,18.24349,9.02915,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
3,4,81.756699,79.833804,76.282833,75.275775,1.312596,1.149324,81.482,28.789976,32.890369,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
4,2,67.48525,69.699016,57.442709,59.574464,0.548263,0.521345,70.8405,39.638078,35.42025,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0


Всего получилось 167 признаков. Исследуем их на корреляцию и удалим признаки с корреляцией больше 0,9 по модулю.

In [10]:
def correlation(dataset, threshold):
    col_corr = set() # Set of all the names of deleted columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if (corr_matrix.iloc[i, j] >= threshold) and (corr_matrix.columns[j] not in col_corr):
                colname = corr_matrix.columns[i] # getting the name of column
                col_corr.add(colname)
                if colname in dataset.columns:
                    del dataset[colname] # deleting the column from the dataset

    pd.DataFrame(dataset)
    return pd.DataFrame(dataset)

In [11]:
full_data = correlation(full_data, 0.9)

In [12]:
standardized_data = preprocessing.scale(full_data)
standardized_data = pd.DataFrame(standardized_data)

Разделим выборки для обучения и предсказания

In [13]:
x_train = standardized_data[0:17000]
x_test = standardized_data[17000:]

In [14]:
Y

0        21.500
1        62.000
2         0.290
3        19.500
4         5.360
          ...  
16995    35.000
16996    89.000
16997     0.584
16998    45.000
16999    13.500
Name: critical_temperature, Length: 17000, dtype: float64

Построение модели

In [15]:
X_train, X_val, y_train, y_val = train_test_split(x_train, Y,test_size = 0.33, random_state = 42)

1. Линейная регрессия с регуляризацией ElasticNet

In [16]:
regr = ElasticNet(random_state=42,max_iter=1e7,alpha =0.5,l1_ratio =0.7 )
regr.fit(X_train, y_train)

ElasticNet(alpha=0.5, l1_ratio=0.7, max_iter=10000000.0, random_state=42)

In [17]:
test_pre2 = regr.predict(X_val)
train_pre2 = regr.predict(X_train)

In [18]:
print('R2 on train =' , r2_score(y_train, train_pre2))
print('R2 on test =' , r2_score(y_val, test_pre2))

R2 on train = 0.707810810016673
R2 on test = 0.7001087091503203


2. XGBRegressor

In [19]:
model = XGBRegressor()

In [20]:
params = {
        'min_child_weight': [1, 3, 6],
        'subsample': [0.5, 0.7, 1],
        'colsample_bytree': [0.5, 0.7, 1],
        'max_depth': [ 6, 8, 10],
        'n_estimators': [ 100, 150, 200],
        'learning_rate': [ 0.001, 0.01, 0.1, 0.2, 0.3],
        'colsample_bylevel' : [0.5, 0.7, 1],
        'colsample_bytree': [0.7, 1],
        'gamma':[0,1,5,10]
        }

In [21]:
regr = RandomizedSearchCV(model, params, n_jobs=-1, cv=5, verbose=3, refit=True, 
                         n_iter=10)
regr

RandomizedSearchCV(cv=5, estimator=XGBRegressor(), n_jobs=-1,
                   param_distributions={'colsample_bylevel': [0.5, 0.7, 1],
                                        'colsample_bytree': [0.7, 1],
                                        'gamma': [0, 1, 5, 10],
                                        'learning_rate': [0.001, 0.01, 0.1, 0.2,
                                                          0.3],
                                        'max_depth': [6, 8, 10],
                                        'min_child_weight': [1, 3, 6],
                                        'n_estimators': [100, 150, 200],
                                        'subsample': [0.5, 0.7, 1]},
                   verbose=3)

In [22]:
regr.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   49.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.9min finished




RandomizedSearchCV(cv=5, estimator=XGBRegressor(), n_jobs=-1,
                   param_distributions={'colsample_bylevel': [0.5, 0.7, 1],
                                        'colsample_bytree': [0.7, 1],
                                        'gamma': [0, 1, 5, 10],
                                        'learning_rate': [0.001, 0.01, 0.1, 0.2,
                                                          0.3],
                                        'max_depth': [6, 8, 10],
                                        'min_child_weight': [1, 3, 6],
                                        'n_estimators': [100, 150, 200],
                                        'subsample': [0.5, 0.7, 1]},
                   verbose=3)

In [23]:
best_regr = regr.best_estimator_
best_regr

XGBRegressor(colsample_bylevel=0.7, colsample_bytree=0.7, max_depth=10,
             min_child_weight=3)

In [24]:
y_pred_train = best_regr.predict(X_train)
y_pred = best_regr.predict(X_val)

In [25]:
print('R2 on train =' , r2_score(y_train, y_pred_train))
print('R2 on test =' , r2_score(y_val, y_pred))

R2 on train = 0.9779573391148921
R2 on test = 0.9174564190047825


Получение итоговых результатов в нужном формате.

In [26]:
result = best_regr.predict(x_test)

In [27]:
res = pd.DataFrame(result, index =None)

In [28]:
res

Unnamed: 0,0
0,41.551167
1,3.298966
2,15.678526
3,86.005020
4,20.372606
...,...
4258,2.059228
4259,84.879326
4260,33.915958
4261,27.532667


In [None]:
res.to_csv('answer.csv',index=False)