In [21]:
import os
base_dir = os.getcwd()

data_csv_path = os.path.join(base_dir, 'results.csv')
pred_csv_path = os.path.join(base_dir, 'predictions.csv')

In [22]:
base_dir

'C:\\Users\\Eugene\\notebooks'

In [23]:
import pandas as pd
import numpy as np
import plotly 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from warnings import filterwarnings

filterwarnings('ignore')

In [24]:
df = pd.read_csv(data_csv_path, header=None)
df.columns = ['res', 'time', 'score', 'algo']
df.drop('res', axis=1, inplace=True)

### Vizualise data

In [25]:
import plotly.express as px

fig = px.scatter(df, x="time", y="score", color="algo")

fig.update_traces(marker=dict(size=14,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()

### Convert categorical data to numeric

In [26]:
df['algo'] = df['algo'].map({'minimax': 0, 'expectimax': 1})

### Split dataframe into train and test parts

In [27]:
train_df, test_df = train_test_split(df, test_size=5)

### Define features and target variable

In [28]:
y, X = df.pop('score'), df

### Create a simple pipeline

In [29]:
model = Pipeline([
    ('ss', StandardScaler()),
    ('lreg', LinearRegression())
])

### Implement repeated kfold validation

In [30]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
scores = cross_val_score(model, X, y, scoring='r2', cv=cv, n_jobs=-1)

In [31]:
print('R2: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

R2: 0.138 (0.866)


### Fit the model

In [32]:
model.fit(X, y)
model['lreg'].coef_, model['lreg'].intercept_

(array([ 196.76821745, -356.49415238]), 1207.8431372549019)

### Make predictions

In [33]:
test_df['score_predicted'] = model.predict(test_df[['time', 'algo']])

### Caclulate RMSE using predicted scores

In [34]:
np.sqrt(mean_squared_error(test_df['score'], test_df['score_predicted']))

537.7429006771956

### Write predictions to csv file

In [35]:
test_df.to_csv(pred_csv_path, index=False)