In [1]:
import numpy as np
import pandas as pd
import sklearn

from sklearn import metrics, ensemble
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [None]:
# train test split

data = pd.read_csv("data/train.csv")

# Variant 1: split the whole frame, keeping features and target together.
# Stored under its own names so the variant-2 split below does not silently overwrite it.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=1)

# Variant 2 (the one the following cells use): separate the target first, then split
# features and labels in a single call so their rows stay aligned.
y_data = data["Survived"]
X_data = data.drop("Survived", axis=1)
# Fixed seed: without it the split, and every score computed from it, changes on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, random_state=1
)
X_train.head()

In [None]:
# pipelines

# Numeric columns: fill gaps with the column median, then standardise.
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

# Categorical columns: fill gaps with the most frequent value (otherwise a missing entry,
# if the column has any, is encoded as a category of its own), then one-hot encode.
# handle_unknown="ignore" makes transform() encode a category that was not seen during
# fit as all zeros instead of raising.
cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="most_frequent")),
        ('onehot_encoder', OneHotEncoder(handle_unknown="ignore")),
    ])

num_attribs = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
cat_attribs = ['Sex', 'Embarked']

# Route each column group through its own pipeline and concatenate the results.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs)
])

# Fit on the training split only; use full_pipeline.transform(...) for any other data.
data_prepared = full_pipeline.fit_transform(X_train)
feature_names = full_pipeline.get_feature_names_out()
feauture_names = feature_names  # old misspelled name, kept so existing references keep working
pd.DataFrame(data_prepared, columns=feature_names).head()

In [None]:
# label encoder

# Learn the label <-> integer mapping from a small list of city names.
cities = ["paris", "paris", "tokyo", "amsterdam"]
le = LabelEncoder().fit(cities)

print(le.classes_)  # the distinct labels the encoder learned

encoded = le.transform(["tokyo", "tokyo", "paris"])  # labels -> integer codes
print(encoded)

decoded = le.inverse_transform([2, 2, 1])  # integer codes -> labels
print(decoded)

In [None]:
# linear regression

# Single-feature toy problem: predict the passenger class from the fare.
X_train_reg = X_train[["Fare"]].to_numpy()  # 2-D array with one column
y_train_reg = X_train["Pclass"].to_numpy()

# simple prediction: fit and predict on the same training data
lin_reg = LinearRegression()
data_predictions = lin_reg.fit(X_train_reg, y_train_reg).predict(X_train_reg)

# root-mean-square error, mean absolute error (both measured on the training data)
lin_mse = metrics.mean_squared_error(y_train_reg, data_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_mae = metrics.mean_absolute_error(y_train_reg, data_predictions)
print(lin_rmse, lin_mae)

# cross validation: the scorer returns negated MSE, so flip the sign before the square root
scores = cross_val_score(
    lin_reg,
    X_train_reg,
    y_train_reg,
    scoring="neg_mean_squared_error",
    cv=5,
)
rmse_scores = np.sqrt(-scores)
print(rmse_scores, rmse_scores.mean(), rmse_scores.std())

In [None]:
# other sklearn regression models and ensemble

# Seed the stochastic estimators so the ensemble and its CV score are reproducible.
r1 = ensemble.AdaBoostRegressor(random_state=1)
r2 = ensemble.RandomForestRegressor(n_estimators=10, random_state=1)
r3 = KNeighborsRegressor()

# Weighted average of the three models' predictions; the random forest gets the largest weight.
vot_reg = ensemble.VotingRegressor(
    [('ab', r1), ('rf', r2), ('kn', r3)],
    weights=[1, 5, 2],
    n_jobs=-1,
)
vot_reg.fit(X_train_reg, y_train_reg)

# Cross-validated RMSE; the scorer returns negated MSE, hence the sign flip.
scores = cross_val_score(vot_reg, X_train_reg, y_train_reg, scoring="neg_mean_squared_error", cv=5)
print(np.sqrt(-scores).mean())

In [None]:
# linear classifier

# Single-feature toy problem: classify y_train from the fare alone.
X_train_clf = X_train[["Fare"]].to_numpy()  # 2-D array with one column

# simple prediction: fit and predict on the same training data
sgd_clf = SGDClassifier(random_state=1)
sgd_clf.fit(X_train_clf, y_train)
data_predictions = sgd_clf.predict(X_train_clf)

# cross validation: one accuracy value per fold
scores = cross_val_score(sgd_clf, X_train_clf, y_train, scoring="accuracy", cv=5)
print(scores)

# scores of cross validation predictions
y_train_pred = cross_val_predict(sgd_clf, X_train_clf, y_train, cv=3)
# rows = actual class, columns = predicted class, i.e. [[TN, FP], [FN, TP]]
print(metrics.confusion_matrix(y_train, y_train_pred))

# precision, recall and F1, printed in that order
for score_fn in (metrics.precision_score, metrics.recall_score, metrics.f1_score):
    print(score_fn(y_train, y_train_pred))

In [None]:
# grid search

# Seeded so that score differences between candidates come from the hyper-parameters,
# not from run-to-run randomness of the forest.
rf = ensemble.RandomForestClassifier(n_estimators=10, random_state=1)

# Two separate grids, each expanded on its own (3*2 + 2*2 = 10 candidates).
# NOTE(review): scikit-learn documents 'entropy' and 'log_loss' as the same split criterion,
# so the second grid compares equivalent settings -- confirm this is intended.
param_grid = [
    { 'n_estimators': [10, 50, 100],
      'max_depth': [5, None] },
    { 'n_estimators': [10, 100],
      'criterion': ['entropy', 'log_loss']  }
  ]

grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', return_train_score=True)
grid_search.fit(X_train_clf, y_train)
print(grid_search.best_params_, grid_search.best_estimator_)

# Every candidate with its mean cross-validated accuracy, best first.
cvres = grid_search.cv_results_
ranked = sorted(
    zip(cvres["mean_test_score"], cvres["params"]),
    key=lambda pair: pair[0],
    reverse=True,
)
for score, params in ranked:
    print(score, params)

In [None]:
# Preview of the matrix produced by the preprocessing pipeline: first rows plus overall shape
data_prepared[:5], data_prepared.shape

In [None]:
# feature selection

# chi2 needs non-negative input, but the standardised numeric columns contain negatives.
# Rescale every column to [0, 1] rather than taking abs(): abs() maps -x and +x to the
# same value and so destroys the ordering of a feature, while min-max scaling preserves it.
data_prepared_kbest = MinMaxScaler().fit_transform(data_prepared)

# Keep the 5 columns with the highest chi2 score against the target.
selector = SelectKBest(chi2, k=5)
new_data = selector.fit_transform(data_prepared_kbest, y_train)
new_data[:5], new_data.shape