In [22]:
import carpentry
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import multiprocessing

In [16]:
ss_titanic_train, ss_titainc_test = carpentry.get_data_frames()

In [17]:
ss_titanic_train.head()

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S
0,0001_01,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,1,0,0,0,0,0,0,1,0
1,0002_01,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,1,...,0,0,0,0,0,1,0,0,0,1
2,0003_01,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0,...,1,0,0,0,0,0,0,0,0,1
3,0003_02,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0,...,1,0,0,0,0,0,0,0,0,1
4,0004_01,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,1,...,0,0,0,0,0,1,0,0,0,1


In [4]:
def df_prep(df, scaler):
    
    vars_to_drop = ['PassengerId', 'Transported']
    
    vars_to_drop = list(set(vars_to_drop).intersection(df.columns))
    
    df_copy = df.copy().drop(vars_to_drop, axis = 1)
    
    # Scale
    if scaler == 'standard':
        df_scaled = StandardScaler().fit_transform(df_copy)
        df_scaled = pd.DataFrame(df_scaled, columns = df_copy.columns)
        
    if scaler == 'minmax':
        df_scaled = MinMaxScaler().fit_transform(df_copy)
        df_scaled = pd.DataFrame(df_scaled, columns = df_copy.columns)
    
    if 'Transported' in vars_to_drop:
        df_scaled['Transported'] = df['Transported']
    
    return df_scaled

In [5]:
def create_split(df, n, scaler = 'minmax'):

    X = df_prep(df, scaler)
    y = X.pop('Transported')

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = n)

    X_train = pd.DataFrame(X_train)

    return X_train, X_test, y_train, y_test

In [6]:
def get_accuracy_score(model, X_train, X_test, y_train, y_test):

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_pred, y_test)

    return accuracy

In [7]:
def random_state_generator(df, n_states, scaler = 'minmax'):

    best_score = 0
    worst_score = 1
    best_features_diff = 1000

    # Create X to compare train results
    X = df_prep(df, scaler)
    X = X.drop('Transported', axis = 1)
      
    xgb = XGBClassifier(n_estimators = 150, random_state = 0)

    for n in np.arange(n_states + 1):

        # Create split will prep df
        X_train, x_test, y_train, y_test = create_split(df, n)
        
        score = get_accuracy_score(xgb, X_train, x_test, y_train, y_test)

        features_diff = sum(abs(X.mean() - X_train.mean()))

        if score > best_score:
            best_score = score
            best_score_state = n

        if score < worst_score:
            worst_score = score
            worst_score_state = n

        if features_diff < best_features_diff:
            best_features_diff = features_diff
            best_features_state = n

    print(f'In {n_states} iterations:')
    print(f'Best accuracy score random state: {best_score_state}')
    print(f'Worst accuracy score random state: {worst_score_state}')
    print(f'Best feature comparison random state: {best_features_state}')
    
    return best_score_state, worst_score_state, best_features_state

In [11]:
n_states = 500
for_best_score, for_worst_score, for_best_features = random_state_generator(ss_titanic_train, n_states)

In 500 iterations:
Best accuracy score random state: 134
Worst accuracy score random state: 431
Best feature comparison random state: 77


In [13]:
n_states = 1000
for_best_score, for_worst_score, for_best_features = random_state_generator(ss_titanic_train, n_states)

In 1000 iterations:
Best accuracy score random state: 134
Worst accuracy score random state: 431
Best feature comparison random state: 701


In [10]:
n_states = 3000
for_best_score, for_worst_score, for_best_features = random_state_generator(ss_titanic_train, n_states)

In 3000 iterations:
Best accuracy score random state: 1636
Worst accuracy score random state: 2741
Best feature comparison random state: 978
In 3000 iterations:
Best accuracy score random state: 1636
Worst accuracy score random state: 2741
Best feature comparison random state: 978


In [14]:
n_states = 10000
for_best_score, for_worst_score, for_best_features = random_state_generator(ss_titanic_train, n_states)

In 10000 iterations:
Best accuracy score random state: 1636
Worst accuracy score random state: 2741
Best feature comparison random state: 9432


In [25]:
def best_cv_score():
    best_score = 0
    xgb = XGBClassifier(random_state = 0)
    X = ss_titanic_train.copy().drop('PassengerId', axis=1)
    y = X.pop('Transported')
    for n in np.arange(100):
        sss = StratifiedShuffleSplit(n_splits = 5, random_state = n)
    
        score = cross_val_score(xgb, X, y, scoring = 'accuracy').mean()
    
        print(score)

In [23]:
multiprocessing.cpu_count()

8

In [27]:
multiprocessing.gpu_count()

AttributeError: module 'multiprocessing' has no attribute 'gpu_count'

In [28]:
pool = multiprocessing.Pool(8)

pool.map(best_cv_score())

0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151860354
0.7984610151

TypeError: Pool.map() missing 1 required positional argument: 'iterable'

Process SpawnPoolWorker-30:
Process SpawnPoolWorker-31:
Process SpawnPoolWorker-32:
Process SpawnPoolWorker-29:
Process SpawnPoolWorker-28:
Process SpawnPoolWorker-27:
Process SpawnPoolWorker-26:
Process SpawnPoolWorker-25:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda3/envs/tf_env/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/envs/tf_env/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/envs/tf_env/lib/python3.12/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
  File "/opt/anaconda3/envs/tf_env/lib/python3.12/multiprocessing/queues.py", line 386, in get
    with self._rlock:
  File "/opt/anaconda3/e