In [1]:
!pip3 install -q pandas==2.1.0
!pip3 install -q numpy==1.22.4
!pip3 install -q scikit-learn

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder

import os

# Чтение данных

In [3]:
# Input CSV locations: path1 -> train set, path2 -> test set.
path1, path2 = '/datasets/train_cleaned.csv', '/datasets/test_cleaned.csv'

def read_file(path):
    """Load a comma-separated file into a DataFrame.

    The file is looked up at ``path`` first. If nothing exists there and
    ``path`` starts with ``/``, the same path without the leading slash is
    tried, i.e. it is resolved relative to the current working directory.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Parsed contents of the first candidate location that exists.

    Raises
    ------
    FileNotFoundError
        If none of the candidate locations exists.
    """
    path = os.fspath(path)

    candidates = [path]
    # Only strip a real leading slash; slicing unconditionally would chop the
    # first character off a relative path and probe a nonsensical location.
    if path.startswith('/'):
        candidates.append(path[1:])

    for candidate in candidates:
        if os.path.exists(candidate):
            return pd.read_csv(candidate, sep=',')

    # Name the offending path so the failure is actionable from the traceback.
    raise FileNotFoundError(f'No such file or directory: {path!r}')

# Load the train and test CSV files located at path1 and path2.
df_train_cl = read_file(path1)
df_test_cl = read_file(path2)

# Препроцессинг данных

In [4]:
# Print column names, non-null counts and dtypes of the training frame.
df_train_cl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8673 entries, 0 to 8672
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8673 non-null   object 
 1   CryoSleep     8673 non-null   bool   
 2   Cabin         8673 non-null   object 
 3   Destination   8673 non-null   object 
 4   Age           8673 non-null   float64
 5   VIP           8673 non-null   bool   
 6   RoomService   8673 non-null   float64
 7   FoodCourt     8673 non-null   float64
 8   ShoppingMall  8673 non-null   float64
 9   Spa           8673 non-null   float64
 10  VRDeck        8673 non-null   float64
 11  Transported   8673 non-null   bool   
 12  has_services  8673 non-null   bool   
dtypes: bool(4), float64(6), object(3)
memory usage: 643.8+ KB


In [5]:
# Cast the flag columns of the test frame to integers
# (presumably boolean beforehand, as in the training frame).
# NOTE(review): the same columns in df_train_cl stay boolean in the visible
# cells -- confirm that the train/test dtype difference is intended.
for flag_column in ('CryoSleep', 'VIP', 'has_services'):
    df_test_cl[flag_column] = df_test_cl[flag_column].astype(int)

df_test_cl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3355 entries, 0 to 3354
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    3355 non-null   object 
 1   CryoSleep     3355 non-null   int32  
 2   Cabin         3355 non-null   object 
 3   Destination   3355 non-null   object 
 4   Age           3355 non-null   float64
 5   VIP           3355 non-null   int32  
 6   RoomService   3355 non-null   float64
 7   FoodCourt     3355 non-null   float64
 8   ShoppingMall  3355 non-null   float64
 9   Spa           3355 non-null   float64
 10  VRDeck        3355 non-null   float64
 11  has_services  3355 non-null   int32  
dtypes: float64(6), int32(3), object(3)
memory usage: 275.3+ KB


In [6]:
# Collect the names of all object-dtype columns of the training frame;
# these are the columns that get ordinal-encoded later on.
categorical_features = [
    column for column in df_train_cl.select_dtypes(include='object').columns
]
categorical_features

['HomePlanet', 'Cabin', 'Destination']

In [7]:
# Integer-encode the categorical columns of the training frame.
# With handle_unknown='use_encoded_value', categories that were not seen while
# fitting are mapped to unknown_value (-1) when the encoder is applied later.
encoder = OrdinalEncoder(
    categories='auto',
    dtype=int,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

# fit_transform both learns the category mapping and applies it, so a separate
# fit() call on the same data beforehand is redundant.
df_train_cl[categorical_features] = encoder.fit_transform(df_train_cl[categorical_features])
df_train_cl.reset_index(drop=True, inplace=True)

df_train_cl.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,has_services
0,1,False,0,2,39.0,False,0.0,0.0,0.0,0.0,0.0,False,False
1,0,False,1,2,24.0,False,109.0,9.0,25.0,549.0,44.0,True,True
2,1,False,1,2,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,True
3,1,False,1,2,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,True
4,0,False,1,2,16.0,False,303.0,70.0,151.0,565.0,2.0,True,True


In [8]:
# Encode the test frame with the encoder that was fitted on the training data.
# Calling fit_transform here would re-learn the category -> integer mapping from
# the test set alone, so the same integer could stand for different categories
# in the two frames and the unknown_value fallback would never be used.
df_test_cl[categorical_features] = encoder.transform(df_test_cl[categorical_features])
df_test_cl.reset_index(drop=True, inplace=True)

df_test_cl.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,has_services
0,0,1,1,2,27.0,0,0.0,0.0,0.0,0.0,0.0,0
1,0,0,1,2,19.0,0,0.0,9.0,0.0,2823.0,0.0,1
2,1,1,1,0,31.0,0,0.0,0.0,0.0,0.0,0.0,0
3,1,0,1,2,38.0,0,0.0,6652.0,0.0,181.0,585.0,1
4,0,0,1,2,20.0,0,10.0,0.0,635.0,0.0,0.0,1


---

In [9]:
# Output directory (relative to the working directory) for the encoded datasets.
new_directory = 'datasets/tree'

# exist_ok=True keeps the cell re-runnable when the directory already exists.
os.makedirs(new_directory, exist_ok=True)

In [10]:
# Write the encoded frames as CSV files without the index column.
# The paths are derived from new_directory so that the directory created
# earlier and the location written to cannot drift apart.
# Note: path1/path2 are rebound here from the input paths to the output paths.
path1 = f'{new_directory}/train_tree.csv'
path2 = f'{new_directory}/test_tree.csv'

df_train_cl.to_csv(path1, index=False)
df_test_cl.to_csv(path2, index=False)