In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [13]:
train_df = pd.read_csv("C:/Users/LAPTOPMART/Downloads/Code/train.csv")
test_df = pd.read_csv("C:/Users/LAPTOPMART/Downloads/Code/test.csv")


In [14]:
train_df.head()
test_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [15]:
train_df.tail(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False
8692,9280_02,Europa,False,E/608/S,TRAPPIST-1e,44.0,False,126.0,4688.0,0.0,0.0,12.0,Propsh Hontichre,True


In [16]:
train_df.nunique()

PassengerId     8693
HomePlanet         3
CryoSleep          2
Cabin           6560
Destination        3
Age               80
VIP                2
RoomService     1273
FoodCourt       1507
ShoppingMall    1115
Spa             1327
VRDeck          1306
Name            8473
Transported        2
dtype: int64

In [17]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [18]:
# Preprocessing
def preprocess_data(df, is_train=True):
    df = df.copy()
    
    if 'PassengerId' in df.columns:
        df['GroupID'] = df['PassengerId'].apply(lambda x: x.split('_')[0]).astype(int)
        df['PassengerNumber'] = df['PassengerId'].apply(lambda x: x.split('_')[1]).astype(int)
        passenger_ids = df['PassengerId']  # Store PassengerId before dropping
        df.drop(columns=['PassengerId'], inplace=True)
    else:
        passenger_ids = None
    

    if 'Cabin' in df.columns:
        df[['Deck', 'CabinNumber', 'Side']] = df['Cabin'].str.split('/', expand=True)
        df.drop(columns=['Cabin'], inplace=True)
    
    # Fill missing values
    df.fillna(df.select_dtypes(include=np.number).median(), inplace=True)
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        df[col].fillna(df[col].mode()[0], inplace=True)
    
    # Convert boolean-like columns to integers
    for col in ['CryoSleep', 'VIP']:
        if col in df.columns:
            df[col] = df[col].astype(int)
    

    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    

    if {'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'}.issubset(df.columns):
        df['TotalSpending'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
    
    return df, passenger_ids

In [19]:
# Preprocess train and test data
train_df, _ = preprocess_data(train_df, is_train=True)
test_df, test_passenger_ids = preprocess_data(test_df, is_train=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
  df[col].fillna(df[col].mode()[0], inplace=True)


In [11]:

missing_cols = set(train_df.columns) - set(test_df.columns)
for col in missing_cols:
    test_df[col] = 0 

test_df = test_df[train_df.columns.drop('Transported')]

  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[col] = 0
  test_df[co

In [20]:

X = train_df.drop(columns=['Transported'])
y = train_df['Transported'].astype(int)

#train test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [21]:

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_val)


In [29]:
# Ensure test dataset has the same columns as train dataset
test_df = test_df.reindex(columns=X_train.columns, fill_value=0)


In [30]:


# Generate predictions for the test set
test_predictions = clf.predict(test_df)

In [31]:
#submission file
submission = pd.DataFrame({
    "PassengerId": test_passenger_ids,
    "Transported": test_predictions.astype(bool) 
})
submission.to_csv("submission.csv", index=False)
print("Submission file saved as submission.csv")

Submission file saved as submission.csv
