In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

In [None]:
# Load the passenger table that includes the `Transported` label.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Machine Learning/Data/Spaceship.csv',
)

In [None]:
# Display the loaded frame (the notebook renders a truncated head/tail preview).
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


# Imputations

In [None]:
# Working copy of HomePlanet; imputation is applied to this copy so the raw column keeps its gaps.
df['SHomePlanet'] = df['HomePlanet'].copy()

In [None]:
# Most frequent home planet, inspected to choose the fill value for missing entries.
# idxmax has to be *called*; the bare attribute only displays the bound method.
df['SHomePlanet'].value_counts().idxmax()

<bound method Series.idxmax of Earth     4602
Europa    2131
Mars      1759
Name: SHomePlanet, dtype: int64>

In [None]:
# Fill missing home planets with the most frequent value ('Earth').
# The result is assigned back rather than using inplace=True on a column
# selection: that form is chained assignment, which warns in pandas 2.x and
# leaves df unchanged once Copy-on-Write is active.
df['SHomePlanet'] = df['SHomePlanet'].fillna('Earth')

In [None]:
# Working copy of CryoSleep; the raw column is left as loaded.
df['SCryoSleep'] = df['CryoSleep'].copy()

In [None]:
# Most frequent CryoSleep value, inspected to choose the fill value.
# idxmax has to be *called*; the bare attribute only displays the bound method.
df['SCryoSleep'].value_counts().idxmax()

<bound method Series.idxmax of False    5439
True     3037
Name: SCryoSleep, dtype: int64>

In [None]:
# Fill missing CryoSleep flags with the most frequent value (False).
# Assigned back instead of inplace=True on a column selection (chained
# assignment; not applied to df under pandas Copy-on-Write). This matters
# because a later .astype(bool) would turn any surviving NaN into True.
df['SCryoSleep'] = df['SCryoSleep'].fillna(False)

In [None]:
# Encode the CryoSleep flag as 0/1 integers: first coerce to bool, then to int.
cryo_flags = df['SCryoSleep'].astype(bool)
df['SCryoSleep'] = cryo_flags.astype(int)

In [None]:
# Working copy of Cabin; the raw column is left as loaded.
df['SCabin'] = df['Cabin'].copy()

In [None]:
# Most frequent cabin string, inspected to choose the fill value.
# idxmax has to be *called*; the bare attribute only displays the bound method.
df['SCabin'].value_counts().idxmax()

<bound method Series.idxmax of G/734/S     8
G/109/P     7
B/201/P     7
G/1368/P    7
G/981/S     7
           ..
G/556/P     1
E/231/S     1
G/545/S     1
G/543/S     1
F/947/P     1
Name: SCabin, Length: 6560, dtype: int64>

In [None]:
# Fill missing cabins with the most frequent cabin ('G/734/S').
# Assigned back instead of inplace=True on a column selection (chained
# assignment; warns in pandas 2.x and is not applied to df under Copy-on-Write).
df['SCabin'] = df['SCabin'].fillna('G/734/S')

In [None]:
# Working copy of Destination; the raw column is left as loaded.
df['SDestination'] = df['Destination'].copy()

In [None]:
# Most frequent destination, inspected to choose the fill value.
# idxmax has to be *called*; the bare attribute only displays the bound method.
df['SDestination'].value_counts().idxmax()

<bound method Series.idxmax of TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: SDestination, dtype: int64>

In [None]:
# Fill missing destinations with the most frequent value ('TRAPPIST-1e').
# Assigned back instead of inplace=True on a column selection (chained
# assignment; warns in pandas 2.x and is not applied to df under Copy-on-Write).
df['SDestination'] = df['SDestination'].fillna('TRAPPIST-1e')

In [None]:
# Working copy of Age; the raw column is left as loaded.
df['SAge'] = df['Age'].copy()

In [None]:
# Mean age, inspected to choose a fill value for the missing ages.
df['SAge'].mean()

28.82793046746535

In [None]:
# Fill missing ages with 28 (the mean age, truncated).
# Two fixes versus the previous form:
#   * the fill value is the number 28, not the string '28', so the column
#     stays numeric instead of becoming a float/str mixture;
#   * the result is assigned back instead of using inplace=True on a column
#     selection (chained assignment; not applied to df under Copy-on-Write).
df['SAge'] = df['SAge'].fillna(28)

In [None]:
# Working copy of VIP; the raw column is left as loaded.
df['SVIP'] = df['VIP'].copy()

In [None]:
# Most frequent VIP value, inspected to choose the fill value.
# idxmax has to be *called*; the bare attribute only displays the bound method.
df['SVIP'].value_counts().idxmax()

<bound method Series.idxmax of False    8291
True      199
Name: SVIP, dtype: int64>

In [None]:
# Fill missing VIP flags with the most frequent value (False).
# Assigned back instead of inplace=True on a column selection (chained
# assignment; not applied to df under pandas Copy-on-Write). This matters
# because a later .astype(bool) would turn any surviving NaN into True.
df['SVIP'] = df['SVIP'].fillna(False)

In [None]:
# Encode the VIP flag as 0/1 integers: first coerce to bool, then to int.
vip_flags = df['SVIP'].astype(bool)
df['SVIP'] = vip_flags.astype(int)

# Set Up

In [None]:
# List all columns, including the imputed S* working copies added above.
df.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported', 'SHomePlanet', 'SCryoSleep', 'SCabin',
       'SDestination', 'SAge', 'SVIP'],
      dtype='object')

In [None]:
# (rows, columns) of the training frame after adding the working columns.
df.shape

(8693, 20)

In [None]:
# Missing values per column; the imputed S* columns are expected to report 0.
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
SHomePlanet       0
SCryoSleep        0
SCabin            0
SDestination      0
SAge              0
SVIP              0
dtype: int64

In [None]:
# Feature columns fed to the model.
# 'PassengerId' is deliberately NOT included: it is a row identifier stored as
# a string such as '0001_01', and it only "worked" as a passthrough feature
# because the string happens to be parseable as a number. An identifier is not
# a meaningful predictor, and the submission frame reads it from the test
# frame directly, so nothing downstream needs it here.
# NOTE(review): 'SDestination' is imputed earlier but not used as a feature —
# confirm whether that omission is intentional.
cols = ['SHomePlanet', 'SCryoSleep', 'SCabin', 'SAge', 'SVIP']

In [None]:
# Same per-column missing-value check as the earlier cell (repeated).
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
SHomePlanet       0
SCryoSleep        0
SCabin            0
SDestination      0
SAge              0
SVIP              0
dtype: int64

In [None]:
# Feature matrix: every row of df, restricted to the selected feature columns.
X = df.loc[:, cols]
X

Unnamed: 0,PassengerId,SHomePlanet,SCryoSleep,SCabin,SAge,SVIP
0,0001_01,Europa,0,B/0/P,39.0,0
1,0002_01,Earth,0,F/0/S,24.0,0
2,0003_01,Europa,0,A/0/S,58.0,1
3,0003_02,Europa,0,A/0/S,33.0,0
4,0004_01,Earth,0,F/1/S,16.0,0
...,...,...,...,...,...,...
8688,9276_01,Europa,0,A/98/P,41.0,1
8689,9278_01,Earth,1,G/1499/S,18.0,0
8690,9279_01,Earth,0,G/1500/S,26.0,0
8691,9280_01,Europa,0,E/608/S,32.0,0


In [None]:
# Target vector: the boolean `Transported` label for each passenger.
y = df.loc[:, 'Transported']
y

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 8693, dtype: bool

In [None]:
# Instantiate the classifier: logistic regression, liblinear solver, fixed seed.
logreg = LogisticRegression(random_state=1, solver='liblinear')

In [None]:
# One-hot encoder for the categorical columns.
# handle_unknown='ignore' makes the fitted encoder emit an all-zero block for a
# category it did not see during fit instead of raising. Cabin strings are
# close to unique per passenger, so new data will contain unseen cabins and the
# default ('error') would make prediction fail on them.
ohe = OneHotEncoder(handle_unknown='ignore')
# Quick look at the encoded shape for the two categorical columns.
ohe.fit_transform(df[['SHomePlanet', 'SCabin']])

<8693x6563 sparse matrix of type '<class 'numpy.float64'>'
	with 17386 stored elements in Compressed Sparse Row format>

In [None]:
# Column transformer: one-hot encode the two categorical columns with `ohe`
# and pass every remaining column through unchanged.
categorical_columns = ['SHomePlanet', 'SCabin']
ct = make_column_transformer((ohe, categorical_columns), remainder='passthrough')

In [None]:
# Fit the column transformer on X and display the transformed (sparse) matrix summary.
ct.fit_transform(X)

<8693x6567 sparse matrix of type '<class 'numpy.float64'>'
	with 37830 stored elements in Compressed Sparse Row format>

In [None]:
# Chain preprocessing (`ct`) and the classifier (`logreg`) into one estimator.
pipe = make_pipeline(ct, logreg)

In [None]:
# Fit the whole pipeline on the full training data.
pipe.fit(X, y)

In [None]:
# NOTE(review): this looks redundant with the pipe.fit(X, y) cell above —
# `pipe` was built from these same `ct` and `logreg` objects, so this refits
# both on the same data. Confirm whether the cell is still needed.
logreg.fit(ct.fit_transform(X), y)

# Prediction

In [None]:
# Load the passenger table to predict on (it has no `Transported` column).
df2 = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Machine Learning/Data/SpaceshipTest.csv',
)

In [None]:
# Columns available in the test frame (raw columns only; no S* features yet).
df2.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name'],
      dtype='object')

In [None]:
# Reminder of the feature columns that must be built on df2 before predicting.
cols

['PassengerId', 'SHomePlanet', 'SCryoSleep', 'SCabin', 'SAge', 'SVIP']

In [None]:
# Build the feature from the TEST frame's own HomePlanet column.
# Copying df['SHomePlanet'] would align the training passengers' values onto
# the test rows by index, so predictions would describe the wrong people.
# Missing values get the same fill value used for the training data ('Earth').
df2['SHomePlanet'] = df2['HomePlanet'].fillna('Earth')

In [None]:
# Build the feature from the TEST frame's own CryoSleep column (copying
# df['SCryoSleep'] would attach training passengers' values to the test rows).
# Same recipe as for the training data: missing -> False, then encode as 0/1.
df2['SCryoSleep'] = df2['CryoSleep'].fillna(False).astype(bool).astype(int)

In [None]:
# Build the feature from the TEST frame's own Cabin column (copying
# df['SCabin'] would attach training passengers' cabins to the test rows).
# Missing cabins, and cabins that never occur in the training column, fall back
# to the training fill value 'G/734/S'. This keeps every test value inside the
# set of categories the encoder was fitted on, so prediction cannot fail on an
# unknown category regardless of how the encoder is configured.
known_cabins = df['SCabin'].dropna()
df2['SCabin'] = df2['Cabin'].where(df2['Cabin'].isin(known_cabins), 'G/734/S')

In [None]:
# Build the feature from the TEST frame's own Age column (copying df['SAge']
# would attach training passengers' ages to the test rows).
# Missing ages get the same fill value used for the training data (28), as a
# number so the column stays numeric.
df2['SAge'] = df2['Age'].fillna(28)

In [None]:
# Build the feature from the TEST frame's own VIP column (copying df['SVIP']
# would attach training passengers' values to the test rows).
# Same recipe as for the training data: missing -> False, then encode as 0/1.
df2['SVIP'] = df2['VIP'].fillna(False).astype(bool).astype(int)

In [None]:
# Feature matrix for prediction: every row of df2, same feature columns as X.
X_new = df2.loc[:, cols]

In [None]:
# Predict `Transported` for the test passengers with the fitted pipeline.
y_predK = pipe.predict(X_new)

In [None]:
# Display the predicted labels.
y_predK

array([False, False, False, ...,  True, False,  True])

In [None]:
# NOTE(review): this single-column frame is overwritten by the next cell, which
# rebuilds `kaggle` with both PassengerId and Transported.
kaggle = pd.DataFrame(y_predK)

In [None]:
# Submission frame: one row per test passenger with its predicted label.
submission_columns = {
    'PassengerId': df2['PassengerId'],
    'Transported': y_predK,
}
kaggle = pd.DataFrame(submission_columns)

In [None]:
# Display the submission frame.
kaggle

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,True
4274,9271_01,True
4275,9273_01,False


In [None]:
# Write the submission file.
# index=False keeps the row index out of the CSV; without it the file gets an
# extra unnamed leading column in addition to PassengerId and Transported.
kaggle.to_csv('/content/drive/MyDrive/Machine Learning/Data/SpaceSubmission2.csv', index=False)

In [None]:
# Sanity check: (rows, columns) of the submission frame.
kaggle.shape

(4277, 2)