<a href="https://colab.research.google.com/github/JulianaCarvajal/Spaceship_Titanic/blob/workOnPreprocess%2FBustamJos3/dataset_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#libraries
from zipfile import ZipFile
import pandas as pd
import numpy as np

# Working with the next cell [Online]
For now on, import cell from colab will have this code (**On local notebook, respective changes must be made in order to import .json token and datasets from kaggle**).

In [2]:
#call API
%pip install kaggle
#upload kaggle token .json
from google.colab import files
files.upload()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"bustamjos3","key":"a80d020fd1917115be9ce5ce749b71d9"}'}

In [3]:
#Make a directory named kaggle and copy the kaggle.json file there.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# change the permission of the file
!chmod 600 ~/.kaggle/kaggle.json
# Import data files from the Kaggle API
!kaggle competitions download -c spaceship-titanic
#unzip files
file_name = "spaceship-titanic.zip" #the file is the dataset exact name
with ZipFile("spaceship-titanic.zip", 'r') as zip:
  zip.extractall()
  print('Done')
# .zip is now not necessary
!rm *.zip

Downloading spaceship-titanic.zip to /content
  0% 0.00/299k [00:00<?, ?B/s]
100% 299k/299k [00:00<00:00, 77.3MB/s]
Done


# End of working [Online]

# Working on Local

In [None]:
#call API
%pip install kaggle
# Import data files from the Kaggle API
!kaggle competitions download -c spaceship-titanic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Unzip the data files
with ZipFile("spaceship-titanic.zip", 'r') as zip_ref:
    zip_ref.extractall()
    print('Done')
# .zip is now not necessary
!rm *.zip

# End of working on Local

In [4]:
# Read the data files into pandas dataframes
d_sTitanic = pd.read_csv("train.csv")
d_sTitanic.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


# Pipeline for work on preprocessing

In [5]:
#convert types of cols to the best possible dtype
d_sTitanic=d_sTitanic.convert_dtypes(infer_objects=True)

In [6]:
d_sTitanic.dtypes

PassengerId      string
HomePlanet       string
CryoSleep       boolean
Cabin            string
Destination      string
Age               Int64
VIP             boolean
RoomService       Int64
FoodCourt         Int64
ShoppingMall      Int64
Spa               Int64
VRDeck            Int64
Name             string
Transported     boolean
dtype: object

## Drop at 1st looking, not relevant columns:
* ```d_sTitanic['PassengerId']```
* ```d_sTitanic['Name']```

In [7]:
#drop those columns
d_sTitanic.drop(['PassengerId','Name'],axis=1,inplace=True)

In [8]:
# separation of X, y
y=d_sTitanic.iloc[:,-1].values

In [9]:
#also drop
d_sTitanic.drop(['Transported'],axis=1,inplace=True)

In [10]:
# take name cols which are categorical
listCategoric=[str(i) for i in (d_sTitanic.dtypes=='string').loc[(d_sTitanic.dtypes=='string')==True].index]

In [11]:
# take the categoric types only
#to handle oneHotEncoder, replace NaN values with 'wanted'
XCategoric=d_sTitanic[listCategoric].fillna('wanted').values
XCategoric

array([['Europa', 'B/0/P', 'TRAPPIST-1e'],
       ['Earth', 'F/0/S', 'TRAPPIST-1e'],
       ['Europa', 'A/0/S', 'TRAPPIST-1e'],
       ...,
       ['Earth', 'G/1500/S', 'TRAPPIST-1e'],
       ['Europa', 'E/608/S', '55 Cancri e'],
       ['Europa', 'E/608/S', 'TRAPPIST-1e']], dtype=object)

In [12]:
#numeric col names
listNumeric=[str(i) for i in (d_sTitanic.dtypes=='Int64').loc[(d_sTitanic.dtypes=='Int64')==True].index]

In [45]:
# take the numeric types only
XNumeric=d_sTitanic[listNumeric].fillna(-1).values
XNumeric

array([[39, 0, 0, 0, 0, 0],
       [24, 109, 9, 25, 549, 44],
       [58, 43, 3576, 0, 6715, 49],
       ...,
       [26, 0, 0, 1872, 1, 0],
       [32, 0, 1049, 0, 353, 3235],
       [44, 126, 4688, 0, 0, 12]], dtype=object)

## oneHotEncoding
Necessary to be done on previous 'cause working with the next methods require array-like data

In [39]:
from sklearn.preprocessing import OneHotEncoder

In [15]:
# instancing
oneHotEncoder=OneHotEncoder(handle_unknown='error',sparse=False)

In [16]:
#fitting with categorical X
oneHotEncoder.fit(XCategoric)

OneHotEncoder(sparse=False)

In [17]:
#array-like object with categories of the n-categoric cols
#'wanted' will be the nan values, placed at the end of each categoric col
oneHotEncoder.categories_

[array(['Earth', 'Europa', 'Mars', 'wanted'], dtype=object),
 array(['A/0/P', 'A/0/S', 'A/1/S', ..., 'T/2/S', 'T/3/P', 'wanted'],
       dtype=object),
 array(['55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e', 'wanted'],
       dtype=object)]

In [18]:
# tranformation to get convertion to OHE
convertedOHE=oneHotEncoder.transform(XCategoric)
convertedOHE

array([[0., 1., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 1., 0.]])

In [19]:
#TO_CONSIDER!: categories for cabin column are a little infrequent***
oneHotEncoder.categories_[0].shape[0]+oneHotEncoder.categories_[1].shape[0]+oneHotEncoder.categories_[2].shape[0]

6569

## ```NaN``` Imputation

In [20]:
from sklearn.impute import KNNImputer

In [46]:
#instancing
kNNImputer=KNNImputer(n_neighbors=1, missing_values=-1,weights='distance')

In [47]:
#imputation
imputedNumeric=kNNImputer.fit_transform(XNumeric)
imputedNumeric

array([[3.900e+01, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
       [2.400e+01, 1.090e+02, 9.000e+00, 2.500e+01, 5.490e+02, 4.400e+01],
       [5.800e+01, 4.300e+01, 3.576e+03, 0.000e+00, 6.715e+03, 4.900e+01],
       ...,
       [2.600e+01, 0.000e+00, 0.000e+00, 1.872e+03, 1.000e+00, 0.000e+00],
       [3.200e+01, 0.000e+00, 1.049e+03, 0.000e+00, 3.530e+02, 3.235e+03],
       [4.400e+01, 1.260e+02, 4.688e+03, 0.000e+00, 0.000e+00, 1.200e+01]])

In [48]:
#let calculate the variance of imputedNumeric
np.var(imputedNumeric,axis=0)

array([2.08723453e+02, 4.44292139e+05, 2.55328508e+06, 3.58256622e+05,
       1.27179322e+06, 1.29852378e+06])

In [49]:
#let calculate the variance of convertedOHE
np.var(convertedOHE,axis=0)

array([0.24913614, 0.18504626, 0.16140252, ..., 0.08318324, 0.2174441 ,
       0.02049805])

### Now merge ```imputedNumeric``` and ```convertedOHE```

In [50]:
# using numpy.concantenate
X=np.concatenate( (imputedNumeric, convertedOHE), axis=1 )
X

array([[3.900e+01, 0.000e+00, 0.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [2.400e+01, 1.090e+02, 9.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [5.800e+01, 4.300e+01, 3.576e+03, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       ...,
       [2.600e+01, 0.000e+00, 0.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [3.200e+01, 0.000e+00, 1.049e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [4.400e+01, 1.260e+02, 4.688e+03, ..., 0.000e+00, 1.000e+00,
        0.000e+00]])

In [51]:
# standardization is required 'cause many models work badly is data does'nt look like normally distributed data (see biblio)
from sklearn.preprocessing import StandardScaler

In [52]:
# scaling with train
standardS=StandardScaler().fit(X)

## Categorical values converted to OHE shoulb be standardized?

In [53]:
XScaled=standardS.transform(X)
XScaled

array([[ 0.69747552, -0.33679313, -0.28266174, ..., -0.31748665,
         0.68531265, -0.146233  ],
       [-0.34078331, -0.17326511, -0.27702934, ..., -0.31748665,
         0.68531265, -0.146233  ],
       [ 2.01260337, -0.27228208,  1.95527526, ..., -0.31748665,
         0.68531265, -0.146233  ],
       ...,
       [-0.2023488 , -0.33679313, -0.28266174, ..., -0.31748665,
         0.68531265, -0.146233  ],
       [ 0.21295473, -0.33679313,  0.37382482, ..., -0.31748665,
        -1.45918801, -0.146233  ],
       [ 1.0435618 , -0.14776074,  2.65118856, ..., -0.31748665,
         0.68531265, -0.146233  ]])

# End of work on preprocessing

In [54]:
# export preprocessed X to .json
pprocessedX=pd.DataFrame( XScaled )
pprocessedX.to_json('pprocessedX.json', orient='index')

### Train-Test splitting
I think ```splitting``` shoulb be **on the next step**

In [29]:
# import train_test split function
from sklearn.model_selection import train_test_split

In [30]:
# train validation split
X_trainST,X_valST,y_trainST,y_valST=train_test_split(X,y,test_size=0.33,random_state=42)