In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import display

In [2]:
# Load the training set from a path relative to the notebook's working
# directory; every later cell reads from this `train_data` frame.
train_data = pd.read_csv("data/train.csv")
# Bare last expression: renders a preview of the first rows.
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
train_data.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [4]:
# Per-column dtype and non-null count; the gap to the row count is the
# number of missing values in that column.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
# Heatmap of missing values: one row per passenger, one column per feature.
# `labels` only renames axes, so the stray `showscale=False` that used to sit
# in that dict had no effect and was removed; the colorbar is hidden by
# `update_coloraxes` below.
fig = px.imshow(
    train_data.isnull(),
    aspect="auto",
    labels=dict(
        x="feature",
        y="index",
    ),
    title="Missing Values by Index"
)
fig.update_coloraxes(showscale=False)
fig.show()

In [6]:
# Pearson correlation between the numeric/boolean features.
# The frame also holds object columns (ids, names, categories). Since
# pandas 2.0 `DataFrame.corr` no longer drops those silently and raises
# instead, so the numeric and bool columns are selected explicitly first.
numeric_features = train_data.select_dtypes(include=["number", "bool"])
px.imshow(
    numeric_features.corr(method="pearson"),
    text_auto=True,
    aspect="auto",
    title="Pearson Correlation"
)

In [7]:
# Age distribution, one histogram trace per outcome of `Transported`.
fig = go.Figure()
for flag, label in ((True, "Transported"), (False, "Not Transported")):
    ages = train_data.loc[train_data.Transported == flag, "Age"]
    fig.add_trace(go.Histogram(x=ages, name=label))
# `update_layout` returns the figure, so as the last expression it also
# renders the plot.
fig.update_layout(
    title_text="Age histogram of Transported and Not Transported Passenger",
    xaxis_title_text="age",
    yaxis_title_text="count",
)

In [8]:
# Sorted spending curve per amenity, one line per outcome of `Transported`.
cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
nb_col = 3
nb_row = -(-len(cols) // nb_col)  # ceiling division: enough rows for every amenity
# Fixed colour and legend group per outcome so all subplots share one legend
# entry and one colour per class, instead of ten cycling entries.
outcomes = (
    (True, "Transported", px.colors.qualitative.Plotly[0]),
    (False, "Not Transported", px.colors.qualitative.Plotly[1]),
)
fig = make_subplots(rows=nb_row, cols=nb_col, subplot_titles=cols)
for i, col in enumerate(cols):
    for flag, label, color in outcomes:
        spent = train_data.loc[train_data.Transported == flag, col].sort_values()
        fig.add_trace(go.Scatter(
            # Rank within the subset. The previous x was the full-frame
            # index, which is longer than the per-outcome y series.
            x=np.arange(len(spent)),
            y=spent,
            mode="lines",
            name=label,
            legendgroup=label,
            showlegend=(i == 0),
            line=dict(color=color),
        ), row=i // nb_col + 1, col=i % nb_col + 1)
# The title is loop-invariant, so it is set once after all traces are added.
fig.update_layout(
    title_text="Correlation between Transported or not and how much passenger spent on the trip"
)
fig.show()

In [9]:
# Number of distinct values in each non-numeric feature.
# `select_dtypes` replaces the private `DataFrame._get_numeric_data()`;
# excluding "number" and "bool" drops the same columns the old code dropped.
categorical_features = train_data.select_dtypes(exclude=["number", "bool"])
fig = px.bar(
    categorical_features.nunique(),
    text_auto=True,
    title="Unique Values in Categorical Features",
    labels=dict(
        index="feature",
        value="count",
    ),
)
fig.update(layout_showlegend=False)
fig.show()

In [10]:
# Bar chart of the PassengerId suffix, split by outcome of `Transported`.
# NOTE(review): characters from position 5 onward are the part after the
# underscore (e.g. "01" in "0001_01"), not the leading four-digit prefix —
# confirm that this is the intended "group" feature.
groups_df = pd.DataFrame({
    "Transported": train_data.Transported,
    "Groups": train_data.PassengerId.str[5:],
})
fig = go.Figure()
for flag, label in ((True, "Transported"), (False, "Not Transported")):
    suffix_counts = groups_df.loc[groups_df.Transported == flag, "Groups"].value_counts()
    fig.add_trace(go.Bar(
        y=suffix_counts,
        x=suffix_counts.index,
        name=label,
    ))
fig.update_layout(title_text="Group of passengers")
fig.show()

In [11]:
# Split "Deck/Num/Side" cabin strings into separate columns.
# The cabin is parsed once with the vectorised `.str` accessor, which
# propagates missing values by itself. The previous code re-parsed the column
# three times and detected missing cabins with `is not float(np.nan)`, an
# identity test that only matches the `np.nan` singleton.
# Missing parts keep the literal "nan" placeholder the old numpy string
# conversion produced, so missing cabins still appear as their own category.
cabin_parts = train_data.Cabin.str.split("/")
cabin_df = pd.DataFrame({
    "Transported": train_data.Transported,
    "Deck": cabin_parts.str[0].fillna("nan"),
    "Num": cabin_parts.str[1].fillna("nan"),
    "Side": cabin_parts.str[2].fillna("nan"),
})
# Combined deck + side label, e.g. "B P" ("nan nan" when the cabin is missing).
cabin_df = cabin_df.assign(DeckSide=cabin_df.Deck + " " + cabin_df.Side)

In [12]:
# One subplot per cabin location feature (every column except the target and
# the cabin number), each with a bar trace per outcome of `Transported`.
location_cols = cabin_df.drop(columns=["Transported", "Num"]).columns
fig = make_subplots(rows=1, cols=location_cols.size, subplot_titles=location_cols)
for position, feature in enumerate(location_cols, start=1):
    for flag, label in ((True, "Transported"), (False, "Not Transported")):
        counts = cabin_df.loc[cabin_df.Transported == flag, feature].value_counts()
        fig.add_trace(go.Bar(
            y=counts,
            x=counts.index,
            name=label,
        ), row=1, col=position)
fig.update_layout(title_text="Cabin location of passenger")
fig.show()

In [13]:
# Distribution of cabin numbers, one histogram trace per outcome.
# The histogram is fed one cabin number per passenger. The previous code
# passed the sorted *unique* numbers as x and the value counts (in a different
# order) as y. With the default histfunc="count" plotly ignores y, so the
# plot counted distinct cabin numbers per bin rather than passengers.
fig = go.Figure()
for flag, label in ((True, "Transported"), (False, "Not Transported")):
    cabin_numbers = pd.to_numeric(
        cabin_df.loc[cabin_df.Transported == flag, "Num"],
        errors="coerce",  # missing-cabin placeholders become NaN ...
    ).dropna()            # ... and are left out of the histogram
    fig.add_trace(go.Histogram(
        x=cabin_numbers,
        name=label,
        nbinsx=100
    ))
fig.update_layout(title_text="Cabin Number of passenger")
fig.show()

In [14]:
# Category counts for each low-cardinality feature, split by outcome.
cols = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
nb_col = 2
nb_row = -(-len(cols) // nb_col)  # ceiling division: enough rows for every feature
# Fixed colour and legend group per outcome so all subplots share one legend
# entry and one colour per class, instead of eight cycling entries.
outcomes = (
    (True, "Transported", px.colors.qualitative.Plotly[0]),
    (False, "Not Transported", px.colors.qualitative.Plotly[1]),
)
fig = make_subplots(rows=nb_row, cols=nb_col, subplot_titles=cols)
for i, col in enumerate(cols):
    for flag, label, color in outcomes:
        # Count once and reuse for both axes.
        counts = train_data.loc[train_data["Transported"] == flag, col].value_counts()
        fig.add_trace(go.Bar(
            x=counts.index,
            y=counts,
            name=label,
            legendgroup=label,
            showlegend=(i == 0),
            marker=dict(color=color),
        ), row=i // nb_col + 1, col=i % nb_col + 1)
# The title is loop-invariant, so it is set once after all traces are added.
fig.update_layout(
    title_text="Correlation between Transported or not and bool features"
)
fig.show()

In [15]:
# Split "First Last" names into two columns.
# The vectorised `.str` accessor propagates missing names and yields NaN for a
# name without a second token. The previous code relied on `is not np.nan`,
# an identity test that only matches the `np.nan` singleton, and raised
# IndexError on a one-word name.
# Missing parts keep the literal "nan" placeholder that later cells drop by label.
name_parts = train_data.Name.str.split(" ")
name_df = pd.DataFrame({
    "Transported": train_data.Transported,
    "FirstName": name_parts.str[0].fillna("nan"),
    "LastName": name_parts.str[1].fillna("nan"),
})
# Histogram of how many passengers share each last name, per outcome.
fig = go.Figure()
for flag, label in ((True, "Transported"), (False, "Not Transported")):
    lastname_counts = name_df.loc[name_df["Transported"] == flag, "LastName"].value_counts()
    fig.add_trace(go.Histogram(
        # errors="ignore": do not fail when a subset has no missing names.
        x=lastname_counts.drop(index="nan", errors="ignore"),
        name=label,
    ))
# This cell plots LastName; the title previously said "firstname".
fig.update_layout(title="Number of Passenger with the same lastname")
fig.show()

In [16]:
# Histogram of how many passengers share each first name, per outcome.
fig = go.Figure()
for flag, label in ((True, "Transported"), (False, "Not Transported")):
    firstname_counts = name_df.loc[name_df["Transported"] == flag, "FirstName"].value_counts()
    fig.add_trace(go.Histogram(
        # Drop the "nan" placeholder for missing names; errors="ignore" keeps
        # the cell from failing when a subset has none.
        x=firstname_counts.drop(index="nan", errors="ignore"),
        name=label,
    ))
# This cell plots FirstName; the title previously said "lastname".
fig.update_layout(title="Number of Passenger with the same firstname")
fig.show()

In [17]:
# Missing-value count per column, shown as a bar chart.
null_counts = train_data.isnull().sum()
px.bar(null_counts, title="Null Values")